diff --git a/.appveyor.yml b/.appveyor.yml
new file mode 100644
index 0000000000..6f87c22b13
--- /dev/null
+++ b/.appveyor.yml
@@ -0,0 +1,58 @@
+version: '1.0.{build}'  # This number doesn't matter
+
+pull_requests:
+  do_not_increment_build_number: true
+
+platform:
+  - x64
+
+image: Visual Studio 2015
+
+clone_folder: C:\projects\libgpuarray
+
+configuration:
+  - Release
+
+environment:
+  BINSTAR_TOKEN:
+    secure: 58KqJcKtfCBVCuIzpnkLm4XZLQqKq95Hs8Ly20HWaMSla67nusrp3y4sy6XzZOBQ
+
+  matrix:
+    - CONDA_LOC: "C:\\Miniconda-x64"
+      PATCH_VS2008: "1"
+    - CONDA_LOC: "C:\\Miniconda35-x64"
+      PATCH_VS2008: "0"
+    - CONDA_LOC: "C:\\Miniconda36-x64"
+      PATCH_VS2008: "0"
+
+install:
+  # Remove Cygwin: having it installed breaks conda-build because of git
+  - cmd: rmdir C:\cygwin /s /q
+  - cmd: call %CONDA_LOC%\Scripts\activate.bat
+  - cmd: set PYTHONUNBUFFERED=1
+  - cmd: conda install -n root --yes conda conda-env conda-build anaconda-client
+  # We borrow a trick from conda-forge to fix the VS2008 compiler
+  - ps: |
+      if($env:PATCH_VS2008 -eq '1') {
+        cmd /c "conda config --append channels conda-forge 2>&1"
+        cmd /c "conda install --yes vs2008_express_vc_python_patch 2>&1"
+        cmd /c "call setup_x64 2>&1"
+      }
+
+build: off
+
+test_script:
+  - cmd: for /f "tokens=*" %%i in ('python -c "import versioneer; print(versioneer.get_version())"') do set GPUARRAY_VERSION=%%i
+  - cmd: echo %GPUARRAY_VERSION%
+  - cmd: conda build conda
+  - cmd: mkdir pkgs
+  - cmd: xcopy "%CONDA_LOC%"\conda-bld\win-64\pygpu* pkgs\ /Y
+  - cmd: xcopy "%CONDA_LOC%"\conda-bld\win-64\libgpuarray* pkgs\ /Y
+  - ps: |
+      if($env:appveyor_repo_tag -eq 'True') {
+        cmd /c "anaconda -t $env:BINSTAR_TOKEN upload --user=mila-udem pkgs/* 2>&1"
+      }
+
+artifacts:
+  - path: pkgs/*
+    name: "Conda Packages"
diff --git a/.circleci/config.yml b/.circleci/config.yml
new file mode 100644
index 0000000000..09f05ff1ca
--- /dev/null
+++ b/.circleci/config.yml
@@ -0,0 +1,43 @@
+version: 2
+
+jobs:
+  build_pkgs:
+    docker:
+      - image: joaander/conda-build:20170905
+
+    steps:
+      - checkout
+      - run:
+          name: "Checkout Merge Commit"
+          command: |
+            if [[ -n "${CIRCLE_PR_NUMBER}" ]]
+            then
+                git fetch -u origin "+refs/pull/${CIRCLE_PR_NUMBER}/merge:pr/${CIRCLE_PR_NUMBER}/merge"
+                git checkout -qf "pr/${CIRCLE_PR_NUMBER}/merge"
+            fi
+      - run:
+          name: "Build Recipe"
+          command: |
+            export GPUARRAY_VERSION=`python -c 'import versioneer; print(versioneer.get_version())'`
+            conda build --python 2.7 conda
+            conda build --python 3.5 conda/pygpu
+            conda build --python 3.6 conda/pygpu
+      - run:
+          name: "Upload Tagged Versions"
+          command: |
+            if [[ -n "${CIRCLE_TAG}" ]]
+            then
+                anaconda -t $BINSTAR_TOKEN upload --user=mila-udem /opt/conda/conda-bld/linux-64/libgpuarray*
+                anaconda -t $BINSTAR_TOKEN upload --user=mila-udem /opt/conda/conda-bld/linux-64/pygpu*
+            fi
+      - store_artifacts:
+          path: /opt/conda/conda-bld/linux-64
+
+workflows:
+  version: 2
+  build_and_test:
+    jobs:
+      - build_pkgs:
+          filters:
+            tags:
+              only: /.*/
diff --git a/.clean b/.clean
new file mode 100644
index 0000000000..1c90c15b70
--- /dev/null
+++ b/.clean
@@ -0,0 +1,21 @@
+Build
+build
+Debug
+Release
+lib
+__pycache__
+.idea
+.*.sw[po]
+*~
+*.pyc
+*.pyd
+*.pyo
+*.egg-info
+dist
+setuptools*egg
+setuptools.pth
+distribute*egg
+distribute*tar.gz
+*.so
+*.o
+*.log
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000..3f619f34d6
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+pygpu/_version.py export-subst
diff --git a/.gitignore b/.gitignore
index 4261cf0d30..143674ab00 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,10 +1,13 @@
+Build
 build
 Debug
 Release
 lib
+.idea
 .*.sw[po]
 *~
 *.pyc
+*.pyd
 *.pyo
 *.egg-info
 MANIFEST
@@ -15,11 +18,12 @@ distribute*egg
 distribute*tar.gz
 *.so
 *.o
-*.aux
-*.bbl
-*.blg
 *.log
+doc/_build
+doc/_doxybuild
 pygpu/*.c
 pygpu/*.h
+pygpu/version.py
+src/gpuarray/abi_version.h
 src/private_config.h
 Makefile.conf
diff --git a/.jenkins-pr.sh b/.jenkins-pr.sh
new file mode 100755
index 0000000000..4a09da6ed4
--- /dev/null
+++ b/.jenkins-pr.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+# Script for Jenkins continuous integration testing of libgpuarray
+
+# Print commands as they are executed
+set -x
+
+# Anaconda python
+export PATH=/usr/local/miniconda2/bin:$PATH
+
+# CUDA
+export PATH=/usr/local/cuda/bin:$PATH
+export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
+
+# Build type: "Release" (default, faster) or "Debug"
+: ${GPUARRAY_CONFIG:="Release"}
+# Set these to " " to disable (empty doesn't work)
+: ${DEVICES_CUDA:="cuda"} # for multiple devices use "cuda0 cuda1"
+: ${DEVICES_OPENCL:=" "}
+
+git rev-parse HEAD
+
+# Build libgpuarray and run C tests
+mkdir build
+(cd build && cmake .. -DCMAKE_BUILD_TYPE=${GPUARRAY_CONFIG} && make)
+
+# Test on different devices
+for dev in ${DEVICES_CUDA}; do
+    echo "Testing libgpuarray for DEVICE=${dev}"
+    (cd build && DEVICE=${dev} make test)
+done
+for dev in ${DEVICES_OPENCL}; do
+    echo "Testing libgpuarray for DEVICE=${dev}"
+    (cd build && DEVICE=${dev} make test)
+done
+
+export LD_LIBRARY_PATH=`pwd`/lib:${LD_LIBRARY_PATH}
+export LIBRARY_PATH=`pwd`/lib:${LIBRARY_PATH}
+export CPATH=`pwd`/src:${CPATH}
+
+# Build the pygpu modules
+python setup.py build_ext --inplace
+
+# Test it
+test=pygpu
+for dev in ${DEVICES_CUDA}; do
+    echo "Testing pygpu for DEVICE=${dev}"
+    DEVICE=${dev} time nosetests --with-xunit --xunit-file=${test}${dev}tests.xml pygpu/tests
+done
+for dev in ${DEVICES_OPENCL}; do
+    echo "Testing pygpu for DEVICE=${dev}"
+    DEVICE=${dev} time nosetests --with-xunit --xunit-file=${test}${dev}tests.xml pygpu/tests -e test_blas.py
+done
diff --git a/.jenkins_pr_mac.sh b/.jenkins_pr_mac.sh
new file mode 100755
index 0000000000..b927f567d5
--- /dev/null
+++ b/.jenkins_pr_mac.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+# Script for Jenkins continuous integration testing of libgpuarray on mac
+
+# Print commands as they are executed
+set -x
+
+# Set path for conda and cmake
+export PATH="/Users/jenkins/miniconda2/bin:/usr/local/bin:$PATH"
+
+# CUDA
+export PATH=/usr/local/cuda/bin:${PATH}
+export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:${DYLD_LIBRARY_PATH}
+export CPLUS_INCLUDE_PATH=/usr/local/cuda/include:${CPLUS_INCLUDE_PATH}
+
+# Build type: "Release" (default, faster) or "Debug"
+: ${GPUARRAY_CONFIG:="Release"}
+# Set these to " " to disable (empty doesn't work)
+: ${DEVICES_CUDA:="cuda"} # for multiple devices use "cuda0 cuda1"
+: ${DEVICES_OPENCL:=" "}
+
+git rev-parse HEAD
+
+# Build libgpuarray and run C tests
+rm -rf build lib
+mkdir build
+(cd build && cmake .. -DCMAKE_BUILD_TYPE=${GPUARRAY_CONFIG} && make)
+
+# Test on different devices
+for dev in ${DEVICES_CUDA}; do
+    echo "Testing libgpuarray for DEVICE=${dev}"
+    (cd build && DEVICE=${dev} make test)
+done
+for dev in ${DEVICES_OPENCL}; do
+    echo "Testing libgpuarray for DEVICE=${dev}"
+    (cd build && DEVICE=${dev} make test)
+done
+
+export PYTHONPATH=`pwd`/lib/python:$PYTHONPATH
+export DYLD_LIBRARY_PATH=`pwd`/lib:${DYLD_LIBRARY_PATH}
+export CPLUS_INCLUDE_PATH=`pwd`/src:${CPLUS_INCLUDE_PATH}
+
+# Build the pygpu modules
+python setup.py build_ext --inplace -I`pwd`/src -L`pwd`/lib
+
+# Test it
+test=pygpu_pr_mac
+for dev in ${DEVICES_CUDA}; do
+    echo "Testing pygpu for DEVICE=${dev}"
+    DEVICE=${dev} nosetests --with-xunit --xunit-file=${test}_${dev}tests.xml pygpu/tests
+done
+for dev in ${DEVICES_OPENCL}; do
+    echo "Testing pygpu for DEVICE=${dev}"
+    DEVICE=${dev} nosetests --with-xunit --xunit-file=${test}_${dev}tests.xml pygpu/tests -e test_blas.py
+done
diff --git a/.jenkins_pr_win.bat b/.jenkins_pr_win.bat
new file mode 100644
index 0000000000..4a3a4477d0
--- /dev/null
+++ b/.jenkins_pr_win.bat
@@ -0,0 +1,45 @@
+REM Set path for cuda, conda python and cmake
+REM Set conda python, cudnn, cmake path
+set PATH=%PATH%;C:\ProgramData\Miniconda2;C:\ProgramData\Miniconda2\Library\usr\bin;C:\ProgramData\Miniconda2\Library\bin;C:\ProgramData\Miniconda2\Scripts
+set PATH=%PATH%;%CUDNNPATH%\bin;C:\Program Files\CMake\bin
+
+REM Can also set to "Debug", "Release" to go faster
+set GPUARRAY_CONFIG="Release"
+REM Use spaces to separate devices
+set DEVICES_CUDA=cuda
+set DEVICES_OPENCL=
+
+git rev-parse HEAD
+
+REM Clean up previous installs (to make sure no old files are left)
+rmdir %WORKSPACE%\lib /s/q
+mkdir %WORKSPACE%\lib
+rmdir build /s/q
+mkdir build
+
+REM Build libgpuarray and run C tests
+cd build
+cmake .. -DCMAKE_BUILD_TYPE=%GPUARRAY_CONFIG% -G "NMake Makefiles"
+nmake
+cd ..
+
+set PATH=%PATH%;%WORKSPACE%\lib
+
+REM Add conda gcc toolchain path
+set PATH=%PATH%;C:\ProgramData\Miniconda2\Library\mingw-w64\bin
+
+REM Build the pygpu modules
+python setup.py build_ext --inplace
+
+REM Test pygpu. Report names use the loop variable directly: DEVICE is set inside the parenthesised block, so its percent-expansion there would be stale.
+set test=pygpu
+for %%d in (%DEVICES_CUDA%) do (
+    echo "Testing pygpu for DEVICE=%%d"
+    set DEVICE=%%d
+    nosetests --with-xunit --xunit-file=%test%_%%d_tests.xml pygpu\tests
+)
+for %%d in (%DEVICES_OPENCL%) do (
+    echo "Testing pygpu for DEVICE=%%d"
+    set DEVICE=%%d
+    nosetests --with-xunit --xunit-file=%test%_%%d_tests.xml pygpu\tests -e test_blas.py
+)
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000000..635c1ba107
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,28 @@
+language: c
+
+matrix:
+  include:
+    - os: osx
+      compiler: clang
+
+before_install:
+  - export PREFIX=$HOME/.local
+  - brew update && brew install doxygen
+  - export PYTHONUSERBASE=$PREFIX
+  - pip2 install --user breathe sphinx==1.6.3 sphinx_rtd_theme cython numpy 'mako>=0.7' six
+  - export PATH=$PATH:$PREFIX/bin
+  - export CPATH=$CPATH:$PREFIX/include
+  - export LIBRARY_PATH=$LIBRARY_PATH:$PREFIX/lib
+  - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PREFIX/lib
+
+# Build with Debug and Release to flush out build problems
+script:
+  - mkdir Debug
+  - (cd Debug && cmake .. -DCMAKE_BUILD_TYPE=Debug && make)
+  - mkdir Release
+  - (cd Release &&  cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$PREFIX && make && make install)
+  - python setup.py build_ext --inplace
+  - (cd doc && make html)
+
+# Do not treat "shell_session_update: command not found" on MacOS as a failure
+after_script: set +e
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f6ec583a54..8b887fb844 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,7 +4,13 @@ PROJECT(libgpuarray C)
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/CMakeModules/")
 
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
+# -Wall is unbelievably noisy with Visual Studio:
+# https://stackoverflow.com/q/4001736/3257826
+if(MSVC)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W3")
+else()
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-unused-parameter -Werror=format-security -Wdeclaration-after-statement -std=gnu89")
+endif()
 
 enable_testing()
 
@@ -22,3 +28,12 @@ endif()
 
 add_subdirectory(src)
 add_subdirectory(tests)
+
+# uninstall target
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/cmake_uninstall.cmake.in"
+    "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake"
+    IMMEDIATE @ONLY)
+
+add_custom_target(uninstall
+    COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake)
diff --git a/CMakeModules/FindCLBlast.cmake b/CMakeModules/FindCLBlast.cmake
new file mode 100644
index 0000000000..2ab0f54033
--- /dev/null
+++ b/CMakeModules/FindCLBlast.cmake
@@ -0,0 +1,35 @@
+# - Try to find CLBlast
+#  Once done this will define
+#
+#  CLBLAST_FOUND - system has CLBlast
+#  CLBLAST_INCLUDE_DIRS - location of clblast_c.h
+#  CLBLAST_LIBRARIES - location of libCLBlast
+
+IF(CLBLAST_INCLUDE_DIRS)
+  # Already in cache, be silent
+  set (CLBLAST_FIND_QUIETLY TRUE)
+ENDIF (CLBLAST_INCLUDE_DIRS)
+
+FIND_PATH(CLBLAST_ROOT_DIR
+    NAMES include/clblast_c.h
+    HINTS /usr/local/ $ENV{CLBLAST_ROOT}
+    DOC "CLBlast root directory.")
+
+FIND_PATH(_CLBLAST_INCLUDE_DIRS
+    NAMES clblast_c.h
+    HINTS ${CLBLAST_ROOT_DIR}/include
+    DOC "CLBlast Include directory")
+
+FIND_LIBRARY(_CLBLAST_LIBRARY
+	NAMES libclblast.so
+    HINTS ${CLBLAST_ROOT_DIR}/lib ${CLBLAST_ROOT_DIR}/lib64 ${CLBLAST_ROOT_DIR}/lib32
+    DOC "CLBlast lib directory")
+
+SET(CLBLAST_INCLUDE_DIRS ${_CLBLAST_INCLUDE_DIRS})
+SET(CLBLAST_LIBRARIES ${_CLBLAST_LIBRARY})
+
+# handle the QUIETLY and REQUIRED arguments and set CLBLAST_FOUND to TRUE if
+# all listed variables are TRUE
+INCLUDE (FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(CLBLAST DEFAULT_MSG CLBLAST_LIBRARIES CLBLAST_INCLUDE_DIRS)
+MARK_AS_ADVANCED(CLBLAST_LIBRARIES CLBLAST_INCLUDE_DIRS)
diff --git a/INSTALL b/INSTALL
index 469d6da210..8b39a672dc 100644
--- a/INSTALL
+++ b/INSTALL
@@ -5,7 +5,7 @@ with a log of the build messages to abergeron@gmail.com.
 Requirements:
 
  - either an OpenCL runtime (with headers) or the CUDA toolkit
- - CMake [ http://cmake.org ] (to build)
+ - CMake [ https://cmake.org ] (to build)
 
 Run CMake on the CMakeList.txt file in src/ and build according to
 your platform.  Set CMAKE_INSTALL_PREFIX to your desired path if you
@@ -21,4 +21,4 @@ There are instruction for installation in the CMake file which make
 install on Windows.
 
 If you also want the python bindings, run 'python setup.py install'
-after building and installing the library which will install pygpu.
\ No newline at end of file
+after building and installing the library which will install pygpu.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000000..4681f6b8ab
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+include versioneer.py
+include pygpu/_version.py
diff --git a/Makefile b/Makefile
index 6d9ebdd85c..915254727b 100644
--- a/Makefile
+++ b/Makefile
@@ -11,10 +11,8 @@ debug: install-debugc py
 
 .PHONY: install-debugc py debug install-relc rel config
 
-Debug:
-	mkdir Debug
-
-Debug/Makefile: Debug config
+Debug/Makefile: Makefile.conf
+	mkdir -p Debug
 ifndef INSTALL_PREFIX
 	(cd Debug && NUM_DEVS=${NUM_DEVS} DEV_NAMES=${DEV_NAMES} cmake .. -DCMAKE_BUILD_TYPE=Debug)
 else
@@ -34,10 +32,8 @@ endif
 install-debugc: debugc
 	(cd Debug && ${SUDO} make install)
 
-Release:
-	mkdir Release
-
-Release/Makefile: Release config
+Release/Makefile: Makefile.conf
+	mkdir -p Release
 ifndef INSTALL_PREFIX
 	(cd Release && NUM_DEVS=${NUM_DEVS} DEV_NAMES=${DEV_NAMES} cmake .. -DCMAKE_BUILD_TYPE=Release)
 else
@@ -57,5 +53,5 @@ endif
 install-relc: relc
 	(cd Release && ${SUDO} make install)
 
-py: config
+py: Makefile.conf
 	python setup.py build_ext --inplace
diff --git a/bin/gpuarray-cache b/bin/gpuarray-cache
new file mode 100644
index 0000000000..3cfb429a76
--- /dev/null
+++ b/bin/gpuarray-cache
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+
+import os
+import sys
+
+def clean(max_size, path):
+    """Delete the least recently accessed files under `path` so that what remains fits in `max_size` bytes."""
+    content = []
+    for root, dirs, files in os.walk(path):
+        for name in files:
+            fpath = os.path.join(root, name)
+            st = os.stat(fpath)
+            content.append((st.st_atime, st.st_size, fpath))
+    content.sort(reverse=True)  # newest access first, so the oldest files are the ones evicted
+    cur_size = 0
+    for _, size, fpath in content:
+        cur_size += size
+        if cur_size > max_size:
+            os.remove(fpath)
+
+
+SUFFIXES = {'B': 1, 'K': 1 << 10, 'M': 1 << 20, 'G': 1 << 30, 'T': 1 << 40,
+            'P': 1 << 50, 'E': 1 << 60, 'Z': 1 << 70, 'Y': 1 << 80}
+
+
+def get_size(s):
+    """Parse a size such as '512', '1.5K' or '2g' and return it in bytes.
+
+    An optional one-letter suffix from SUFFIXES (case-insensitive) scales
+    the number by a power of 1024; no suffix means plain bytes.  Raises
+    ValueError if the numeric part is missing or malformed.
+    """
+    s = s.strip()
+    suf = s[-1:].upper()
+    if suf in SUFFIXES:
+        num = s[:-1]
+    else:
+        num, suf = s, 'B'
+    # float() so that fractional sizes like '1.5K' are accepted.
+    return int(float(num) * SUFFIXES[suf])
+
+
+if __name__ == '__main__':
+    import argparse
+
+    parser = argparse.ArgumentParser(description='libgpuarray cache maintenance utility')
+    # Required: without a size there is nothing to prune to, and get_size(None) would crash.
+    parser.add_argument('-s', '--max_size', required=True, help='Set the maximum size for pruning (in bytes with suffixes: K, M, G, ...)')
+    args = parser.parse_args()
+    path = os.environ.get('GPUARRAY_CACHE_PATH', None)
+    if path is None:
+        print("You need to set GPUARRAY_CACHE_PATH so that this programs knows which path to clean.")
+        sys.exit(1)
+    clean(get_size(args.max_size), path)
+
diff --git a/cmake_uninstall.cmake.in b/cmake_uninstall.cmake.in
new file mode 100644
index 0000000000..2037e36539
--- /dev/null
+++ b/cmake_uninstall.cmake.in
@@ -0,0 +1,21 @@
+if(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+  message(FATAL_ERROR "Cannot find install manifest: @CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+endif(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+
+file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files)
+string(REGEX REPLACE "\n" ";" files "${files}")
+foreach(file ${files})
+  message(STATUS "Uninstalling $ENV{DESTDIR}${file}")
+  if(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}")
+    exec_program(
+      "@CMAKE_COMMAND@" ARGS "-E remove \"$ENV{DESTDIR}${file}\""
+      OUTPUT_VARIABLE rm_out
+      RETURN_VALUE rm_retval
+      )
+    if(NOT "${rm_retval}" STREQUAL 0)
+      message(FATAL_ERROR "Problem when removing $ENV{DESTDIR}${file}")
+    endif(NOT "${rm_retval}" STREQUAL 0)
+  else(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}")
+    message(STATUS "File $ENV{DESTDIR}${file} does not exist.")
+  endif(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}")
+endforeach(file)
diff --git a/conda/libgpuarray/bld.bat b/conda/libgpuarray/bld.bat
new file mode 100644
index 0000000000..e4c7f38b35
--- /dev/null
+++ b/conda/libgpuarray/bld.bat
@@ -0,0 +1,10 @@
+cmake -G "%CMAKE_GENERATOR%" ^
+      -DCMAKE_PREFIX_PATH="%LIBRARY_PREFIX%" ^
+      -DCMAKE_INSTALL_PREFIX="%LIBRARY_PREFIX%" ^
+      -DCMAKE_C_FLAGS="-I%LIBRARY_PREFIX%\include" ^
+      "%SRC_DIR%"
+if errorlevel 1 exit 1
+cmake --build . --config Release --target ALL_BUILD
+if errorlevel 1 exit 1
+cmake --build . --config Release --target install
+if errorlevel 1 exit 1
diff --git a/conda/libgpuarray/build.sh b/conda/libgpuarray/build.sh
new file mode 100644
index 0000000000..7e2ea03787
--- /dev/null
+++ b/conda/libgpuarray/build.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Configure in the current directory, the same tree that `cmake --build .` uses below.
+if [[ $(uname) == Darwin ]]; then
+  cmake . -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX="$PREFIX" -DCMAKE_OSX_DEPLOYMENT_TARGET=
+else
+  cmake . -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX="$PREFIX"
+fi
+cmake --build . --config Release --target all
+cmake --build . --config Release --target install
diff --git a/conda/libgpuarray/meta.yaml b/conda/libgpuarray/meta.yaml
new file mode 100644
index 0000000000..a3914e1b17
--- /dev/null
+++ b/conda/libgpuarray/meta.yaml
@@ -0,0 +1,31 @@
+package:
+  name: libgpuarray
+  version: {{ environ.get('GPUARRAY_VERSION') }}
+
+source:
+  path: ../../
+
+build:
+  number: 0
+  features:
+    - vc9     # [win and py27]
+    - vc10    # [win and py34]
+    - vc14    # [win and (py35 or py36)]
+
+requirements:
+  build:
+    - cmake
+    - mako
+    - python  # version doesn't matter here
+  run:
+    - vs2008_runtime  # [win and py27]
+    - vs2010_runtime  # [win and py34]
+    - vs2015_runtime  # [win and (py35 or py36)]
+
+about:
+  home: http://github.com/Theano/libgpuarray
+  license: ISC
+  license_file: LICENSE
+  summary: 'Library to manipulate arrays on GPU'
+  doc_url: http://deeplearning.net/software/libgpuarray/
+  dev_url: http://github.com/Theano/libgpuarray
diff --git a/conda/pygpu/bld.bat b/conda/pygpu/bld.bat
new file mode 100644
index 0000000000..c20afe1dd4
--- /dev/null
+++ b/conda/pygpu/bld.bat
@@ -0,0 +1,3 @@
+set LIB=%LIBRARY_LIB%;%LIB%
+set INCLUDE=%LIBRARY_INC%;%INCLUDE%
+%PYTHON% setup.py install --single-version-externally-managed --record=record.txt
\ No newline at end of file
diff --git a/conda/pygpu/build.sh b/conda/pygpu/build.sh
new file mode 100644
index 0000000000..9a446aa728
--- /dev/null
+++ b/conda/pygpu/build.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+export CFLAGS=${CFLAGS}" -I${PREFIX}/include -L${PREFIX}/lib"
+$PYTHON setup.py install --single-version-externally-managed --record=record.txt
\ No newline at end of file
diff --git a/conda/pygpu/meta.yaml b/conda/pygpu/meta.yaml
new file mode 100644
index 0000000000..716783b22d
--- /dev/null
+++ b/conda/pygpu/meta.yaml
@@ -0,0 +1,36 @@
+{% set version = environ.get('GPUARRAY_VERSION') %}
+
+package:
+  name: pygpu
+  version: {{ version }}
+
+source:
+  path: ../../
+
+build:
+  number: 0
+  detect_binary_files_with_prefix: False
+
+requirements:
+  build:
+    - python
+    - cython >=0.25
+    - numpy 1.11
+    - mako
+    - setuptools
+    - libgpuarray =={{ version }}
+
+  run:
+    - python
+    - {{ pin_compatible('numpy', '1.11') }}
+    - mako
+    - six
+    - libgpuarray =={{ version }}
+
+about:
+  home: http://github.com/Theano/libgpuarray
+  license: ISC
+  license_file: LICENSE
+  summary: 'Library to manipulate arrays on GPU'
+  doc_url: http://deeplearning.net/software/libgpuarray/
+  dev_url: http://github.com/Theano/libgpuarray
diff --git a/doc/Doxyfile b/doc/Doxyfile
index 33e1ad541a..2de3d2331a 100644
--- a/doc/Doxyfile
+++ b/doc/Doxyfile
@@ -329,22 +329,6 @@ INLINE_SIMPLE_STRUCTS  = NO
 
 TYPEDEF_HIDES_STRUCT   = YES
 
-# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to
-# determine which symbols to keep in memory and which to flush to disk.
-# When the cache is full, less often used symbols will be written to disk.
-# For small to medium size projects (<1000 input files) the default value is
-# probably good enough. For larger projects a too small cache size can cause
-# doxygen to be busy swapping symbols to and from disk most of the time
-# causing a significant performance penalty.
-# If the system has enough physical memory increasing the cache will improve the
-# performance by keeping more symbols in memory. Note that the value works on
-# a logarithmic scale so increasing the size by one will roughly double the
-# memory usage. The cache size is given by this formula:
-# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
-# corresponding to a cache size of 2^16 = 65536 symbols.
-
-SYMBOL_CACHE_SIZE      = 0
-
 # Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be
 # set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given
 # their name and scope. Since this can be an expensive process and often the
@@ -365,7 +349,7 @@ LOOKUP_CACHE_SIZE      = 0
 # Private class members and static file members will be hidden unless
 # the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
 
-EXTRACT_ALL            = NO
+EXTRACT_ALL            = YES
 
 # If the EXTRACT_PRIVATE tag is set to YES all private members of a class
 # will be included in the documentation.
@@ -700,7 +684,9 @@ RECURSIVE              = YES
 # Note that relative paths are relative to the directory from which doxygen is
 # run.
 
-EXCLUDE                = ../src/gpuarray/wincompat ../src/gpuarray/compat.h
+EXCLUDE                = ../src/gpuarray/wincompat \
+                         ../src/gpuarray/compat.h \
+                         ../src/private_config.h
 
 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
 # directories that are symbolic links (a Unix file system feature) are excluded
@@ -879,7 +865,7 @@ IGNORE_PREFIX          =
 # If the GENERATE_HTML tag is set to YES (the default) Doxygen will
 # generate HTML output.
 
-GENERATE_HTML          = YES
+GENERATE_HTML          = NO
 
 # The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
 # If a relative path is entered the value of OUTPUT_DIRECTORY will be
@@ -1471,18 +1457,6 @@ GENERATE_XML           = YES
 
 XML_OUTPUT             = xml
 
-# The XML_SCHEMA tag can be used to specify an XML schema,
-# which can be used by a validating XML parser to check the
-# syntax of the XML files.
-
-XML_SCHEMA             =
-
-# The XML_DTD tag can be used to specify an XML DTD,
-# which can be used by a validating XML parser to check the
-# syntax of the XML files.
-
-XML_DTD                =
-
 # If the XML_PROGRAMLISTING tag is set to YES Doxygen will
 # dump the program listings (including syntax highlighting
 # and cross-referencing information) to the XML output. Note that
diff --git a/doc/_static/fix_rtd.css b/doc/_static/fix_rtd.css
new file mode 100644
index 0000000000..7ba86db2ad
--- /dev/null
+++ b/doc/_static/fix_rtd.css
@@ -0,0 +1,11 @@
+/* work around https://github.com/snide/sphinx_rtd_theme/issues/149 */
+.rst-content table.field-list .field-body {
+    padding-top: 8px;
+}
+.rst-versions-up {
+    cursor: pointer;
+    display: inline;
+}
+.wy-side-nav-search>div.version {
+    color: white;
+}
\ No newline at end of file
diff --git a/doc/_static/version_switch.js b/doc/_static/version_switch.js
new file mode 100644
index 0000000000..3c3685a456
--- /dev/null
+++ b/doc/_static/version_switch.js
@@ -0,0 +1,104 @@
+// Create version selector for documentation top bar.
+(function() {
+
+  var url = window.location.href;
+  var base_dir = 'libgpuarray'; // directory containing doc
+  // Default libgpuarray version: release and development.
+  var versions_dir = {"release": "libgpuarray", "dev": "libgpuarray_versions/dev"};
+
+  // If doc is run locally
+  if (url.startsWith('file')) {
+    base_dir = 'html';
+    versions_dir = {"local":"html", "test":"test"};
+  }
+
+  var root_url = url.substring(0, url.search('/' + base_dir)) + '/';
+
+  // Regular expression to find libgpuarray version directory in URL.
+  var version_regex = new RegExp("\\/" + base_dir + "(_versions\\/)?([_a-zA-Z.0-9]*)\\/");
+
+  // Get current version: the matched path segment with its leading and trailing '/' stripped
+  var current_version = url.match(version_regex)[0];
+  current_version = current_version.substring(1, current_version.length - 1);
+
+  // Add current version in case versions.json is unavailable
+  if (current_version != "libgpuarray" && current_version != "html") {
+    var ver = current_version.replace("libgpuarray_versions/", "");  // `var` keeps it out of the global scope
+    versions_dir[ver] = current_version;
+  }
+
+  function build_vswitch() {
+  // Build HTML string for version selector, based on ReadTheDocs theme's versions.html
+
+    var vlabel = current_version.replace("libgpuarray_versions/", "");
+    if (vlabel == 'libgpuarray') {
+      vlabel = 'release';
+    }
+    var vswitch = ['<div class="rst-versions" data-toggle="rst-versions" role="note" aria-label="versions" align=left>'];
+    vswitch.push('<span class="rst-current-version" data-toggle="rst-current-version">');
+    vswitch.push('<span class="fa fa-book"></span>');
+    vswitch.push('v: ', vlabel, ' ');
+    vswitch.push('<span class="fa fa-caret-down"></span>');
+    vswitch.push('</span>');
+
+    vswitch.push('<div class="rst-other-versions">');
+
+    vswitch.push('<dl>');
+    vswitch.push('<dt>Versions</dt>');
+    for (var version in versions_dir) {
+      var new_url = url.replace(url.match(version_regex)[0], '/' + versions_dir[version] + '/');
+      vswitch.push('<dd><a href=\"', new_url, '\">', version, '</a></dd>');
+    }
+    vswitch.push('</dl>');
+
+//    vswitch.push('<dl>');
+//    vswitch.push('<dt>Downloads</dt>');
+//    var pdf_url = root_url + current_version + "/libgpuarray.pdf"
+//    vswitch.push('<dd><a href=\"', pdf_url, '\">', 'PDF', '</a></dd>');
+//    vswitch.push('</dl>');
+
+    vswitch.push('<dl>');
+    vswitch.push('<dt>On GitHub</dt>');
+    var git_master = "https://github.com/Theano/libgpuarray"
+    vswitch.push('<dd><a href=\"', git_master + '\">', 'Fork me', '</a></dd>');
+    vswitch.push('</dl>');
+
+    vswitch.push('</div>');
+    vswitch.push('</div>');
+    return vswitch.join('');
+  }
+
+  function build_vswitch_up() {
+  // Build HTML string for version selector, based on ReadTheDocs theme's versions.html
+
+    var vlabel = current_version.replace("libgpuarray_versions/", "");
+    if (vlabel == 'libgpuarray') {
+      vlabel = 'release';
+    }
+    var vswitch = ['<div class="rst-versions-up" data-toggle="rst-versions" role="note" aria-label="versions" align=center>'];
+    vswitch.push('<span class="rst-current-version" data-toggle="rst-current-version">');
+    vswitch.push(vlabel);
+    vswitch.push('<span class="fa fa-caret-down"></span>');
+    vswitch.push('</span>');
+    vswitch.push('</div>');
+    return vswitch.join('');
+  }
+
+// Create HTML for version switcher and assign to placeholder in layout.html.
+  $(document).ready(function() {
+    // Build default switcher
+    $('.version_switcher_placeholder').html(build_vswitch());
+    $('.version_switcher_placeholder_up').html(build_vswitch_up());
+
+    // Check server for other doc versions and update switcher.
+    if (url.startsWith('http')) {
+      $.getJSON(root_url + 'libgpuarray_versions/versions.json', function(data){
+        $.each(data, function(version, dir) {
+            versions_dir[version] = dir;
+        });
+        $('.version_switcher_placeholder').html(build_vswitch());
+        $('.version_switcher_placeholder_up').html(build_vswitch_up());
+      });
+    }
+  });
+})();
diff --git a/doc/_templates/layout.html b/doc/_templates/layout.html
new file mode 100644
index 0000000000..47d86e61c3
--- /dev/null
+++ b/doc/_templates/layout.html
@@ -0,0 +1,39 @@
+{% extends "!layout.html" %}
+
+{% block footer %}
+{{ super() }}
+<script type="text/javascript">
+  var _gaq = _gaq || [];
+  _gaq.push(['_setAccount', 'UA-168290-9']);
+  _gaq.push(['_trackPageview']);
+
+  (function() {
+    var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
+    ga.src = 'https://ssl.google-analytics.com/ga.js';
+    var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
+  })();
+</script>
+
+<!--Insert version switcher at bottom and top of sidebar-->
+<script type="text/javascript">
+  var span = document.createElement("SPAN")
+  var span_class = document.createAttribute("class");
+  span_class.value = "version_switcher_placeholder";
+  span.setAttributeNode(span_class);
+  var div = document.createElement("DIV");
+  div.appendChild(span);
+  var nav_div = document.getElementsByClassName("wy-nav-side")[0];
+  nav_div.appendChild(div);
+
+  var span_up = document.createElement("SPAN")
+  var span_class_up = document.createAttribute("class");
+  span_class_up.value = "version_switcher_placeholder_up";
+  span_up.setAttributeNode(span_class_up);
+  var nav_div_up = document.getElementsByClassName("version")[0];
+  nav_div_up.appendChild(span_up);
+</script>
+
+<script type="text/javascript"
+    src="{{ pathto('_static/version_switch.js', 1) }}">
+</script>
+{% endblock %}
diff --git a/doc/c_api.rst b/doc/c_api.rst
index cf3185f9a2..d7bbadd5b6 100644
--- a/doc/c_api.rst
+++ b/doc/c_api.rst
@@ -1,5 +1,7 @@
 C library reference
 ===================
 
-.. doxygenindex::
-   :project: gpuarray
+.. toctree::
+
+   c_api/grouplist
+   c_api/filelist
diff --git a/doc/c_api/file/abi__version_8h.rst b/doc/c_api/file/abi__version_8h.rst
new file mode 100644
index 0000000000..8961e38851
--- /dev/null
+++ b/doc/c_api/file/abi__version_8h.rst
@@ -0,0 +1,4 @@
+File abi_version.h
+==================
+
+.. doxygenfile:: abi_version.h
diff --git a/doc/c_api/file/array_8h.rst b/doc/c_api/file/array_8h.rst
new file mode 100644
index 0000000000..5bc914a3ce
--- /dev/null
+++ b/doc/c_api/file/array_8h.rst
@@ -0,0 +1,4 @@
+File array.h
+============
+
+.. doxygenfile:: array.h
diff --git a/doc/c_api/file/blas_8h.rst b/doc/c_api/file/blas_8h.rst
new file mode 100644
index 0000000000..a006abdd1e
--- /dev/null
+++ b/doc/c_api/file/blas_8h.rst
@@ -0,0 +1,4 @@
+File blas.h
+===========
+
+.. doxygenfile:: blas.h
diff --git a/doc/c_api/file/buffer_8h.rst b/doc/c_api/file/buffer_8h.rst
new file mode 100644
index 0000000000..845af0606e
--- /dev/null
+++ b/doc/c_api/file/buffer_8h.rst
@@ -0,0 +1,4 @@
+File buffer.h
+=============
+
+.. doxygenfile:: buffer.h
diff --git a/doc/c_api/file/buffer__blas_8h.rst b/doc/c_api/file/buffer__blas_8h.rst
new file mode 100644
index 0000000000..cd654103d9
--- /dev/null
+++ b/doc/c_api/file/buffer__blas_8h.rst
@@ -0,0 +1,4 @@
+File buffer_blas.h
+==================
+
+.. doxygenfile:: buffer_blas.h
diff --git a/doc/c_api/file/buffer__collectives_8h.rst b/doc/c_api/file/buffer__collectives_8h.rst
new file mode 100644
index 0000000000..053c71bab6
--- /dev/null
+++ b/doc/c_api/file/buffer__collectives_8h.rst
@@ -0,0 +1,4 @@
+File buffer_collectives.h
+=========================
+
+.. doxygenfile:: buffer_collectives.h
diff --git a/doc/c_api/file/cache_8h.rst b/doc/c_api/file/cache_8h.rst
new file mode 100644
index 0000000000..2007da1420
--- /dev/null
+++ b/doc/c_api/file/cache_8h.rst
@@ -0,0 +1,4 @@
+File cache.h
+============
+
+.. doxygenfile:: cache.h
diff --git a/doc/c_api/file/collectives_8h.rst b/doc/c_api/file/collectives_8h.rst
new file mode 100644
index 0000000000..8e65a4367a
--- /dev/null
+++ b/doc/c_api/file/collectives_8h.rst
@@ -0,0 +1,4 @@
+File collectives.h
+==================
+
+.. doxygenfile:: collectives.h
diff --git a/doc/c_api/file/config_8h.rst b/doc/c_api/file/config_8h.rst
new file mode 100644
index 0000000000..24efd0ead4
--- /dev/null
+++ b/doc/c_api/file/config_8h.rst
@@ -0,0 +1,4 @@
+File config.h
+=============
+
+.. doxygenfile:: config.h
diff --git a/doc/c_api/file/dyn__load_8h.rst b/doc/c_api/file/dyn__load_8h.rst
new file mode 100644
index 0000000000..a62f8a5ed2
--- /dev/null
+++ b/doc/c_api/file/dyn__load_8h.rst
@@ -0,0 +1,4 @@
+File dyn_load.h
+===============
+
+.. doxygenfile:: dyn_load.h
diff --git a/doc/c_api/file/elemwise_8h.rst b/doc/c_api/file/elemwise_8h.rst
new file mode 100644
index 0000000000..bb00feb808
--- /dev/null
+++ b/doc/c_api/file/elemwise_8h.rst
@@ -0,0 +1,4 @@
+File elemwise.h
+===============
+
+.. doxygenfile:: elemwise.h
diff --git a/doc/c_api/file/error_8h.rst b/doc/c_api/file/error_8h.rst
new file mode 100644
index 0000000000..5b64637104
--- /dev/null
+++ b/doc/c_api/file/error_8h.rst
@@ -0,0 +1,4 @@
+File error.h
+============
+
+.. doxygenfile:: gpuarray/error.h
diff --git a/doc/c_api/file/ext__cuda_8h.rst b/doc/c_api/file/ext__cuda_8h.rst
new file mode 100644
index 0000000000..2ea42ec8a9
--- /dev/null
+++ b/doc/c_api/file/ext__cuda_8h.rst
@@ -0,0 +1,4 @@
+File ext_cuda.h
+===============
+
+.. doxygenfile:: ext_cuda.h
diff --git a/doc/c_api/file/extension_8h.rst b/doc/c_api/file/extension_8h.rst
new file mode 100644
index 0000000000..dcfed38cee
--- /dev/null
+++ b/doc/c_api/file/extension_8h.rst
@@ -0,0 +1,4 @@
+File extension.h
+================
+
+.. doxygenfile:: extension.h
diff --git a/doc/c_api/file/integerfactoring_8h.rst b/doc/c_api/file/integerfactoring_8h.rst
new file mode 100644
index 0000000000..14aa37162c
--- /dev/null
+++ b/doc/c_api/file/integerfactoring_8h.rst
@@ -0,0 +1,4 @@
+File integerfactoring.h
+=======================
+
+.. doxygenfile:: integerfactoring.h
diff --git a/doc/c_api/file/kernel_8h.rst b/doc/c_api/file/kernel_8h.rst
new file mode 100644
index 0000000000..e5e0842696
--- /dev/null
+++ b/doc/c_api/file/kernel_8h.rst
@@ -0,0 +1,4 @@
+File kernel.h
+=============
+
+.. doxygenfile:: kernel.h
diff --git a/doc/c_api/file/libclblas_8h.rst b/doc/c_api/file/libclblas_8h.rst
new file mode 100644
index 0000000000..e66d2043cc
--- /dev/null
+++ b/doc/c_api/file/libclblas_8h.rst
@@ -0,0 +1,4 @@
+File libclblas.h
+================
+
+.. doxygenfile:: libclblas.h
diff --git a/doc/c_api/file/libclblast_8h.rst b/doc/c_api/file/libclblast_8h.rst
new file mode 100644
index 0000000000..6bb0971391
--- /dev/null
+++ b/doc/c_api/file/libclblast_8h.rst
@@ -0,0 +1,4 @@
+File libclblast.h
+=================
+
+.. doxygenfile:: libclblast.h
diff --git a/doc/c_api/file/libcublas_8h.rst b/doc/c_api/file/libcublas_8h.rst
new file mode 100644
index 0000000000..47546c7598
--- /dev/null
+++ b/doc/c_api/file/libcublas_8h.rst
@@ -0,0 +1,4 @@
+File libcublas.h
+================
+
+.. doxygenfile:: libcublas.h
diff --git a/doc/c_api/file/libcuda_8h.rst b/doc/c_api/file/libcuda_8h.rst
new file mode 100644
index 0000000000..fbc0f3219a
--- /dev/null
+++ b/doc/c_api/file/libcuda_8h.rst
@@ -0,0 +1,4 @@
+File libcuda.h
+==============
+
+.. doxygenfile:: libcuda.h
diff --git a/doc/c_api/file/libnccl_8h.rst b/doc/c_api/file/libnccl_8h.rst
new file mode 100644
index 0000000000..1cb8111268
--- /dev/null
+++ b/doc/c_api/file/libnccl_8h.rst
@@ -0,0 +1,4 @@
+File libnccl.h
+==============
+
+.. doxygenfile:: libnccl.h
diff --git a/doc/c_api/file/libnvrtc_8h.rst b/doc/c_api/file/libnvrtc_8h.rst
new file mode 100644
index 0000000000..7949afc243
--- /dev/null
+++ b/doc/c_api/file/libnvrtc_8h.rst
@@ -0,0 +1,4 @@
+File libnvrtc.h
+===============
+
+.. doxygenfile:: libnvrtc.h
diff --git a/doc/c_api/file/libopencl_8h.rst b/doc/c_api/file/libopencl_8h.rst
new file mode 100644
index 0000000000..a2a1b8e786
--- /dev/null
+++ b/doc/c_api/file/libopencl_8h.rst
@@ -0,0 +1,4 @@
+File libopencl.h
+================
+
+.. doxygenfile:: libopencl.h
diff --git a/doc/c_api/file/private_8h.rst b/doc/c_api/file/private_8h.rst
new file mode 100644
index 0000000000..9d6e0c0a03
--- /dev/null
+++ b/doc/c_api/file/private_8h.rst
@@ -0,0 +1,4 @@
+File private.h
+==============
+
+.. doxygenfile:: private.h
diff --git a/doc/c_api/file/private__cuda_8h.rst b/doc/c_api/file/private__cuda_8h.rst
new file mode 100644
index 0000000000..4ca763829b
--- /dev/null
+++ b/doc/c_api/file/private__cuda_8h.rst
@@ -0,0 +1,4 @@
+File private_cuda.h
+===================
+
+.. doxygenfile:: private_cuda.h
diff --git a/doc/c_api/file/private__opencl_8h.rst b/doc/c_api/file/private__opencl_8h.rst
new file mode 100644
index 0000000000..6e71d1a67a
--- /dev/null
+++ b/doc/c_api/file/private__opencl_8h.rst
@@ -0,0 +1,4 @@
+File private_opencl.h
+=====================
+
+.. doxygenfile:: private_opencl.h
diff --git a/doc/c_api/file/strb_8h.rst b/doc/c_api/file/strb_8h.rst
new file mode 100644
index 0000000000..a87df4558b
--- /dev/null
+++ b/doc/c_api/file/strb_8h.rst
@@ -0,0 +1,4 @@
+File strb.h
+===========
+
+.. doxygenfile:: strb.h
diff --git a/doc/c_api/file/types_8h.rst b/doc/c_api/file/types_8h.rst
new file mode 100644
index 0000000000..b7b6027f14
--- /dev/null
+++ b/doc/c_api/file/types_8h.rst
@@ -0,0 +1,4 @@
+File types.h
+============
+
+.. doxygenfile:: types.h
diff --git a/doc/c_api/file/util_8h.rst b/doc/c_api/file/util_8h.rst
new file mode 100644
index 0000000000..470b783b3d
--- /dev/null
+++ b/doc/c_api/file/util_8h.rst
@@ -0,0 +1,4 @@
+File util.h
+===========
+
+.. doxygenfile:: util.h
diff --git a/doc/c_api/file/xxhash_8h.rst b/doc/c_api/file/xxhash_8h.rst
new file mode 100644
index 0000000000..9f69b14389
--- /dev/null
+++ b/doc/c_api/file/xxhash_8h.rst
@@ -0,0 +1,4 @@
+File xxhash.h
+=============
+
+.. doxygenfile:: xxhash.h
diff --git a/doc/c_api/filelist.rst b/doc/c_api/filelist.rst
new file mode 100644
index 0000000000..78a5f5378b
--- /dev/null
+++ b/doc/c_api/filelist.rst
@@ -0,0 +1,7 @@
+File list
+=========
+
+.. toctree::
+   :glob:
+
+   file/*
diff --git a/doc/c_api/group/group__aflags.rst b/doc/c_api/group/group__aflags.rst
new file mode 100644
index 0000000000..352d021f77
--- /dev/null
+++ b/doc/c_api/group/group__aflags.rst
@@ -0,0 +1,5 @@
+Group aflags
+============
+
+.. doxygengroup:: aflags
+   :no-link:
diff --git a/doc/c_api/group/group__alloc__flags.rst b/doc/c_api/group/group__alloc__flags.rst
new file mode 100644
index 0000000000..bdfb08cd9f
--- /dev/null
+++ b/doc/c_api/group/group__alloc__flags.rst
@@ -0,0 +1,5 @@
+Group alloc_flags
+=================
+
+.. doxygengroup:: alloc_flags
+   :no-link:
diff --git a/doc/c_api/group/group__eflags.rst b/doc/c_api/group/group__eflags.rst
new file mode 100644
index 0000000000..4e7ae16da3
--- /dev/null
+++ b/doc/c_api/group/group__eflags.rst
@@ -0,0 +1,5 @@
+Group eflags
+============
+
+.. doxygengroup:: eflags
+   :no-link:
diff --git a/doc/c_api/group/group__elem__call__flags.rst b/doc/c_api/group/group__elem__call__flags.rst
new file mode 100644
index 0000000000..8aece59927
--- /dev/null
+++ b/doc/c_api/group/group__elem__call__flags.rst
@@ -0,0 +1,5 @@
+Group elem_call_flags
+=====================
+
+.. doxygengroup:: elem_call_flags
+   :no-link:
diff --git a/doc/c_api/group/group__elem__flags.rst b/doc/c_api/group/group__elem__flags.rst
new file mode 100644
index 0000000000..9134988bd3
--- /dev/null
+++ b/doc/c_api/group/group__elem__flags.rst
@@ -0,0 +1,5 @@
+Group elem_flags
+================
+
+.. doxygengroup:: elem_flags
+   :no-link:
diff --git a/doc/c_api/group/group__props.rst b/doc/c_api/group/group__props.rst
new file mode 100644
index 0000000000..cf950efe80
--- /dev/null
+++ b/doc/c_api/group/group__props.rst
@@ -0,0 +1,5 @@
+Group props
+===========
+
+.. doxygengroup:: props
+   :no-link:
diff --git a/doc/c_api/grouplist.rst b/doc/c_api/grouplist.rst
new file mode 100644
index 0000000000..f63ef2179a
--- /dev/null
+++ b/doc/c_api/grouplist.rst
@@ -0,0 +1,7 @@
+Group list
+==========
+
+.. toctree::
+   :glob:
+
+   group/*
diff --git a/doc/conf.py b/doc/conf.py
index 9b5da342ef..0d661ba3ee 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -17,6 +17,7 @@
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 sys.path.insert(0, os.path.abspath('..'))
+import versioneer
 
 # -- General configuration -----------------------------------------------------
 
@@ -25,8 +26,16 @@
 
 # Add any Sphinx extension module names here, as strings. They can be extensions
 # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
-extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.todo',
-              'sphinx.ext.ifconfig', 'breathe']
+extensions = ['sphinx.ext.autodoc',
+              'sphinx.ext.doctest',
+              'sphinx.ext.todo',
+              'sphinx.ext.napoleon',
+#              'sphinx.ext.linkcode',
+              'breathe']
+
+todo_include_todos = True
+napoleon_google_docstring = False
+napoleon_include_special_with_doc = False
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
@@ -42,16 +51,23 @@
 
 # General information about the project.
 project = u'gpuarray'
-copyright = u'2012, Arnaud Bergeron'
+copyright = u'2012--2017, Arnaud Bergeron'
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
 #
-# The short X.Y version.
-version = '0.2'
+
+# We need this hokey-pokey because versioneer needs the current
+# directory to be the root of the project to work.
+_curpath = os.getcwd()
+os.chdir(os.path.dirname(os.path.dirname(__file__)))
 # The full version, including alpha/beta/rc tags.
-release = '0.2'
+release = versioneer.get_version()
+# The short X.Y version.
+version = '.'.join(release.split('.')[:2])
+os.chdir(_curpath)
+del _curpath
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -61,21 +77,21 @@
 # non-false value, then it is used:
 #today = ''
 # Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
+today_fmt = '%B %d, %Y'
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build']
+exclude_dirs = ['_build', 'scripts']
 
 # The reST default role (used for this markup: `text`) to use for all documents.
 #default_role = None
 
 # If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
+add_function_parentheses = True
 
 # If true, the current module name will be prepended to all description
 # unit titles (such as .. function::).
-#add_module_names = True
+add_module_names = True
 
 # If true, sectionauthor and moduleauthor directives will be shown in the
 # output. They are ignored by default.
@@ -87,12 +103,20 @@
 # A list of ignored prefixes for module index sorting.
 #modindex_common_prefix = []
 
-
 # -- Options for HTML output ---------------------------------------------------
 
-# The theme to use for HTML and HTML Help pages.  See the documentation for
-# a list of builtin themes.
-html_theme = 'default'
+
+if os.environ.get('READTHEDOCS') != 'True':
+    try:
+        import sphinx_rtd_theme
+    except ImportError:
+        pass
+    else:
+        html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+    html_theme = 'sphinx_rtd_theme'
+
+def setup(app):
+    app.add_stylesheet('fix_rtd.css')
 
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
@@ -125,11 +149,7 @@
 
 # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
 # using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
-
-# If true, SmartyPants will be used to convert quotes and dashes to
-# typographically correct entities.
-#html_use_smartypants = True
+html_last_updated_fmt = '%b %d, %Y'
 
 # Custom sidebar templates, maps document names to template names.
 #html_sidebars = {}
@@ -168,6 +188,37 @@
 htmlhelp_basename = 'gpuarraydoc'
 
 
+# Options for the linkcode extension
+# ----------------------------------
+# Resolve function
+# This function is used to populate the (source) links in the API
+
+# XXX: This is broken for now since it doesn't work for cython modules
+def linkcode_resolve(domain, info):
+    def find_source():
+        obj = sys.modules[info['module']]
+        for part in info['fullname'].split('.'):
+            obj = getattr(obj, part)
+        import inspect
+        import os
+        fn = inspect.getsourcefile(obj)
+        fn = os.path.relpath(fn, start=os.path.dirname(pygpu.__file__))
+        source, lineno = inspect.getsourcelines(obj)
+        return fn, lineno, lineno + len(source) - 1
+
+    if domain != 'py' or not info['module']:
+        return None
+    try:
+        filename = 'libgpuarray/pygpu/%s#L%d-L%d' % find_source()
+    except Exception:
+        filename = info['module'].replace('.', '/') + '.py'
+    import subprocess
+    tag = subprocess.Popen(['git', 'rev-parse', 'HEAD'],
+                           stdout=subprocess.PIPE,
+                           universal_newlines=True).communicate()[0][:-1]
+    return "https://github.com/Theano/libgpuarray/blob/%s/%s" % (tag, filename)
+
+
 # -- Options for LaTeX output --------------------------------------------------
 
 latex_elements = {
diff --git a/doc/index.rst b/doc/index.rst
index 79496a86cd..86524aec29 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -13,7 +13,7 @@ Contents:
 
    why
    installation
-   pyapi
+   pyapi/pygpu
    c_api
 
 Indices and tables
diff --git a/doc/installation.rst b/doc/installation.rst
index 10cfb682f5..dfd96f5ace 100644
--- a/doc/installation.rst
+++ b/doc/installation.rst
@@ -1,39 +1,70 @@
 Installation
 ============
 
-The library is routinely tested on OS X and linux and, less
-frequently, on Windows.  The OS most frequently tested are:
+The library is routinely tested on Linux and, less frequently, on
+Windows and Mac OS X.  The operating systems most frequently tested are:
 
  - Debian 6
- - Ubuntu 14.04
- - Mac OS X 10.11
+ - Ubuntu 16.04
+ - macOS 10.12
  - Windows 7
 
 It should also work on any decently recent OS not listed here. If you
 get an error during the build on your favorite OS, please report it
 and we will attempt to fix it.
 
-Requirements
-------------
+Conda
+-----
+
+The easiest way to install libgpuarray is with conda::
+
+  conda install pygpu
+
+This will also install the libgpuarray package automatically.
+
+This should work on Linux, Mac OS and Windows.
+
+Packages are also available from conda-forge, and they may be more
+up to date::
+
+  conda install -c conda-forge pygpu
+
+Build Requirements
+------------------
 
  - cmake >= 3.0 (cmake_).
  - a c99-compliant compiler (or MSVC if on windows).
- - (optional) CUDA >= 6.5 (cuda_).
- - (optional) OpenCL runtime.
- - (optional) clBLAS (clblas_).
  - (optional) libcheck (check_) to run the C tests.
  - (optional) python (python_) for the python bindings.
  - (optional) mako (mako_) for development or running the python bindings.
- - (optional) Cython >= 0.21 (cython_) for the python bindings.
+ - (optional) Cython >= 0.25 (cython_) for the python bindings.
  - (optional) nosetests (nosetests_) to run the python tests.
 
-.. note::
-   If you have neither an OpenCL runtime or a CUDA runtime, the
-   library might still build, but will be rather useless.
+Run Requirements
+----------------
+
+No matter what was available at build time, this library comes with
+dynamic loaders for the following libraries.  You don't need to have
+any of them available, but without them you won't be able to use the
+associated functionality.
+
+ * For CUDA:
+
+   - CUDA (cuda_) version 7.0 or more, with the appropriate driver
+   - (optional) NCCL (nccl_) for the collectives interface
+
+ * For OpenCL:
+
+   - OpenCL version 1.2 or more
+   - (optional) clBLAS (clblas_) or CLBlast (clblast_) for blas functionality
+
+   .. note::
+
+      The OpenCL that comes with OS X is fundamentally broken and
+      doesn't work with some of the kernels in the library.  You can
+      use it at your own risk, but don't report problems with it; we
+      can't fix them.
 
-.. note::
-   We support CUDA GPUs with `compute capability 2.0 (Fermi)
-   <https://developer.nvidia.com/cuda-gpus>`_ and up.
 
 Download
 --------
@@ -43,8 +74,8 @@ Download
   git clone https://github.com/Theano/libgpuarray.git
   cd libgpuarray
 
-Step-by-step install
---------------------
+Step-by-step install: system library (as admin)
+-----------------------------------------------
 
 extract/clone the source to <dir>
 
@@ -74,25 +105,36 @@ like this:
 
   python setup.py build_ext -L $MY_PREFIX/lib -I $MY_PREFIX/include
 
+If installed globally under Linux (in /usr/local), you might have to run:
+
+.. code-block:: bash
+
+   $ sudo ldconfig
+
+to make the linker know that there are new libraries available.  You
+can also reboot the machine to do that.
+
+
+Step-by-step install: user library
+----------------------------------
 
 If you can not or do not want to install it for every user of that
 computer, you can install them in your home directory like this:
 ::
 
   cd <dir>
+  rm -rf ~/.local/lib/libgpuarray* ~/.local/include/gpuarray  ~/.local/lib/python*/site-packages/pygpu*
   rm -rf build Build
   mkdir Build
   cd Build
   cmake .. -DCMAKE_INSTALL_PREFIX=~/.local -DCMAKE_BUILD_TYPE=Release
   make
   make install
-  make test
+  DEVICE="<test device>" make test
 
   cd ..
 
   # Run the following export and add them in your ~/.bashrc file
-  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.local/lib64/
-  export LIBRARY_PATH=$LIBRARY_PATH:~/.local/lib64/
   export CPATH=$CPATH:~/.local/include
   export LIBRARY_PATH=$LIBRARY_PATH:~/.local/lib
   export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.local/lib
@@ -100,41 +142,30 @@ computer, you can install them in your home directory like this:
   python setup.py build
   python setup.py install --user
   cd
-  python -c "import pygpu;pygpu.test()"
-
-
-Linux-specific instructions
----------------------------
-
-If installed globally (in /usr/local), you might have to run:
-
-.. code-block:: bash
-
-   $ sudo ldconfig
-
-to make the linker know that there are new libraries available.  You
-can also reboot the machine to do that.
+  DEVICE="<test device>" python -c "import pygpu;pygpu.test()"
 
+Change ``DEVICE="<test device>"`` to the GPU device you want to use for testing.
 
 Mac-specific instructions
 -------------------------
 
-To get the compiler you need to install Xcode which is available for
-free from the App Store.  Don't forget to install the command-line
-tools afterwards.
-
-On Xcode 4.x these are installed by going to the download tab of the
-preferences window and selecting the "Command-line Tools" download.
-
-If you have Xcode 5, ensure you update to 5.0.2 or later.  Prior
-versions will not look in /usr/local for includes or libraries and
-this will cause a lot of errors.  You can update by using the
-"Software Update..." function of the Apple menu or by running
-'xcode-select --install' on the command line.
+The only supported compiler is the clang version that comes with
+Xcode.  Select the appropriate version of Xcode for your version of
+macOS.
 
 It might be possible to use a version of gcc built using Homebrew or
 MacPorts, but this is untested and unsupported.
 
+If on OS X 10.11 or macOS 10.12 and later and using the system python,
+you will have to use a virtualenv to use the python module.  This is
+due to a restriction of the new SIP feature about loading libraries.
+
+It appears that on some versions, /usr/local is not in the default
+compiler paths so you might need to add ``-L /usr/local/lib -I
+/usr/local/include`` to the ``setup.py build`` command or export the
+paths like for a custom path install.
+
+
 Windows-specific instructions
 -----------------------------
 
@@ -148,13 +179,14 @@ If you prefer a command-line approach, cmake is available as a console
 program with the same options as the Unix variant.  You can select the
 nmake builder by passing ``-G "NMake Makefiles"`` to cmake.
 
-Since there is no standard install location on Windows, there is no
-install step.  It is up to you to copy the headers and libraries to an
-appropriate place.
+There is no standard install location on Windows, but you can specify a custom
+location by passing ``-DCMAKE_INSTALL_PREFIX=%LIBDIR%`` to cmake. You can then
+install using ``cmake --build . --target install`` after ``nmake``.
 
-If you don't have Visual Studio installed, you can get the free
-Express version from `here <http://www.visualstudio.com/>`_ in the
-downloads section (select the "for Windows" edition).
+If you don't have Visual Studio installed, you can get the free `Visual Studio
+Community edition <https://www.visualstudio.com/vs/community/>`_,
+which has compilation tools for python 3.5 and up. For python 2.7, install
+`Microsoft Visual C++ Compiler for Python 2.7 <https://www.microsoft.com/en-ca/download/details.aspx?id=44266>`_.
 
 .. warning::
    While you may get the library to compile using cygwin, this is not
@@ -171,9 +203,8 @@ Running Tests
    everything is ok even if you intend on just using the C library.
 
 To run the C tests, enter the build directory (the one where you ran
-cmake) and run 'make test'.  It will run using the first OpenCL and
-the first CUDA device it finds skipping these if the corresponding
-backend wasn't built.
+cmake), select a target device by exporting DEVICE (or
+GPUARRAY_TEST_DEVICE) and run 'make test'.
 
 If you get an error message similar to this one:
 
@@ -186,7 +217,7 @@ If you get an error message similar to this one:
 This means either you don't have check installed or it wasn't found by
 the cmake detection script.
 
-To run the python tests, install pygpu, then move outside its
+To run the python tests, install pygpu, then **move outside** its
 directory and run this command:
 
 ::
@@ -205,18 +236,22 @@ you can confirm which device it is running on.
    only the codename of the architecture the GPU belongs to (e.g.
    'Tahiti').
 
-.. _cmake: http://cmake.org/
+.. _cmake: https://cmake.org/
 
 .. _clblas: https://github.com/clMathLibraries/clBLAS
 
+.. _clblast: https://github.com/CNugteren/CLBlast
+
 .. _cuda: https://developer.nvidia.com/category/zone/cuda-zone
 
+.. _nccl: https://github.com/NVIDIA/nccl
+
 .. _check: http://check.sourceforge.net/
 
-.. _python: http://python.org/
+.. _python: https://python.org/
 
 .. _cython: http://cython.org/
 
-.. _nosetests: http://nose.readthedocs.org/en/latest/
+.. _nosetests: https://nose.readthedocs.org/en/latest/
 
 .. _mako: http://www.makotemplates.org/
diff --git a/doc/pyapi.rst b/doc/pyapi.rst
deleted file mode 100644
index 8a5b94837a..0000000000
--- a/doc/pyapi.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-Python module reference
-=======================
-
-.. automodule:: pygpu
-   :members:
-
-   .. automodule:: pygpu.gpuarray
-      :members:
-
-   .. automodule:: pygpu.elemwise
-      :members: ElemwiseKernel
-
-   .. automodule:: pygpu.reduction
-      :members: ReductionKernel
-
-   .. automodule:: pygpu._array
-      :members:
diff --git a/doc/pyapi/pygpu.rst b/doc/pyapi/pygpu.rst
new file mode 100644
index 0000000000..2e30064900
--- /dev/null
+++ b/doc/pyapi/pygpu.rst
@@ -0,0 +1,65 @@
+pygpu package
+=============
+
+pygpu.gpuarray module
+---------------------
+
+.. automodule:: pygpu.gpuarray
+    :members:
+    :undoc-members:
+
+pygpu.elemwise module
+---------------------
+
+.. automodule:: pygpu.elemwise
+    :members:
+    :undoc-members:
+
+pygpu.operations module
+-----------------------
+
+.. automodule:: pygpu.operations
+    :members:
+    :undoc-members:
+
+pygpu.reduction module
+----------------------
+
+.. automodule:: pygpu.reduction
+    :members:
+    :undoc-members:
+
+pygpu.blas module
+-----------------
+
+.. automodule:: pygpu.blas
+    :members:
+    :undoc-members:
+
+pygpu.collectives module
+------------------------
+
+.. automodule:: pygpu.collectives
+    :members:
+    :undoc-members:
+
+pygpu.dtypes module
+-------------------
+
+.. automodule:: pygpu.dtypes
+    :members:
+    :undoc-members:
+
+pygpu.tools module
+------------------
+
+.. automodule:: pygpu.tools
+    :members:
+    :undoc-members:
+
+Module contents
+---------------
+
+.. automodule:: pygpu
+    :members:
+    :undoc-members:
diff --git a/make.bat b/make.bat
new file mode 100755
index 0000000000..3402206e00
--- /dev/null
+++ b/make.bat
@@ -0,0 +1,10 @@
+REM This helps repetitive builds on windows
+REM It needs the compiler you want to use to be available in the shell
+REM and it will build a release version
+REM Remove any previous build tree ("del" only deletes files and prompts)
+if exist bld rmdir /s /q bld
+mkdir bld
+cd bld
+cmake .. -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release
+cmake --build . --config Release
+cd ..
diff --git a/pygpu/__init__.py b/pygpu/__init__.py
index 3ef62bc746..efdbd7a8bb 100644
--- a/pygpu/__init__.py
+++ b/pygpu/__init__.py
@@ -12,10 +12,13 @@ def get_include():
                          concatenate, hstack, vstack, dstack)
 from ._array import ndgpuarray
 
-from .tests import main
-if hasattr(main, "NoseTester"):
-    test = main.NoseTester().test
-else:
-    def test():
-        raise ImportError("The nose module is not installed."
-                          " It is needed for pygpu tests.")
+from ._version import get_versions
+__version__ = get_versions()['version']
+del get_versions
+
+def test():
+    from . import tests
+    from .tests import main
+    if hasattr(main, "NoseTester"):
+        main.NoseTester(package=tests).test()
+
diff --git a/pygpu/_array.py b/pygpu/_array.py
index 14d5879010..2fc793f838 100644
--- a/pygpu/_array.py
+++ b/pygpu/_array.py
@@ -143,11 +143,11 @@ def __divmod__(self, other):
         mod = self._empty_like_me(dtype=odtype)
 
         if odtype.kind == 'f':
-            tmpl = "div = floor((%(out_t)s)a / (%(out_t)s)b)," \
-                "mod = fmod((%(out_t)s)a, (%(out_t)s)b)"
+            tmpl = ("div = floor((%(out_t)s)a / (%(out_t)s)b),"
+                    "mod = fmod((%(out_t)s)a, (%(out_t)s)b)")
         else:
-            tmpl = "div = (%(out_t)s)a / (%(out_t)s)b," \
-                "mod = a %% b"
+            tmpl = ("div = (%(out_t)s)a / (%(out_t)s)b,"
+                    "mod = a %% b")
 
         ksrc = tmpl % {'out_t': dtype_to_ctype(odtype)}
 
@@ -168,11 +168,11 @@ def __rdivmod__(self, other):
         mod = self._empty_like_me(dtype=odtype)
 
         if odtype.kind == 'f':
-            tmpl = "div = floor((%(out_t)s)a / (%(out_t)s)b)," \
-                "mod = fmod((%(out_t)s)a, (%(out_t)s)b)"
+            tmpl = ("div = floor((%(out_t)s)a / (%(out_t)s)b),"
+                    "mod = fmod((%(out_t)s)a, (%(out_t)s)b)")
         else:
-            tmpl = "div = (%(out_t)s)a / (%(out_t)s)b," \
-                "mod = a %% b"
+            tmpl = ("div = (%(out_t)s)a / (%(out_t)s)b,"
+                    "mod = a %% b")
 
         ksrc = tmpl % {'out_t': dtype_to_ctype(odtype)}
 
diff --git a/pygpu/_elemwise.pyx b/pygpu/_elemwise.pyx
index 4fb80f6399..3e71da7e1f 100644
--- a/pygpu/_elemwise.pyx
+++ b/pygpu/_elemwise.pyx
@@ -1,4 +1,4 @@
-from pygpu.gpuarray import GpuArrayException
+from pygpu.gpuarray import GpuArrayException, UnsupportedException
 from pygpu.gpuarray cimport (gpucontext, GA_NO_ERROR, get_typecode,
                              typecode_to_dtype, GpuContext, GpuArray,
                              get_exc, gpuarray_get_elsize)
@@ -15,6 +15,11 @@ cdef bytes to_bytes(s):
       return <bytes>(<unicode>s).encode('ascii')
   raise TypeError("Can't convert to bytes")
 
+cdef extern from "gpuarray/buffer.h":
+    ctypedef struct gpucontext:
+        pass
+    char *gpucontext_error(gpucontext *ctx, int err)
+
 cdef extern from "gpuarray/elemwise.h":
     ctypedef struct _GpuElemwise "GpuElemwise":
         pass
@@ -40,6 +45,7 @@ cdef extern from "gpuarray/elemwise.h":
 
     cdef int GE_BROADCAST
     cdef int GE_NOCOLLAPSE
+    cdef int GE_PADSHAPE
 
 
 cdef class arg:
@@ -49,6 +55,9 @@ cdef class arg:
         memset(&self.a, 0, sizeof(gpuelemwise_arg))
 
     def __init__(self, name, type, read=False, write=False, scalar=False):
+        # Make sure to clear previous storage
+        # __init__ may be called more than once
+        free(self.a.name)
         self.a.name = strdup(to_bytes(name))
         if self.a.name is NULL:
             raise MemoryError
@@ -60,6 +69,11 @@ cdef class arg:
             self.a.flags |= GE_WRITE
         if scalar:
             self.a.flags |= GE_SCALAR
+        if self.a.flags == 0:
+            raise ValueError('no flags specified for arg %s' % (name,))
+
+    def __dealloc__(self):
+        free(self.a.name)
 
     property name:
         def __get__(self):
@@ -132,7 +146,11 @@ cdef class GpuElemwise:
         finally:
             free(_args)
         if self.ge is NULL:
-            raise GpuArrayException("Could not initialize C GpuElemwise instance")
+            error_message = gpucontext_error(ctx.ctx, 0).decode(encoding='latin-1')
+            # getting the error type this way is fragile, but the alternative is breaking ABI
+            raise (UnsupportedException if
+            "This device does not support double precision" in error_message else
+             GpuArrayException)("Could not initialize C GpuElemwise instance: " + error_message)
 
     def __dealloc__(self):
         cdef unsigned int i
@@ -185,9 +203,19 @@ cdef class GpuElemwise:
     def __call__(self, *args, **kwargs):
         cdef unsigned int i
         cdef int err
+        cdef int flags
+
+        flags = 0
+        if kwargs.pop('broadcast', True):
+            flags |= GE_BROADCAST
+        if kwargs.pop('padshape', True):
+            flags |= GE_PADSHAPE
+
+        if len(kwargs) != 0:
+            raise TypeError("Unknown keyword argument: %s" % list(kwargs.keys())[0])
 
         for i, arg in enumerate(args):
             self._setarg(i, arg)
-        err = GpuElemwise_call(self.ge, self.callbuf, GE_BROADCAST if kwargs.get('broadcast', True) else 0)
+        err = GpuElemwise_call(self.ge, self.callbuf, flags)
         if err != GA_NO_ERROR:
             raise get_exc(err)("Could not call GpuElemwise")
diff --git a/pygpu/_version.py b/pygpu/_version.py
new file mode 100644
index 0000000000..434e940fe1
--- /dev/null
+++ b/pygpu/_version.py
@@ -0,0 +1,520 @@
+
+# This file helps to compute a version number in source trees obtained from
+# git-archive tarball (such as those provided by GitHub's download-from-tag
+# feature). Distribution tarballs (built by setup.py sdist) and build
+# directories (produced by setup.py build) will contain a much shorter file
+# that just contains the computed version number.
+
+# This file is released into the public domain. Generated by
+# versioneer-0.18 (https://github.com/warner/python-versioneer)
+
+"""Git implementation of _version.py."""
+
+import errno
+import os
+import re
+import subprocess
+import sys
+
+
+def get_keywords():
+    """Return the git-archive keyword strings as a refnames/full/date dict."""
+    # These placeholder strings are replaced by git during git-archive.
+    # setup.py/versioneer.py will grep for the variable names, so they must
+    # each be defined on a line of their own. _version.py will just call
+    # get_keywords().
+    git_refnames = "$Format:%d$"
+    git_full = "$Format:%H$"
+    git_date = "$Format:%ci$"
+    keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
+    return keywords
+
+
+class VersioneerConfig:
+    """Attribute container for the Versioneer configuration parameters."""
+
+
+def get_config():
+    """Build a VersioneerConfig holding this project's fixed settings."""
+    # the values below are filled in when 'setup.py versioneer' creates
+    # _version.py
+    settings = VersioneerConfig()
+    for attr, value in (("VCS", "git"), ("style", "pep440"),
+                        ("tag_prefix", "v"),
+                        ("parentdir_prefix", "libgpuarray-"),
+                        ("versionfile_source", "pygpu/_version.py"),
+                        ("verbose", False)):
+        setattr(settings, attr, value)
+    return settings
+
+
+class NotThisMethod(Exception):
+    """Raised when a version lookup method is not valid in this scenario."""
+
+
+LONG_VERSION_PY = {}
+HANDLERS = {}
+
+
+def register_vcs_handler(vcs, method):  # decorator
+    """Return a decorator that files a function under HANDLERS[vcs][method]."""
+    def decorate(f):
+        """Record f as the handler for (vcs, method) and hand it back."""
+        # setdefault creates the per-VCS table the first time it is needed
+        HANDLERS.setdefault(vcs, {})[method] = f
+        return f
+    # the caller applies the returned closure with the @ syntax
+    return decorate
+
+
+def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
+                env=None):
+    """Run the first launchable command; return (stdout, returncode)."""
+    assert isinstance(commands, list)
+    p = None
+    for c in commands:
+        try:
+            dispcmd = str([c] + args)
+            # remember shell=False, so use git.cmd on windows, not just git
+            p = subprocess.Popen([c] + args, cwd=cwd, env=env,
+                                 stdout=subprocess.PIPE,
+                                 stderr=(subprocess.PIPE if hide_stderr
+                                         else None))
+            break
+        except EnvironmentError:
+            e = sys.exc_info()[1]
+            if e.errno == errno.ENOENT:
+                continue
+            if verbose:
+                print("unable to run %s" % dispcmd)
+                print(e)
+            return None, None
+    else:
+        if verbose:
+            print("unable to find command, tried %s" % (commands,))
+        return None, None
+    stdout = p.communicate()[0].strip()
+    if sys.version_info[0] >= 3:
+        stdout = stdout.decode()
+    if p.returncode != 0:
+        if verbose:
+            print("unable to run %s (error)" % dispcmd)
+            print("stdout was %s" % stdout)
+        return None, p.returncode
+    return stdout, p.returncode
+
+
+def versions_from_parentdir(parentdir_prefix, root, verbose):
+    """Derive the version from the name of an enclosing directory.
+
+    Unpacked source tarballs usually sit in a directory named after the
+    project and its version. ``root`` and the two directory levels above it
+    are checked for a basename that starts with ``parentdir_prefix``.
+    """
+    tried = []
+    candidate = root
+    for _ in range(3):
+        name = os.path.basename(candidate)
+        if name.startswith(parentdir_prefix):
+            # whatever follows the prefix is taken as the version string
+            version = name[len(parentdir_prefix):]
+            return {"version": version, "full-revisionid": None,
+                    "dirty": False, "error": None, "date": None}
+        tried.append(candidate)
+        candidate = os.path.dirname(candidate)  # up a level
+
+    if verbose:
+        message = "Tried directories %s but none started with prefix %s"
+        print(message % (str(tried), parentdir_prefix))
+    raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
+
+
+@register_vcs_handler("git", "get_keywords")
+def git_get_keywords(versionfile_abs):
+    """Extract the git-archive keyword values from the given version file.
+
+    Returns a dict with whichever of "refnames", "full" and "date" were
+    found; it is empty when the file cannot be opened or read.
+    """
+    # the code embedded in _version.py can just fetch the value of these
+    # keywords. When used from setup.py, we don't want to import _version.py,
+    # so we do it with a regexp instead. This function is not used from
+    # _version.py.
+    variables = (("git_refnames =", "refnames"), ("git_full =", "full"),
+                 ("git_date =", "date"))
+    keywords = {}
+    try:
+        # the with block closes the file even if reading fails part-way
+        with open(versionfile_abs, "r") as f:
+            for line in f.readlines():
+                stripped = line.strip()
+                for prefix, key in variables:
+                    if stripped.startswith(prefix):
+                        mo = re.search(r'=\s*"(.*)"', line)
+                        if mo:
+                            keywords[key] = mo.group(1)
+    except EnvironmentError:
+        pass
+    return keywords
+
+
+@register_vcs_handler("git", "keywords")
+def git_versions_from_keywords(keywords, tag_prefix, verbose):
+    """Build the version dict from expanded git-archive keywords."""
+    if not keywords:
+        raise NotThisMethod("no keywords at all, weird")
+    date = keywords.get("date")
+    if date is not None:
+        # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant
+        # datestamp. However we prefer "%ci" (which expands to an "ISO-8601
+        # -like" string, which we must then edit to make compliant), because
+        # it's been around since git-1.5.3, and it's too difficult to
+        # discover which version we're using, or to work around using an
+        # older one.
+        date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
+    refnames = keywords["refnames"].strip()
+    if refnames.startswith("$Format"):
+        if verbose:
+            print("keywords are unexpanded, not using")
+        raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
+    refs = set([r.strip() for r in refnames.strip("()").split(",")])
+    # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
+    # just "foo-1.0". If we see a "tag: " prefix, prefer those.
+    TAG = "tag: "
+    tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)])
+    if not tags:
+        # Either we're using git < 1.8.3, or there really are no tags. We use
+        # a heuristic: assume all version tags have a digit. The old git %d
+        # expansion behaves like git log --decorate=short and strips out the
+        # refs/heads/ and refs/tags/ prefixes that would let us distinguish
+        # between branches and tags. By ignoring refnames without digits, we
+        # filter out many common branch names like "release" and
+        # "stabilization", as well as "HEAD" and "master".
+        tags = set([r for r in refs if re.search(r'\d', r)])
+        if verbose:
+            print("discarding '%s', no digits" % ",".join(refs - tags))
+    if verbose:
+        print("likely tags: %s" % ",".join(sorted(tags)))
+    for ref in sorted(tags):
+        # sorting will prefer e.g. "2.0" over "2.0rc1"
+        if ref.startswith(tag_prefix):
+            r = ref[len(tag_prefix):]
+            if verbose:
+                print("picking %s" % r)
+            return {"version": r,
+                    "full-revisionid": keywords["full"].strip(),
+                    "dirty": False, "error": None,
+                    "date": date}
+    # no suitable tags, so version is "0+unknown", but full hex is still there
+    if verbose:
+        print("no suitable tags, using unknown + full revision id")
+    return {"version": "0+unknown",
+            "full-revisionid": keywords["full"].strip(),
+            "dirty": False, "error": "no suitable tags", "date": None}
+
+
+@register_vcs_handler("git", "pieces_from_vcs")
+def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
+    """Get version from 'git describe' in the root of the source tree.
+
+    This only gets called if the git-archive 'subst' keywords were *not*
+    expanded, and _version.py hasn't already been rewritten with a short
+    version string, meaning we're inside a checked out source tree.
+    """
+    GITS = ["git"]
+    if sys.platform == "win32":
+        GITS = ["git.cmd", "git.exe"]
+
+    out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root,
+                          hide_stderr=True)
+    if rc != 0:
+        if verbose:
+            print("Directory %s not under git control" % root)
+        raise NotThisMethod("'git rev-parse --git-dir' returned error")
+
+    # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
+    # if there isn't one, this yields HEX[-dirty] (no NUM)
+    describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty",
+                                          "--always", "--long",
+                                          "--match", "%s*" % tag_prefix],
+                                   cwd=root)
+    # --long was added in git-1.5.5
+    if describe_out is None:
+        raise NotThisMethod("'git describe' failed")
+    describe_out = describe_out.strip()
+    full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root)
+    if full_out is None:
+        raise NotThisMethod("'git rev-parse' failed")
+    full_out = full_out.strip()
+
+    pieces = {}
+    pieces["long"] = full_out
+    pieces["short"] = full_out[:7]  # maybe improved later
+    pieces["error"] = None
+
+    # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
+    # TAG might have hyphens.
+    git_describe = describe_out
+
+    # look for -dirty suffix
+    dirty = git_describe.endswith("-dirty")
+    pieces["dirty"] = dirty
+    if dirty:
+        git_describe = git_describe[:git_describe.rindex("-dirty")]
+
+    # now we have TAG-NUM-gHEX or HEX
+
+    if "-" in git_describe:
+        # TAG-NUM-gHEX
+        mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
+        if not mo:
+            # unparseable. Maybe git-describe is misbehaving?
+            pieces["error"] = ("unable to parse git-describe output: '%s'"
+                               % describe_out)
+            return pieces
+
+        # tag
+        full_tag = mo.group(1)
+        if not full_tag.startswith(tag_prefix):
+            if verbose:
+                fmt = "tag '%s' doesn't start with prefix '%s'"
+                print(fmt % (full_tag, tag_prefix))
+            pieces["error"] = ("tag '%s' doesn't start with prefix '%s'"
+                               % (full_tag, tag_prefix))
+            return pieces
+        pieces["closest-tag"] = full_tag[len(tag_prefix):]
+
+        # distance: number of commits since tag
+        pieces["distance"] = int(mo.group(2))
+
+        # commit: short hex revision ID
+        pieces["short"] = mo.group(3)
+
+    else:
+        # HEX: no tags. NOTE(review): count_out is None if rev-list fails
+        pieces["closest-tag"] = None
+        count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"],
+                                    cwd=root)
+        pieces["distance"] = int(count_out)  # total number of commits
+
+    # commit date: see ISO-8601 comment in git_versions_from_keywords()
+    date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"],
+                       cwd=root)[0].strip()
+    pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
+
+    return pieces
+
+
+def plus_or_dot(pieces):
+    """Return "." if the closest tag already has a "+", else return "+"."""
+    if "+" in (pieces.get("closest-tag") or ""):  # tag may be stored as None
+        return "."
+    return "+"
+
+
+def render_pep440(pieces):
+    """Render pieces as a PEP 440 version with a local version identifier.
+
+    Format: TAG[+DISTANCE.gHEX[.dirty]] . A tagged build that is then
+    dirtied renders as TAG+0.gHEX.dirty
+
+    Exceptions:
+    1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
+    """
+    tag = pieces["closest-tag"]
+    if not tag:
+        # exception #1
+        parts = ["0+untagged.%d.g%s" % (pieces["distance"], pieces["short"])]
+        if pieces["dirty"]:
+            parts.append(".dirty")
+        return "".join(parts)
+    if not (pieces["distance"] or pieces["dirty"]):
+        return tag  # clean build sitting exactly on the tag
+    parts = [tag, plus_or_dot(pieces),
+             "%d.g%s" % (pieces["distance"], pieces["short"])]
+    if pieces["dirty"]:
+        parts.append(".dirty")
+    return "".join(parts)
+
+
+def render_pep440_pre(pieces):
+    """Render as TAG[.post.devDISTANCE]; the dirty flag is not shown.
+
+    Exceptions:
+    1: no tags. 0.post.devDISTANCE
+    """
+    tag = pieces["closest-tag"]
+    if not tag:
+        # exception #1
+        return "0.post.dev%d" % pieces["distance"]
+    distance = pieces["distance"]
+    if not distance:
+        return tag
+    return tag + ".post.dev%d" % distance
+
+
+def render_pep440_post(pieces):
+    """Render as TAG[.postDISTANCE[.dev0]+gHEX] .
+
+    A dirty tree is marked with ".dev0". Note that .dev0 sorts backwards
+    (a dirty tree will appear "older" than the corresponding clean one),
+    but software should not be released from a dirty tree anyway.
+
+    Exceptions:
+    1: no tags. 0.postDISTANCE[.dev0]+gHEX
+    """
+    tag = pieces["closest-tag"]
+    if tag:
+        if not (pieces["distance"] or pieces["dirty"]):
+            return tag
+        parts = [tag, ".post%d" % pieces["distance"]]
+        if pieces["dirty"]:
+            parts.append(".dev0")
+        parts += [plus_or_dot(pieces), "g%s" % pieces["short"]]
+    else:
+        # exception #1
+        parts = ["0.post%d" % pieces["distance"]]
+        if pieces["dirty"]:
+            parts.append(".dev0")
+        parts.append("+g%s" % pieces["short"])
+    return "".join(parts)
+
+
+def render_pep440_old(pieces):
+    """Render as TAG[.postDISTANCE[.dev0]] .
+
+    A dirty tree is marked with ".dev0".
+
+    Exceptions:
+    1: no tags. 0.postDISTANCE[.dev0]
+    """
+    tag = pieces["closest-tag"]
+    if tag:
+        if not (pieces["distance"] or pieces["dirty"]):
+            return tag
+        prefix = tag
+    else:
+        # exception #1
+        prefix = "0"
+    rendered = "%s.post%d" % (prefix, pieces["distance"])
+    if pieces["dirty"]:
+        rendered += ".dev0"
+    return rendered
+
+
+def render_git_describe(pieces):
+    """Render as TAG[-DISTANCE-gHEX][-dirty].
+
+    Mirrors the output of 'git describe --tags --dirty --always'.
+
+    Exceptions:
+    1: no tags. HEX[-dirty]  (note: no 'g' prefix)
+    """
+    tag = pieces["closest-tag"]
+    if not tag:
+        # exception #1
+        rendered = pieces["short"]
+    elif pieces["distance"]:
+        rendered = tag + "-%d-g%s" % (pieces["distance"], pieces["short"])
+    else:
+        rendered = tag
+    suffix = "-dirty" if pieces["dirty"] else ""
+    return rendered + suffix
+
+
+def render_git_describe_long(pieces):
+    """Render as TAG-DISTANCE-gHEX[-dirty].
+
+    Mirrors 'git describe --tags --dirty --always --long'.
+    The distance/hash is unconditional.
+
+    Exceptions:
+    1: no tags. HEX[-dirty]  (note: no 'g' prefix)
+    """
+    tag = pieces["closest-tag"]
+    if tag:
+        head = "%s-%d-g%s" % (tag, pieces["distance"], pieces["short"])
+    else:
+        # exception #1
+        head = pieces["short"]
+    if pieces["dirty"]:
+        return head + "-dirty"
+    return head
+
+
+def render(pieces, style):
+    """Render the version pieces in the requested style as a version dict.
+
+    If pieces records an error, an "unknown" version dict is returned.
+    """
+    failure = pieces["error"]
+    if failure:
+        return {"version": "unknown", "full-revisionid": pieces.get("long"),
+                "dirty": None, "error": failure, "date": None}
+
+    if not style or style == "default":
+        style = "pep440"  # the default
+
+    renderers = (
+        ("pep440", render_pep440),
+        ("pep440-pre", render_pep440_pre),
+        ("pep440-post", render_pep440_post),
+        ("pep440-old", render_pep440_old),
+        ("git-describe", render_git_describe),
+        ("git-describe-long", render_git_describe_long),
+    )
+    for name, renderer in renderers:
+        if style == name:
+            break
+    else:
+        raise ValueError("unknown style '%s'" % style)
+
+    return {"version": renderer(pieces), "full-revisionid": pieces["long"],
+            "dirty": pieces["dirty"], "error": None,
+            "date": pieces.get("date")}
+
+
+def get_versions():
+    """Return the version dict, trying keywords, git, then the parent dir."""
+    # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
+    # __file__, we can work backwards from there to the root. Some
+    # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which
+    # case we can only use expanded keywords.
+
+    cfg = get_config()
+    verbose = cfg.verbose
+
+    try:
+        return git_versions_from_keywords(get_keywords(), cfg.tag_prefix,
+                                          verbose)
+    except NotThisMethod:
+        pass
+
+    try:
+        root = os.path.realpath(__file__)
+        # versionfile_source is the relative path from the top of the source
+        # tree (where the .git directory might live) to this file. Invert
+        # this to find the root from __file__.
+        for i in cfg.versionfile_source.split('/'):
+            root = os.path.dirname(root)
+    except NameError:
+        return {"version": "0+unknown", "full-revisionid": None,
+                "dirty": None,
+                "error": "unable to find root of source tree",
+                "date": None}
+
+    try:
+        pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose)
+        return render(pieces, cfg.style)
+    except NotThisMethod:
+        pass
+
+    try:
+        if cfg.parentdir_prefix:
+            return versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
+    except NotThisMethod:
+        pass
+
+    return {"version": "0+unknown", "full-revisionid": None,
+            "dirty": None,
+            "error": "unable to compute version", "date": None}
diff --git a/pygpu/basic.py b/pygpu/basic.py
new file mode 100644
index 0000000000..90ffd93b49
--- /dev/null
+++ b/pygpu/basic.py
@@ -0,0 +1,77 @@
+from string import Template
+from .gpuarray import GpuArray, GpuKernel, SIZE, dtype_to_ctype
+import numpy
+
+def _generate_kernel(ctx, cols, dtype, upper=True):
+    """Build the extract_tri GpuKernel for a flat buffer of `dtype` items.
+
+    The kernel splits each flat index idx into ix = idx / cols and
+    iy = idx % cols and stores 0 in a[idx] where ix > iy if `upper` is
+    true, or where ix < iy otherwise.
+    """
+    tmpl = Template("""
+    #include "cluda.h"
+    KERNEL void extract_tri(GLOBAL_MEM ${ctype} *a, ga_size a_off, ga_uint N) {
+        a = (GLOBAL_MEM ${ctype} *)(((GLOBAL_MEM char *)a) + a_off);
+        unsigned int idx = GID_1 * LDIM_0 * GDIM_0 +
+                           GID_0 * LDIM_0 + LID_0;
+        unsigned int ix = idx/${cols};
+        unsigned int iy = idx%${cols};
+        if (idx < N) {
+            if (ix ${le} iy)
+                a[idx] = 0.0;
+        }
+    }
+    """)
+    comparison = '>' if upper else '<'
+    src = tmpl.substitute(cols=cols, ctype=dtype_to_ctype(dtype),
+                          le=comparison)
+
+    # type-related flags handed to GpuKernel along with the source
+    uses_small = dtype.itemsize < 4
+    uses_double = dtype in [numpy.float64, numpy.complex128]
+    uses_complex = dtype in [numpy.complex64, numpy.complex128]
+
+    # kernel arguments: the array, its byte offset and the element bound N
+    arg_spec = [GpuArray, SIZE, 'uint32']
+    return GpuKernel(src, "extract_tri", arg_spec, context=ctx,
+                     have_double=uses_double, have_small=uses_small,
+                     have_complex=uses_complex)
+
+
+def triu(A, inplace=True):
+    """Zero the entries below the diagonal of the 2d array A and return it.
+
+    A copy is modified instead of A itself when `inplace` is false.
+    """
+    if A.ndim != 2:
+        raise ValueError("triu only works for 2d arrays")
+    if A.flags.c_contiguous is A.flags.f_contiguous is False:
+        raise ValueError("triu only works for contiguous arrays")
+    target = A if inplace else A.copy()
+    # an F-ordered buffer is walked column by column: flip the comparison
+    f_order = bool(target.flags['F_CONTIGUOUS'])
+    cols = target.shape[0] if f_order else target.shape[1]
+    size = target.shape[0] * target.shape[1]
+    kernel = _generate_kernel(target.context, cols, target.dtype, not f_order)
+    kernel(target, target.offset, size, n=size)
+    return target
+
+
+def tril(A, inplace=True):
+    """Zero the entries above the diagonal of the 2d array A and return it.
+
+    A copy is modified instead of A itself when `inplace` is false.
+    """
+    if A.ndim != 2:
+        raise ValueError("tril only works for 2d arrays")
+    if A.flags.c_contiguous is A.flags.f_contiguous is False:
+        raise ValueError("tril only works for contiguous arrays")
+    target = A if inplace else A.copy()
+    # an F-ordered buffer is walked column by column: flip the comparison
+    f_order = bool(target.flags['F_CONTIGUOUS'])
+    cols = target.shape[0] if f_order else target.shape[1]
+    size = target.shape[0] * target.shape[1]
+    kernel = _generate_kernel(target.context, cols, target.dtype, f_order)
+    kernel(target, target.offset, size, n=size)
+    return target
diff --git a/pygpu/blas.pyx b/pygpu/blas.pyx
index f83322d0a0..cd778f9a77 100644
--- a/pygpu/blas.pyx
+++ b/pygpu/blas.pyx
@@ -10,6 +10,7 @@ cdef extern from "gpuarray/buffer_blas.h":
         cb_conj_trans
 
 cdef extern from "gpuarray/blas.h":
+    int GpuArray_rdot(_GpuArray *X, _GpuArray *Y, _GpuArray *Z, int nocopy)
     int GpuArray_rgemv(cb_transpose transA, double alpha, _GpuArray *A,
                        _GpuArray *X, double beta, _GpuArray *Y, int nocopy)
     int GpuArray_rgemm(cb_transpose transA, cb_transpose transB,
@@ -17,6 +18,16 @@ cdef extern from "gpuarray/blas.h":
                        double beta, _GpuArray *C, int nocopy)
     int GpuArray_rger(double alpha, _GpuArray *X, _GpuArray *Y, _GpuArray *A,
                       int nocopy)
+    int GpuArray_rgemmBatch_3d(
+        cb_transpose transA, cb_transpose transB, double alpha,
+        _GpuArray *A, _GpuArray *B, double beta, _GpuArray *C, int nocopy)
+
+cdef api int pygpu_blas_rdot(GpuArray X, GpuArray Y, GpuArray Z, bint nocopy) except -1:
+    # forward to GpuArray_rdot; a failing status becomes GpuArrayException
+    cdef int status = GpuArray_rdot(&X.ga, &Y.ga, &Z.ga, nocopy)
+    if status != GA_NO_ERROR:
+        raise GpuArrayException(GpuArray_error(&X.ga, status), status)
+    return 0
 
 cdef api int pygpu_blas_rgemv(cb_transpose transA, double alpha, GpuArray A,
                               GpuArray X, double beta, GpuArray Y,
@@ -44,9 +55,34 @@ cdef api int pygpu_blas_rger(double alpha, GpuArray X, GpuArray Y, GpuArray A,
         raise GpuArrayException(GpuArray_error(&X.ga, err), err)
     return 0
 
+cdef api int pygpu_blas_rgemmBatch_3d(cb_transpose transA, cb_transpose transB,
+                                      double alpha, GpuArray A, GpuArray B,
+                                      double beta, GpuArray C, bint nocopy) except -1:
+    # forward to GpuArray_rgemmBatch_3d; a failing status is raised as a
+    # GpuArrayException carrying the message looked up through A
+    cdef int status = GpuArray_rgemmBatch_3d(
+        transA, transB, alpha, &A.ga, &B.ga, beta, &C.ga, nocopy)
+    if status != GA_NO_ERROR:
+        raise GpuArrayException(GpuArray_error(&A.ga, status), status)
+    return 0
+
+
+def dot(GpuArray X, GpuArray Y, GpuArray Z=None, overwrite_z=False):
+    """dot(X, Y, Z=None, overwrite_z=False)
+
+    Returns Z (a copy of it unless overwrite_z, a new 0-d array if None).
+    """
+    if Z is None:
+        Z = pygpu_empty(0, NULL, X.typecode, GA_ANY_ORDER, X.context, None)
+    elif not overwrite_z:
+        Z = pygpu_copy(Z, GA_ANY_ORDER)
+    pygpu_blas_rdot(X, Y, Z, 0)
+    return Z
 
 def gemv(double alpha, GpuArray A, GpuArray X, double beta=0.0,
          GpuArray Y=None, trans_a=False, overwrite_y=False):
+    """gemv(alpha, A, X, beta=0.0, Y=None, trans_a=False, overwrite_y=False)
+    """
     cdef cb_transpose transA
     cdef size_t Yshp
 
@@ -56,14 +92,14 @@ def gemv(double alpha, GpuArray A, GpuArray X, double beta=0.0,
         transA = cb_no_trans
 
     if A.ga.nd != 2:
-        raise TypeError, "A is not a matrix"
+        raise TypeError("A is not a matrix")
     if transA == cb_no_trans:
         Yshp = A.ga.dimensions[0]
     else:
         Yshp = A.ga.dimensions[1]
     if Y is None:
         if beta != 0.0:
-            raise ValueError, "Y not provided and beta != 0"
+            raise ValueError("Y not provided and beta != 0")
         Y = pygpu_empty(1, &Yshp, A.ga.typecode, GA_ANY_ORDER, A.context, None)
         overwrite_y = True
 
@@ -75,6 +111,8 @@ def gemv(double alpha, GpuArray A, GpuArray X, double beta=0.0,
 
 def gemm(double alpha, GpuArray A, GpuArray B, double beta, GpuArray C=None,
          trans_a=False, trans_b=False, overwrite_c=False):
+    """gemm(alpha, A, B, beta, C=None, trans_a=False, trans_b=False, overwrite_c=False)
+    """
     cdef cb_transpose transA
     cdef cb_transpose transB
     cdef size_t[2] Cshp
@@ -89,9 +127,9 @@ def gemm(double alpha, GpuArray A, GpuArray B, double beta, GpuArray C=None,
         transB = cb_no_trans
 
     if A.ga.nd != 2:
-        raise TypeError, "A is not a matrix"
+        raise TypeError("A is not a matrix")
     if B.ga.nd != 2:
-        raise TypeError, "B is not a matrix"
+        raise TypeError("B is not a matrix")
     if transA == cb_no_trans:
         Cshp[0] = A.ga.dimensions[0]
     else:
@@ -102,7 +140,7 @@ def gemm(double alpha, GpuArray A, GpuArray B, double beta, GpuArray C=None,
         Cshp[1] = B.ga.dimensions[0]
     if C is None:
         if beta != 0.0:
-            raise ValueError, "C not provided and beta != 0"
+            raise ValueError("C not provided and beta != 0")
         C = pygpu_empty(2, Cshp, A.ga.typecode, GA_ANY_ORDER, A.context, None)
         overwrite_c = True
 
@@ -114,6 +152,8 @@ def gemm(double alpha, GpuArray A, GpuArray B, double beta, GpuArray C=None,
 
 def ger(double alpha, GpuArray X, GpuArray Y, GpuArray A=None,
         overwrite_a=False):
+    """ger(alpha, X, Y, A=None, overwrite_a=False)
+    """
     cdef size_t[2] Ashp
 
     if A is None:
@@ -127,3 +167,45 @@ def ger(double alpha, GpuArray X, GpuArray Y, GpuArray A=None,
     pygpu_blas_rger(alpha, X, Y, A, 0)
 
     return A
+
+def gemmBatch_3d(double alpha, GpuArray A, GpuArray B,
+                 double beta, GpuArray C=None,
+                 trans_a=False, trans_b=False, overwrite_c=False):
+    """gemmBatch_3d(alpha, A, B, beta, C=None, trans_a=False, trans_b=False, overwrite_c=False)
+
+    Batched gemm over the 3d arrays A and B, with dimension 0 as the batch
+    axis. Returns C, a copy of C unless `overwrite_c` is true, or a newly
+    allocated array when C is None.
+
+    Raises TypeError if A or B is not 3d, and ValueError if C is None
+    while beta != 0.
+    """
+    cdef cb_transpose transA
+    cdef cb_transpose transB
+    cdef size_t[3] Cshp
+
+    transA = cb_trans if trans_a else cb_no_trans
+    transB = cb_trans if trans_b else cb_no_trans
+
+    if A.ga.nd != 3:
+        raise TypeError("A is not a batch of matrices")
+    if B.ga.nd != 3:
+        raise TypeError("B is not a batch of matrices")
+
+    # result shape: batch size, then one dimension each from A and B
+    Cshp[0] = A.ga.dimensions[0]
+    Cshp[1] = A.ga.dimensions[1 if transA == cb_no_trans else 2]
+    Cshp[2] = B.ga.dimensions[2 if transB == cb_no_trans else 1]
+
+    if C is not None:
+        if not overwrite_c:
+            # keep the caller's array intact by working on a copy
+            C = pygpu_copy(C, GA_ANY_ORDER)
+    elif beta != 0.0:
+        raise ValueError("C not provided and beta != 0")
+    else:
+        C = pygpu_empty(3, Cshp, A.ga.typecode, GA_ANY_ORDER, A.context, None)
+
+    pygpu_blas_rgemmBatch_3d(transA, transB, alpha, A, B, beta, C, 0)
+
+    return C
diff --git a/pygpu/collectives.pxd b/pygpu/collectives.pxd
index dd1f677259..44147febf3 100644
--- a/pygpu/collectives.pxd
+++ b/pygpu/collectives.pxd
@@ -4,7 +4,7 @@ cdef extern from "gpuarray/buffer_collectives.h":
     ctypedef struct gpucomm:
         pass
 
-    enum _gpucomm_reduce_ops:
+    enum gpucomm_reduce_ops:
         GA_SUM,
         GA_PROD,
         GA_MAX,
diff --git a/pygpu/collectives.pyx b/pygpu/collectives.pyx
index e75049a1c4..c2ac3984c2 100644
--- a/pygpu/collectives.pyx
+++ b/pygpu/collectives.pyx
@@ -16,14 +16,16 @@ from pygpu.gpuarray import GpuArrayException
 COMM_ID_BYTES = GA_COMM_ID_BYTES
 
 cdef class GpuCommCliqueId:
-    """Represents a unique id shared among :ref:`GpuComm` communicators which
+    """GpuCommCliqueId(context=None, comm_id=None)
+
+    Represents a unique id shared among :class:`GpuComm` communicators which
     participate in a multi-gpu clique.
 
     Parameters
     ----------
-    context: :ref:`GpuContext`, optional
-        Reference to which gpu this `GpuCommCliqueId` object belongs.
-    comm_id: bytes-like, optional
+    context: GpuContext
+        Reference to which gpu this GpuCommCliqueId object belongs.
+    comm_id: bytes
         Existing unique id to be passed in this object.
 
     """
@@ -36,41 +38,6 @@ cdef class GpuCommCliqueId:
         if comm_id is not None:
             self.comm_id = comm_id
 
-    def __getbuffer__(self, Py_buffer* buffer, int flags):
-        if buffer == NULL:
-            raise BufferError, "NULL buffer view in getbuffer"
-
-        buffer.buf = <char*>self.c_comm_id.internal
-        buffer.obj = self
-        buffer.len = GA_COMM_ID_BYTES * sizeof(char)
-        buffer.readonly = 0
-        buffer.itemsize = sizeof(char)
-        if flags & PyBUF_FORMAT == PyBUF_FORMAT:
-            buffer.format = 'b'
-        else:
-            buffer.format = NULL
-        buffer.ndim = 1
-        if flags & PyBUF_ND == PyBUF_ND:
-            buffer.shape = <Py_ssize_t*>calloc(1, sizeof(Py_ssize_t))
-            buffer.shape[0] = GA_COMM_ID_BYTES
-        else:
-            buffer.shape = NULL
-        if flags & PyBUF_STRIDES == PyBUF_STRIDES:
-            buffer.strides = &buffer.itemsize
-        else:
-            buffer.strides = NULL
-        buffer.suboffsets = NULL
-        buffer.internal = NULL
-        Py_INCREF(self)
-
-    def __releasebuffer__(self, Py_buffer* buffer):
-        if buffer == NULL:
-            raise BufferError, "NULL buffer view in releasebuffer"
-
-        if buffer.shape != NULL:
-            free(buffer.shape)
-        Py_DECREF(self)
-
     def __richcmp__(this, that, int op):
         if type(this) != type(that):
             raise TypeError, "Cannot compare %s with %s" % (type(this), type(that))
@@ -100,7 +67,7 @@ cdef class GpuCommCliqueId:
         raise RuntimeError, "Cannot pickle %s object" % self.__class__.__name__
 
     property comm_id:
-        "Unique clique id to be used by each :ref:`GpuComm` in a group of devices"
+        "Unique clique id to be used by each :class:`GpuComm` in a group of devices"
         def __get__(self):
             cdef bytearray res
             res = self.c_comm_id.internal[:GA_COMM_ID_BYTES]
@@ -115,19 +82,21 @@ cdef class GpuCommCliqueId:
 
 
 cdef class GpuComm:
-    """Represents a communicator which participates in a multi-gpu clique.
+    """GpuComm(cid, ndev, rank)
+
+    Represents a communicator which participates in a multi-gpu clique.
 
     It is used to invoke collective operations to gpus inside its clique.
 
     Parameters
     ----------
-    cid: :ref:`GpuCommCliqueId`
+    cid: GpuCommCliqueId
         Unique id shared among participating communicators.
     ndev: int
         Number of communicators inside the clique.
     rank: int
-        User-defined rank of this communicator inside the clique. It influences
-        order of collective operations.
+        User-defined rank of this communicator inside the clique. It
+        influences order of collective operations.
 
     """
     def __dealloc__(self):
@@ -156,26 +125,31 @@ cdef class GpuComm:
             comm_get_rank(self, &gpurank)
             return gpurank
 
-    def reduce(self, GpuArray src not None, op, GpuArray dest=None, int root=-1):
-        """Reduce collective operation for ranks in a communicator world.
+    def reduce(self, GpuArray src not None, op, GpuArray dest=None,
+               int root=-1):
+        """
+        reduce(self, src, op, dest=None, root=-1)
+
+        Reduce collective operation for ranks in a communicator world.
 
         Parameters
         ----------
-        src: :ref:`GpuArray`
+        src: GpuArray
             Array to be reduced.
-        op: string
+        op: str
             Key indicating operation type.
-        dest: :ref:`GpuArray`, optional
-            Array to collecti reduce operation result.
+        dest: GpuArray
+            Array to collect reduce operation result.
         root: int
-            Rank in `GpuComm` which will collect result.
+            Rank in GpuComm which will collect result.
 
         Notes
         -----
-        * `root` is necessary when invoking from a non-root rank. Root caller
-        does not need to provide `root` argument.
-        * Not providing `dest` argument for a root caller will result in creating
-        a new compatible :ref:`GpuArray` and returning result in it.
+        * `root` is necessary when invoking from a non-root rank. Root
+          caller does not need to provide `root` argument.
+        * Not providing `dest` argument for a root caller will result
+          in creating a new compatible :class:`GpuArray` and returning
+          result in it.
 
         """
         cdef int srank
@@ -193,21 +167,24 @@ cdef class GpuComm:
         comm_reduce(self, src, dest, to_reduce_opcode(op), root)
 
     def all_reduce(self, GpuArray src not None, op, GpuArray dest=None):
-        """AllReduce collective operation for ranks in a communicator world.
+        """
+        all_reduce(self, src, op, dest=None)
+
+        AllReduce collective operation for ranks in a communicator world.
 
         Parameters
         ----------
-        src: :ref:`GpuArray`
+        src: GpuArray
             Array to be reduced.
-        op: string
+        op: str
             Key indicating operation type.
-        dest: :ref:`GpuArray`, optional
+        dest: GpuArray
             Array to collect reduce operation result.
 
         Notes
         -----
         * Not providing `dest` argument for a caller will result in creating
-        a new compatible :ref:`GpuArray` and returning result in it.
+          a new compatible :class:`GpuArray` and returning result in it.
 
         """
         if dest is None:
@@ -215,21 +192,24 @@ cdef class GpuComm:
         comm_all_reduce(self, src, dest, to_reduce_opcode(op))
 
     def reduce_scatter(self, GpuArray src not None, op, GpuArray dest=None):
-        """ReduceScatter collective operation for ranks in a communicator world.
+        """
+        reduce_scatter(self, src, op, dest=None)
+
+        ReduceScatter collective operation for ranks in a communicator world.
 
         Parameters
         ----------
-        src: :ref:`GpuArray`
+        src: GpuArray
             Array to be reduced.
-        op: string
+        op: str
             Key indicating operation type.
-        dest: :ref:`GpuArray`, optional
+        dest: GpuArray
             Array to collect reduce operation scattered result.
 
         Notes
         -----
         * Not providing `dest` argument for a caller will result in creating
-        a new compatible :ref:`GpuArray` and returning result in it.
+          a new compatible :class:`GpuArray` and returning result in it.
 
         """
         if dest is None:
@@ -237,11 +217,14 @@ cdef class GpuComm:
         comm_reduce_scatter(self, src, dest, to_reduce_opcode(op))
 
     def broadcast(self, GpuArray array not None, int root=-1):
-        """Broadcast collective operation for ranks in a communicator world.
+        """
+        broadcast(self, array, root=-1)
+
+        Broadcast collective operation for ranks in a communicator world.
 
         Parameters
         ----------
-        array: :ref:`GpuArray`
+        array: GpuArray
             Array to be reduced.
         root: int
             Rank in `GpuComm` which broadcasts its `array`.
@@ -249,7 +232,7 @@ cdef class GpuComm:
         Notes
         -----
         * `root` is necessary when invoking from a non-root rank. Root caller
-        does not need to provide `root` argument.
+          does not need to provide `root` argument.
 
         """
         if root == -1:
@@ -258,23 +241,27 @@ cdef class GpuComm:
 
     def all_gather(self, GpuArray src not None, GpuArray dest=None,
                    unsigned int nd_up=1):
-        """AllGather collective operation for ranks in a communicator world.
+        """
+        all_gather(self, src, dest=None, nd_up=1)
+
+        AllGather collective operation for ranks in a communicator world.
 
         Parameters
         ----------
-        src: :ref:`GpuArray`
+        src: GpuArray
             Array to be gathered.
-        dest: :ref:`GpuArray`, optional
+        dest: GpuArray
             Array to receive all gathered arrays from ranks in `GpuComm`.
-        nd_up: unsigned int
-            Used when creating result array. Indicates how many extra dimensions
-            user wants result to have. Default is 1, which means that the result
-            will store each rank's gathered array in one extra new dimension.
+        nd_up: int
+            Used when creating result array. Indicates how many extra
+            dimensions user wants result to have. Default is 1, which
+            means that the result will store each rank's gathered
+            array in one extra new dimension.
 
         Notes
         -----
         * Providing `nd_up` == 0 means that gathered arrays will be appended to
-        the dimension with the largest stride.
+          the dimension with the largest stride.
 
         """
         if dest is None:
@@ -289,6 +276,7 @@ cdef dict TO_RED_OP = {
     '*': GA_PROD,
     "prod": GA_PROD,
     "product": GA_PROD,
+    "mul": GA_PROD,
     "max": GA_MAX,
     "maximum": GA_MAX,
     "min": GA_MIN,
diff --git a/pygpu/dtypes.py b/pygpu/dtypes.py
index 1acfdad959..cc4d6b3402 100644
--- a/pygpu/dtypes.py
+++ b/pygpu/dtypes.py
@@ -39,16 +39,19 @@ def register_dtype(dtype, c_names):
     """
     Associate a numpy dtype with its C equivalents.
 
-    :param dtype: type to associate
-    :type dtype: numpy.dtype or string
-    :param c_names: list of C type names
-    :type c_names: str or list
-
     Will register `dtype` for use with the gpuarray module.  If the
     c_names argument is a list then the first element of that list is
     taken as the primary association and will be used for generated C
     code.  The other types will be mapped to the provided dtype when
     going in the other direction.
+
+    Parameters
+    ----------
+    dtype: numpy.dtype or string
+        type to associate
+    c_names: str or list
+        list of C type names
+
     """
     if isinstance(c_names, str):
         c_names = [c_names]
@@ -67,41 +70,24 @@ def register_dtype(dtype, c_names):
         NAME_TO_DTYPE[nm] = dtype
 
 
-def _fill_dtype_registry(respect_windows):
-    from sys import platform
-
+def _fill_dtype_registry():
     register_dtype(np.bool, ["ga_bool", "bool"])
     register_dtype(np.int8, ["ga_byte", "char", "signed char"])
     register_dtype(np.uint8, ["ga_ubyte", "unsigned char"])
-    register_dtype(np.int16, ["ga_short", "short", "signed short", "signed short int", "short signed int"])
-    register_dtype(np.uint16, ["ga_ushort", "unsigned short", "unsigned short int", "short unsigned int"])
+    register_dtype(np.int16, ["ga_short", "short", "signed short",
+                              "signed short int", "short signed int"])
+    register_dtype(np.uint16, ["ga_ushort", "unsigned short",
+                               "unsigned short int", "short unsigned int"])
     register_dtype(np.int32, ["ga_int", "int", "signed int"])
     register_dtype(np.uint32, ["ga_uint", "unsigned", "unsigned int"])
 
-    register_dtype(np.int64, ["ga_long"])
-    register_dtype(np.uint64, ["ga_ulong"])
-    is_64_bit = tuple.__itemsize__ * 8 == 64
-    if is_64_bit:
-        if 'win32' in platform and respect_windows:
-            i64_name = "long long"
-        else:
-            i64_name = "long"
-        register_dtype(np.int64, [i64_name, "%s int" % i64_name,
-                                  "signed %s int" % i64_name,
-                                  "%s signed int" % i64_name])
-        register_dtype(np.uint64, ["unsigned %s" % i64_name,
-                                   "unsigned %s int" % i64_name,
-                                   "%s unsigned int" % i64_name])
-
-    # According to this uintp may not have the same hash as uint32:
-    # http://projects.scipy.org/numpy/ticket/2017
-    # Failing tests tell me this is the case for intp too.
-    if is_64_bit:
-        register_dtype(np.intp, ["ga_long"])
-        register_dtype(np.uintp, ["ga_ulong"])
-    else:
-        register_dtype(np.intp, ["ga_int"])
-        register_dtype(np.uintp, ["ga_uint"])
+    register_dtype(np.int64, ["ga_long", "long int", "signed long int",
+                              "long signed int"])
+    register_dtype(np.uint64, ["ga_ulong", "unsigned long",
+                               "unsigned long int", "long unsigned int"])
+
+    register_dtype(np.intp, ["ga_ssize", "ssize_t"])
+    register_dtype(np.uintp, ["ga_size", "size_t"])
 
     register_dtype(np.float32, ["ga_float", "float"])
     register_dtype(np.float64, ["ga_double", "double"])
@@ -111,21 +97,19 @@ def _fill_dtype_registry(respect_windows):
 # {{{ dtype -> ctype
 
 
-def dtype_to_ctype(dtype, with_fp_tex_hack=False):
+def dtype_to_ctype(dtype):
     """
     Return the C type that corresponds to `dtype`.
 
-    :param dtype: a numpy dtype
+    Parameters
+    ----------
+    dtype: data type
+        a numpy dtype
     """
     if dtype is None:
         raise ValueError("dtype may not be None")
 
     dtype = np.dtype(dtype)
-    if with_fp_tex_hack:
-        if dtype == np.float32:
-            return "fp_tex_float"
-        elif dtype == np.float64:
-            return "fp_tex_double"
 
     return gpuarray.dtype_to_ctype(dtype)
 
diff --git a/pygpu/elemwise.py b/pygpu/elemwise.py
index 906c55b33d..ab6fd55d94 100644
--- a/pygpu/elemwise.py
+++ b/pygpu/elemwise.py
@@ -4,7 +4,8 @@
 from . import gpuarray
 from ._elemwise import GpuElemwise, arg
 
-__all__ = ['GpuElemwise', 'elemwise1', 'elemwise2', 'ielemwise2', 'compare']
+__all__ = ['GpuElemwise', 'arg', 'as_argument',
+           'elemwise1', 'elemwise2', 'ielemwise2', 'compare']
 
 
 def _dtype(o):
@@ -14,7 +15,7 @@ def _dtype(o):
 
 
 def as_argument(o, name, read=False, write=False):
-    if not read and not write:
+    if (not read) and (not write):
         raise ValueError('argument is neither read not write')
     return arg(name, _dtype(o), scalar=not isinstance(o, gpuarray.GpuArray),
                read=read, write=write)
diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd
index 1fcc5a0068..136ef52940 100644
--- a/pygpu/gpuarray.pxd
+++ b/pygpu/gpuarray.pxd
@@ -16,11 +16,15 @@ cdef extern from "numpy/arrayobject.h":
 cdef object PyArray_Empty(int a, np.npy_intp *b, np.dtype c, int d)
 
 cdef extern from "Python.h":
-    int PySlice_GetIndicesEx(slice_object slice, Py_ssize_t length,
+    int PySlice_GetIndicesEx(object slice, Py_ssize_t length,
                              Py_ssize_t *start, Py_ssize_t *stop,
                              Py_ssize_t *step,
                              Py_ssize_t *slicelength) except -1
 
+cdef extern from "gpuarray/config.h":
+    int GPUARRAY_API_VERSION
+    int GPUARRAY_ABI_VERSION
+
 cdef extern from "gpuarray/types.h":
     ctypedef struct gpuarray_type:
         const char *cluda_name
@@ -49,8 +53,6 @@ cdef extern from "gpuarray/types.h":
         GA_NBASE
 
 cdef extern from "gpuarray/util.h":
-    const int gpuarray_api_major
-    const int gpuarray_api_minor
     int gpuarray_register_type(gpuarray_type *t, int *ret)
     size_t gpuarray_get_elsize(int typecode)
     gpuarray_type *gpuarray_get_type(int typecode)
@@ -63,6 +65,8 @@ cdef extern from "gpuarray/error.h":
         GA_UNALIGNED_ERROR, GA_COPY_ERROR, GA_COMM_ERROR
 
 cdef extern from "gpuarray/buffer.h":
+    ctypedef struct gpucontext_props:
+        pass
     ctypedef struct gpucontext:
         pass
     ctypedef struct gpudata:
@@ -72,25 +76,33 @@ cdef extern from "gpuarray/buffer.h":
 
     int gpu_get_platform_count(const char* name, unsigned int* platcount)
     int gpu_get_device_count(const char* name, unsigned int platform, unsigned int* devcount)
-    gpucontext *gpucontext_init(const char *name, int devno, int flags, int *ret)
+
+    int gpucontext_props_new(gpucontext_props **res)
+    int gpucontext_props_cuda_dev(gpucontext_props *p, int devno)
+    int gpucontext_props_opencl_dev(gpucontext_props *p, int platno, int devno)
+    int gpucontext_props_sched(gpucontext_props *p, int sched)
+    int gpucontext_props_set_single_stream(gpucontext_props *p)
+    int gpucontext_props_kernel_cache(gpucontext_props *p, const char *path)
+    int gpucontext_props_alloc_cache(gpucontext_props *p, size_t initial, size_t max)
+    void gpucontext_props_del(gpucontext_props *p)
+
+    int gpucontext_init(gpucontext **res, const char *name, gpucontext_props *p)
     void gpucontext_deref(gpucontext *ctx)
     char *gpucontext_error(gpucontext *ctx, int err)
+    int gpudata_property(gpudata *ctx, int prop_id, void *res)
     int gpucontext_property(gpucontext *ctx, int prop_id, void *res)
     int gpukernel_property(gpukernel *k, int prop_id, void *res)
     gpucontext *gpudata_context(gpudata *)
     gpucontext *gpukernel_context(gpukernel *)
 
-    int GA_CTX_DEFAULT
-    int GA_CTX_MULTI_THREAD
-    int GA_CTX_SINGLE_THREAD
-    int GA_CTX_SINGLE_STREAM
-    int GA_CTX_DISABLE_ALLOCATION_CACHE
+    int GA_CTX_SCHED_AUTO
+    int GA_CTX_SCHED_SINGLE
+    int GA_CTX_SCHED_MULTI
 
     int GA_CTX_PROP_DEVNAME
-    int GA_CTX_PROP_MAXLSIZE
+    int GA_CTX_PROP_UNIQUE_ID
     int GA_CTX_PROP_LMEMSIZE
     int GA_CTX_PROP_NUMPROCS
-    int GA_CTX_PROP_MAXGSIZE
     int GA_CTX_PROP_BIN_ID
     int GA_CTX_PROP_TOTAL_GMEM
     int GA_CTX_PROP_FREE_GMEM
@@ -100,14 +112,18 @@ cdef extern from "gpuarray/buffer.h":
     int GA_CTX_PROP_MAXGSIZE0
     int GA_CTX_PROP_MAXGSIZE1
     int GA_CTX_PROP_MAXGSIZE2
+    int GA_CTX_PROP_LARGEST_MEMBLOCK
+
+    int GA_BUFFER_PROP_SIZE
+
     int GA_KERNEL_PROP_MAXLSIZE
     int GA_KERNEL_PROP_PREFLSIZE
     int GA_KERNEL_PROP_NUMARGS
     int GA_KERNEL_PROP_TYPES
 
     cdef enum ga_usefl:
-        GA_USE_CLUDA, GA_USE_SMALL, GA_USE_DOUBLE, GA_USE_COMPLEX, GA_USE_HALF,
-        GA_USE_BINARY, GA_USE_CUDA, GA_USE_OPENCL
+        GA_USE_SMALL, GA_USE_DOUBLE, GA_USE_COMPLEX, GA_USE_HALF,
+        GA_USE_CUDA, GA_USE_OPENCL
 
 cdef extern from "gpuarray/kernel.h":
     ctypedef struct _GpuKernel "GpuKernel":
@@ -119,11 +135,10 @@ cdef extern from "gpuarray/kernel.h":
                        unsigned int argcount, const int *types, int flags, char **err_str)
     void GpuKernel_clear(_GpuKernel *k)
     gpucontext *GpuKernel_context(_GpuKernel *k)
-    int GpuKernel_sched(_GpuKernel *k, size_t n, size_t *ls, size_t *gs)
+    int GpuKernel_sched(_GpuKernel *k, size_t n, size_t *gs, size_t *ls)
     int GpuKernel_call(_GpuKernel *k, unsigned int n,
-                       const size_t *ls, const size_t *gs,
+                       const size_t *gs, const size_t *ls,
                        size_t shared, void **args)
-    int GpuKernel_binary(_GpuKernel *, size_t *, void **)
 
 cdef extern from "gpuarray/array.h":
     ctypedef struct _GpuArray "GpuArray":
@@ -150,16 +165,13 @@ cdef extern from "gpuarray/array.h":
     ctypedef enum ga_order:
         GA_ANY_ORDER, GA_C_ORDER, GA_F_ORDER
 
+    void GpuArray_fix_flags(_GpuArray *a)
     int GpuArray_empty(_GpuArray *a, gpucontext *ctx,
                        int typecode, int nd, const size_t *dims, ga_order ord)
     int GpuArray_fromdata(_GpuArray *a,
                           gpudata *data, size_t offset, int typecode,
                           unsigned int nd, const size_t *dims,
                           const ssize_t *strides, int writable)
-    int GpuArray_copy_from_host(_GpuArray *a,
-                            gpucontext *ctx, void *buf, int typecode,
-                            unsigned int nd, const size_t *dims,
-                            const ssize_t *strides) nogil
     int GpuArray_view(_GpuArray *v, _GpuArray *a)
     int GpuArray_sync(_GpuArray *a) nogil
     int GpuArray_index(_GpuArray *r, _GpuArray *a, const ssize_t *starts,
@@ -168,6 +180,8 @@ cdef extern from "gpuarray/array.h":
     int GpuArray_setarray(_GpuArray *v, _GpuArray *a)
     int GpuArray_reshape(_GpuArray *res, _GpuArray *a, unsigned int nd,
                          const size_t *newdims, ga_order ord, int nocopy)
+    int GpuArray_reshape_inplace(_GpuArray *a, unsigned int nd,
+                                 const size_t *newdims, ga_order ord)
     int GpuArray_transpose(_GpuArray *res, _GpuArray *a,
                            const unsigned int *new_axes)
 
@@ -196,6 +210,9 @@ cdef extern from "gpuarray/array.h":
 
 cdef extern from "gpuarray/extension.h":
     void *gpuarray_get_extension(const char *)
+    ctypedef struct GpuArrayIpcMemHandle:
+        pass
+
     cdef int GPUARRAY_CUDA_CTX_NOFREE
 
 cdef type get_exc(int errcode)
@@ -220,10 +237,6 @@ cdef int array_fromdata(GpuArray a,
                         gpudata *data, size_t offset, int typecode,
                         unsigned int nd, const size_t *dims,
                         const ssize_t *strides, int writeable) except -1
-cdef int array_copy_from_host(GpuArray a,
-                              gpucontext *ctx, void *buf, int typecode,
-                              unsigned int nd, const size_t *dims,
-                              const ssize_t *strides) except -1
 cdef int array_view(GpuArray v, GpuArray a) except -1
 cdef int array_sync(GpuArray a) except -1
 cdef int array_index(GpuArray r, GpuArray a, const ssize_t *starts,
@@ -253,11 +266,10 @@ cdef int kernel_init(GpuKernel k, gpucontext *ctx,
                      int flags) except -1
 cdef int kernel_clear(GpuKernel k) except -1
 cdef gpucontext *kernel_context(GpuKernel k) except NULL
-cdef int kernel_sched(GpuKernel k, size_t n, size_t *ls, size_t *gs) except -1
+cdef int kernel_sched(GpuKernel k, size_t n, size_t *gs, size_t *ls) except -1
 cdef int kernel_call(GpuKernel k, unsigned int n,
-                     const size_t *ls, const size_t *gs,
+                     const size_t *gs, const size_t *ls,
                      size_t shared, void **args) except -1
-cdef int kernel_binary(GpuKernel k, size_t *, void **) except -1
 cdef int kernel_property(GpuKernel k, int prop_id, void *res) except -1
 
 cdef int ctx_property(GpuContext c, int prop_id, void *res) except -1
@@ -267,7 +279,7 @@ cdef api GpuContext pygpu_default_context()
 
 cdef api bint pygpu_GpuArray_Check(object o)
 
-cdef api GpuContext pygpu_init(object dev, int flags)
+cdef api GpuContext pygpu_init(object dev, gpucontext_props *p)
 
 cdef api GpuArray pygpu_zeros(unsigned int nd, const size_t *dims,
                               int typecode, ga_order order,
@@ -275,11 +287,6 @@ cdef api GpuArray pygpu_zeros(unsigned int nd, const size_t *dims,
 cdef api GpuArray pygpu_empty(unsigned int nd, const size_t *dims,
                               int typecode, ga_order order,
                               GpuContext context, object cls)
-cdef api GpuArray pygpu_fromhostdata(void *buf, int typecode, unsigned int nd,
-                                     const size_t *dims,
-                                     const ssize_t *strides,
-                                     GpuContext context, object cls)
-
 cdef api GpuArray pygpu_fromgpudata(gpudata *buf, size_t offset, int typecode,
                                     unsigned int nd, const size_t *dims,
                                     const ssize_t *strides, GpuContext context,
@@ -296,6 +303,7 @@ cdef api int pygpu_sync(GpuArray a) except -1
 cdef api GpuArray pygpu_empty_like(GpuArray a, ga_order ord, int typecode)
 
 cdef api np.ndarray pygpu_as_ndarray(GpuArray a)
+cdef np.ndarray _pygpu_as_ndarray(GpuArray a, np.dtype ldtype)
 
 cdef api GpuArray pygpu_index(GpuArray a, const ssize_t *starts,
                               const ssize_t *stops, const ssize_t *steps)
@@ -311,8 +319,10 @@ cdef api GpuArray pygpu_concatenate(const _GpuArray **a, size_t n,
                                     object cls, GpuContext context)
 
 cdef api class GpuContext [type PyGpuContextType, object PyGpuContextObject]:
+    cdef dict __dict__
     cdef gpucontext* ctx
     cdef readonly bytes kind
+    cdef object __weakref__
 
 cdef GpuArray new_GpuArray(object cls, GpuContext ctx, object base)
 
@@ -324,6 +334,7 @@ cdef api class GpuArray [type PyGpuArrayType, object PyGpuArrayObject]:
 
     cdef __index_helper(self, key, unsigned int i, ssize_t *start,
                         ssize_t *stop, ssize_t *step)
+    cdef __cgetitem__(self, idx)
 
 cdef api class GpuKernel [type PyGpuKernelType, object PyGpuKernelObject]:
     cdef _GpuKernel k
@@ -331,5 +342,5 @@ cdef api class GpuKernel [type PyGpuKernelType, object PyGpuKernelObject]:
     cdef void **callbuf
     cdef object __weakref__
 
-    cdef do_call(self, py_n, py_ls, py_gs, py_args, size_t shared)
+    cdef do_call(self, py_n, py_gs, py_ls, py_args, size_t shared)
     cdef _setarg(self, unsigned int index, int typecode, object o)
diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx
index 42f6f6d3b5..46c5d3ea01 100644
--- a/pygpu/gpuarray.pyx
+++ b/pygpu/gpuarray.pyx
@@ -6,12 +6,23 @@ from libc.string cimport strncmp
 cimport numpy as np
 import numpy as np
 
+import sys
+
 from cpython cimport Py_INCREF, PyNumber_Index
 from cpython.object cimport Py_EQ, Py_NE
 
 def api_version():
-    # major, minor, py
-    return (gpuarray_api_major, gpuarray_api_minor, 0)
+    """api_version()
+    """
+    # (library version, module version)
+    return (GPUARRAY_API_VERSION, 0)
+
+def abi_version():
+    """abi_version() -> (major, minor) of the libgpuarray ABI version."""
+    # `//` keeps the major number an int whatever the division semantics.
+    major_version = GPUARRAY_ABI_VERSION // 1000
+    minor_version = GPUARRAY_ABI_VERSION % 1000
+    return (major_version, minor_version)
 
 np.import_array()
 
@@ -31,6 +42,15 @@ cdef bytes _s(s):
         return s
     raise TypeError("Expected a string")
 
+cdef size_t countis(l, object val):
+    # Count the items of `l` that are the same object as `val` (`is`, not `==`).
+    cdef size_t hits = 0
+    cdef size_t pos
+    for pos in range(len(l)):
+        if l[pos] is val:
+            hits += 1
+    return hits
+
 def cl_wrap_ctx(size_t ptr):
     """
     cl_wrap_ctx(ptr)
@@ -56,10 +76,10 @@ def cuda_wrap_ctx(size_t ptr, bint own):
     Wrap an existing CUDA driver context (CUcontext) into a GpuContext
     class.
 
-    If `own` is true, libgpuarray is now reponsible for the context and
+    If `own` is true, libgpuarray is now responsible for the context and
     it will be destroyed once there are no references to it.
     Otherwise, the context will not be destroyed and it is the calling
-    code's reponsability.
+    code's responsibility.
     """
     cdef gpucontext *(*cuda_make_ctx)(void *, int)
     cdef int flags
@@ -106,11 +126,13 @@ def register_dtype(np.dtype dtype, cname):
     This function return the associted internal typecode for the new
     type.
 
-    :param dtype: new type
-    :type dtype: numpy.dtype
-    :param cname: C name for the type declarations
-    :type cname: string
-    :rtype: int
+    Parameters
+    ----------
+    dtype: numpy.dtype
+        new type
+    cname: str
+        C name for the type declarations
+
     """
     cdef gpuarray_type *t
     cdef int typecode
@@ -168,9 +190,11 @@ cpdef int dtype_to_typecode(dtype) except -1:
 
     Get the internal typecode for a type.
 
-    :param dtype: type to get the code for
-    :type dtype: numpy.dtype
-    :rtype: int
+    Parameters
+    ----------
+    dtype: numpy.dtype
+        type to get the code for
+
     """
     if isinstance(dtype, int):
         return dtype
@@ -190,9 +214,11 @@ def dtype_to_ctype(dtype):
 
     Return the C name for a type.
 
-    :param dtype: type to get the name for
-    :type dtype: numpy.dtype
-    :rtype: string
+    Parameters
+    ----------
+    dtype: numpy.dtype
+        type to get the name for
+
     """
     cdef int typecode = dtype_to_typecode(dtype)
     cdef const gpuarray_type *t = gpuarray_get_type(typecode)
@@ -212,6 +238,35 @@ cdef ga_order to_ga_order(ord) except <ga_order>-2:
     else:
         raise ValueError, "Valid orders are: 'A' (any), 'C' (C), 'F' (Fortran)"
 
+cdef int strides_ok(GpuArray a, strides):
+    # Return 1 when `strides` (byte strides, one per dimension of `a`,
+    # length assumed correct) never address memory outside the buffer
+    # of `a`; return 0 otherwise or if the buffer size is unavailable.
+    cdef ssize_t max_axis_offset
+    cdef size_t lower = a.ga.offset
+    cdef size_t upper = a.ga.offset
+    cdef size_t itemsize = gpuarray_get_elsize(a.ga.typecode)
+    cdef size_t size
+    cdef unsigned int i
+
+    if gpudata_property(a.ga.data, GA_BUFFER_PROP_SIZE, &size) != GA_NO_ERROR:
+        return 0
+    for i in range(a.ga.nd):
+        if a.ga.dimensions[i] == 0:
+            return 1
+
+        max_axis_offset = <ssize_t>(strides[i]) * <ssize_t>(a.ga.dimensions[i] - 1)
+        if max_axis_offset > 0:
+            if upper + max_axis_offset > size:
+                return 0
+            upper += max_axis_offset
+        else:
+            if lower < <size_t>(-max_axis_offset):
+                return 0
+            lower += max_axis_offset
+    return (upper + itemsize) <= size
+
+
 class GpuArrayException(Exception):
     """
     Exception used for most errors related to libgpuarray.
@@ -234,6 +289,9 @@ cdef bint py_CHKFLAGS(GpuArray a, int flags):
 cdef bint py_ISONESEGMENT(GpuArray a):
     return GpuArray_ISONESEGMENT(&a.ga)
 
+cdef void array_fix_flags(GpuArray a):  # forwards a's C struct to GpuArray_fix_flags
+    GpuArray_fix_flags(&a.ga)
+
 cdef int array_empty(GpuArray a, gpucontext *ctx,
                      int typecode, unsigned int nd, const size_t *dims,
                      ga_order ord) except -1:
@@ -252,17 +310,6 @@ cdef int array_fromdata(GpuArray a,
     if err != GA_NO_ERROR:
         raise get_exc(err), gpucontext_error(gpudata_context(data), err)
 
-cdef int array_copy_from_host(GpuArray a,
-                              gpucontext *ctx, void *buf, int typecode,
-                              unsigned int nd, const size_t *dims,
-                              const ssize_t *strides) except -1:
-    cdef int err
-    with nogil:
-        err = GpuArray_copy_from_host(&a.ga, ctx, buf, typecode, nd, dims,
-                                      strides);
-    if err != GA_NO_ERROR:
-        raise get_exc(err), gpucontext_error(ctx, err)
-
 cdef int array_view(GpuArray v, GpuArray a) except -1:
     cdef int err
     err = GpuArray_view(&v.ga, &a.ga)
@@ -289,7 +336,7 @@ cdef int array_take1(GpuArray r, GpuArray a, GpuArray i,
     err = GpuArray_take1(&r.ga, &a.ga, &i.ga, check_err)
     if err != GA_NO_ERROR:
         if err == GA_VALUE_ERROR:
-            raise IndexError, "Index out of bounds"
+            raise IndexError, GpuArray_error(&r.ga, err)
         raise get_exc(err), GpuArray_error(&r.ga, err)
 
 cdef int array_setarray(GpuArray v, GpuArray a) except -1:
@@ -409,22 +456,16 @@ cdef gpucontext *kernel_context(GpuKernel k) except NULL:
         raise GpuArrayException, "Invalid kernel or destroyed context"
     return res
 
-cdef int kernel_sched(GpuKernel k, size_t n, size_t *ls, size_t *gs) except -1:
-    cdef int err
-    err = GpuKernel_sched(&k.k, n, ls, gs)
-    if err != GA_NO_ERROR:
-        raise get_exc(err), kernel_error(k, err)
-
-cdef int kernel_call(GpuKernel k, unsigned int n, const size_t *ls,
-                     const size_t *gs, size_t shared, void **args) except -1:
+cdef int kernel_sched(GpuKernel k, size_t n, size_t *gs, size_t *ls) except -1:
     cdef int err
-    err = GpuKernel_call(&k.k, n, ls, gs, shared, args)
+    err = GpuKernel_sched(&k.k, n, gs, ls)
     if err != GA_NO_ERROR:
         raise get_exc(err), kernel_error(k, err)
 
-cdef int kernel_binary(GpuKernel k, size_t *sz, void **bin) except -1:
+cdef int kernel_call(GpuKernel k, unsigned int n, const size_t *gs,
+                     const size_t *ls, size_t shared, void **args) except -1:
     cdef int err
-    err = GpuKernel_binary(&k.k, sz, bin)
+    err = GpuKernel_call(&k.k, n, gs, ls, shared, args)
     if err != GA_NO_ERROR:
         raise get_exc(err), kernel_error(k, err)
 
@@ -451,10 +492,6 @@ def set_default_context(GpuContext ctx):
 
     Set the default context for the module.
 
-    :param ctx: default context
-    :type ctx: GpuContext
-    :rtype: None
-
     The provided context will be used as a default value for all the
     other functions in this module which take a context as parameter.
     Call with `None` to clear the default value.
@@ -465,6 +502,12 @@ def set_default_context(GpuContext ctx):
     This can be helpful to reduce clutter when working with only one
     context. It is strongly discouraged to use this function when
     working with multiple contexts at once.
+
+    Parameters
+    ----------
+    ctx: GpuContext
+        default context
+
     """
     global default_context
     default_context = ctx
@@ -489,7 +532,10 @@ cdef bint pygpu_GpuArray_Check(object o):
     return isinstance(o, GpuArray)
 
 def count_platforms(kind):
-    """Return number of host's platforms compatible with `kind`.
+    """
+    count_platforms(kind)
+
+    Return number of host's platforms compatible with `kind`.
     """
     cdef unsigned int platcount
     cdef int err
@@ -499,7 +545,10 @@ def count_platforms(kind):
     return platcount
 
 def count_devices(kind, unsigned int platform):
-    """Returns number of devices in host's `platform` compatible with `kind`.
+    """
+    count_devices(kind, platform)
+
+    Returns number of devices in host's `platform` compatible with `kind`.
     """
     cdef unsigned int devcount
     cdef int err
@@ -508,13 +557,17 @@ def count_devices(kind, unsigned int platform):
         raise get_exc(err), gpucontext_error(NULL, err)
     return devcount
 
-cdef GpuContext pygpu_init(dev, int flags):
+cdef GpuContext pygpu_init(dev, gpucontext_props *p):
+    cdef int err
+    cdef GpuContext res
+
     if dev.startswith('cuda'):
         kind = b"cuda"
         if dev[4:] == '':
             devnum = -1
         else:
             devnum = int(dev[4:])
+        gpucontext_props_cuda_dev(p, devnum)
     elif dev.startswith('opencl'):
         kind = b"opencl"
         devspec = dev[6:].split(':')
@@ -523,27 +576,25 @@ cdef GpuContext pygpu_init(dev, int flags):
         if not devspec[0].isdigit() or not devspec[1].isdigit():
             raise ValueError, "OpenCL name incorrect. Should be opencl<int>:<int> instead got: " + dev
         else:
-            devnum = int(devspec[0]) << 16 | int(devspec[1])
+            gpucontext_props_opencl_dev(p, int(devspec[0]), int(devspec[1]))
     else:
         raise ValueError, "Unknown device format:" + dev
-    return GpuContext(kind, devnum, flags)
 
-def init(dev, sched='default', disable_alloc_cache=False, single_stream=False):
+    res = GpuContext.__new__(GpuContext)
+    res.kind = kind
+    err = gpucontext_init(&res.ctx, <char *>res.kind, p)
+    if err != GA_NO_ERROR:
+        raise get_exc(err), gpucontext_error(NULL, err)
+    return res
+
+def init(dev, sched='default', single_stream=False, kernel_cache_path=None,
+         max_cache_size=sys.maxsize, initial_cache_size=0):
     """
-    init(dev, sched='default', disable_alloc_cache=False, single_stream=False)
+    init(dev, sched='default', single_stream=False, kernel_cache_path=None,
+         max_cache_size=sys.maxsize, initial_cache_size=0)
 
     Creates a context from a device specifier.
 
-    :param dev: device specifier
-    :type dev: string
-    :param sched: optimize scheduling for which type of operation
-    :type sched: {'default', 'single', 'multi'}
-    :param disable_alloc_cache: disable allocation cache (if any)
-    :type disable_alloc_cache: bool
-    :param single_stream: enable single stream mode
-    :type single_stream: bool
-    :rtype: GpuContext
-
     Device specifiers are composed of the type string and the device
     id like so::
 
@@ -562,25 +613,49 @@ def init(dev, sched='default', disable_alloc_cache=False, single_stream=False):
     list available platforms and devices.  You can experiement with
     the values, unavaiable ones will just raise an error, and there
     are no gaps in the valid numbers.
+
+    Parameters
+    ----------
+    dev: str
+        device specifier
+    sched: {'default', 'single', 'multi'}
+        optimize scheduling for which type of operation
+    single_stream: bool
+        enable single stream mode
+    kernel_cache_path, max_cache_size, initial_cache_size: str, int, int
+        kernel cache path; maximum and initial allocation cache sizes
+
     """
-    cdef int flags = 0
-    expected_version = -9997
-    if gpuarray_api_major != expected_version or gpuarray_api_minor < 0:
-        raise RuntimeError(
-            "Pygpu was expecting libgpuarray version %d, but %d is available. "
-            "Recompile it to avoid problems.",
-            expected_version, gpuarray_api_major)
-    if sched == 'single':
-        flags |= GA_CTX_SINGLE_THREAD
-    elif sched == 'multi':
-        flags |= GA_CTX_MULTI_THREAD
-    elif sched != 'default':
-        raise TypeError('unexpected value for parameter sched: %s' % (sched,))
-    if disable_alloc_cache:
-        flags |= GA_CTX_DISABLE_ALLOCATION_CACHE
-    if single_stream:
-        flags |= GA_CTX_SINGLE_STREAM
-    return pygpu_init(dev, flags)
+    cdef gpucontext_props *p = NULL
+    cdef int err
+    cdef bytes kernel_cache_path_b
+    err = gpucontext_props_new(&p)
+    if err != GA_NO_ERROR:
+        raise MemoryError
+    try:
+        if sched == 'single':
+            err = gpucontext_props_sched(p, GA_CTX_SCHED_SINGLE)
+        elif sched == 'multi':
+            err = gpucontext_props_sched(p, GA_CTX_SCHED_MULTI)
+        elif sched != 'default':
+            raise TypeError('unexpected value for parameter sched: %s' % (sched,))
+        if err != GA_NO_ERROR:
+            raise get_exc(err), gpucontext_error(NULL, err)
+
+        if kernel_cache_path:
+            kernel_cache_path_b = _s(kernel_cache_path)
+            gpucontext_props_kernel_cache(p, <const char *>kernel_cache_path_b)
+
+        err = gpucontext_props_alloc_cache(p, initial_cache_size,
+                                           max_cache_size)
+        if err != GA_NO_ERROR:
+            raise get_exc(err), gpucontext_error(NULL, err)
+        if single_stream:
+            gpucontext_props_set_single_stream(p);
+    except:
+        gpucontext_props_del(p)
+        raise
+    return pygpu_init(dev, p)
 
 def zeros(shape, dtype=GA_DOUBLE, order='C', GpuContext context=None,
           cls=None):
@@ -590,17 +665,19 @@ def zeros(shape, dtype=GA_DOUBLE, order='C', GpuContext context=None,
     Returns an array of zero-initialized values of the requested
     shape, type and order.
 
-    :param shape: number of elements in each dimension
-    :type shape: iterable of ints
-    :param dtype: type of the elements
-    :type dtype: string, numpy.dtype or int
-    :param order: layout of the data in memory, one of 'A'ny, 'C' or 'F'ortran
-    :type order: string
-    :param context: context in which to do the allocation
-    :type context: GpuContext
-    :param cls: class of the returned array (must inherit from GpuArray)
-    :type cls: class
-    :rtype: array
+    Parameters
+    ----------
+    shape: iterable of ints
+        number of elements in each dimension
+    dtype: str, numpy.dtype or int
+        type of the elements
+    order: {'A', 'C', 'F'}
+        layout of the data in memory, one of 'A'ny, 'C' or 'F'ortran
+    context: GpuContext
+        context in which to do the allocation
+    cls: type
+        class of the returned array (must inherit from GpuArray)
+
     """
     res = empty(shape, dtype=dtype, order=order, context=context, cls=cls)
     array_memset(res, 0)
@@ -623,17 +700,6 @@ cdef GpuArray pygpu_empty(unsigned int nd, const size_t *dims, int typecode,
     array_empty(res, context.ctx, typecode, nd, dims, order)
     return res
 
-cdef GpuArray pygpu_fromhostdata(void *buf, int typecode, unsigned int nd,
-                                 const size_t *dims, const ssize_t *strides,
-                                 GpuContext context, object cls):
-    cdef GpuArray res
-    context = ensure_context(context)
-
-    res = new_GpuArray(cls, context, None)
-    array_copy_from_host(res, context.ctx, buf, typecode, nd,
-                         dims, strides)
-    return res
-
 cdef GpuArray pygpu_fromgpudata(gpudata *buf, size_t offset, int typecode,
                                 unsigned int nd, const size_t *dims,
                                 const ssize_t *strides, GpuContext context,
@@ -664,17 +730,19 @@ def empty(shape, dtype=GA_DOUBLE, order='C', GpuContext context=None,
     Returns an empty (uninitialized) array of the requested shape,
     type and order.
 
-    :param shape: number of elements in each dimension
-    :type shape: iterable of ints
-    :param dtype: type of the elements
-    :type dtype: string, numpy.dtype or int
-    :param order: layout of the data in memory, one of 'A'ny, 'C' or 'F'ortran
-    :type order: string
-    :param context: context in which to do the allocation
-    :type context: GpuContext
-    :param cls: class of the returned array (must inherit from GpuArray)
-    :type cls: class
-    :rtype: array
+    Parameters
+    ----------
+    shape: iterable of ints
+        number of elements in each dimension
+    dtype: str, numpy.dtype or int
+        type of the elements
+    order: {'A', 'C', 'F'}
+        layout of the data in memory, one of 'A'ny, 'C' or 'F'ortran
+    context: GpuContext
+        context in which to do the allocation
+    cls: type
+        class of the returned array (must inherit from GpuArray)
+
     """
     cdef size_t *cdims
     cdef unsigned int nd
@@ -703,16 +771,6 @@ def asarray(a, dtype=None, order='A', GpuContext context=None):
 
     Returns a GpuArray from the data in `a`
 
-    :param a: data
-    :type shape: array-like
-    :param dtype: type of the elements
-    :type dtype: string, numpy.dtype or int
-    :param order: layout of the data in memory, one of 'A'ny, 'C' or 'F'ortran
-    :type order: string or int
-    :param context: context in which to do the allocation
-    :type context: GpuContext
-    :rtype: GpuArray
-
     If `a` is already a GpuArray and all other parameters match, then
     the object itself returned.  If `a` is an instance of a subclass
     of GpuArray then a view of the base class will be returned.
@@ -720,6 +778,18 @@ def asarray(a, dtype=None, order='A', GpuContext context=None):
 
     `context` is optional if `a` is a GpuArray (but must match exactly
     the context of `a` if specified) and is mandatory otherwise.
+
+    Parameters
+    ----------
+    a: array-like
+        data
+    dtype: str, numpy.dtype or int
+        type of the elements
+    order: {'A', 'C', 'F'}
+        layout of the data in memory, one of 'A'ny, 'C' or 'F'ortran
+    context: GpuContext
+        context in which to do the allocation
+
     """
     return array(a, dtype=dtype, order=order, copy=False, context=context,
                  cls=GpuArray)
@@ -730,16 +800,18 @@ def ascontiguousarray(a, dtype=None, GpuContext context=None):
 
     Returns a contiguous array in device memory (C order).
 
-    :param a: input
-    :type a: array-like
-    :param dtype: type of the return array
-    :type dtype: string, numpy.dtype or int
-    :param context: context to use for a new array
-    :type context: GpuContext
-    :rtype: array
-
     `context` is optional if `a` is a GpuArray (but must match exactly
     the context of `a` if specified) and is mandatory otherwise.
+
+    Parameters
+    ----------
+    a: array-like
+        input
+    dtype: str, numpy.dtype or int
+        type of the return array
+    context: GpuContext
+        context to use for a new array
+
     """
     return array(a, order='C', dtype=dtype, ndmin=1, copy=False,
                  context=context)
@@ -750,16 +822,18 @@ def asfortranarray(a, dtype=None, GpuArray context=None):
 
     Returns a contiguous array in device memory (Fortran order)
 
-    :param a: input
-    :type a: array-like
-    :param dtype: type of the elements
-    :type dtype: string, numpy.dtype or int
-    :param context: context in which to do the allocation
-    :type context: GpuContext
-    :rtype: array
-
     `context` is optional if `a` is a GpuArray (but must match exactly
     the context of `a` if specified) and is mandatory otherwise.
+
+    Parameters
+    ----------
+    a: array-like
+        input
+    dtype: str, numpy.dtype or int
+        type of the elements
+    context: GpuContext
+        context in which to do the allocation
+
     """
     return array(a, order='F', dtype=dtype, ndmin=1, copy=False,
                  context=context)
@@ -779,33 +853,39 @@ def from_gpudata(size_t data, offset, dtype, shape, GpuContext context=None,
 
     Build a GpuArray from pre-allocated gpudata
 
-    :param data: pointer to a gpudata structure
-    :type data: int
-    :param offset: offset to the data location inside the gpudata
-    :type offset: int
-    :param dtype: data type of the gpudata elements
-    :type dtype: numpy.dtype
-    :param shape: shape to use for the result
-    :type shape: iterable of ints
-    :param context: context of the gpudata
-    :type context: GpuContext
-    :param strides: strides for the results
-    :type strides: iterable of ints
-    :param writable: is the data writable?
-    :type writeable: bool
-    :param base: base object that keeps gpudata alive
-    :param cls: view type of the result
+    Parameters
+    ----------
+    data: int
+        pointer to a gpudata structure
+    offset: int
+        offset to the data location inside the gpudata
+    dtype: numpy.dtype
+        data type of the gpudata elements
+    shape: iterable of ints
+        shape to use for the result
+    context: GpuContext
+        context of the gpudata
+    strides: iterable of ints
+        strides for the results (C contiguous if not specified)
+    writable: bool
+        is the data writable?
+    base: object
+        base object that keeps gpudata alive
+    cls: type
+        view type of the result
+
+    Notes
+    -----
+    This function might be deprecated in a later release since the only
+    way to create gpudata pointers is through libgpuarray functions
+    that aren't exposed at the python level. It can be used with the
+    value of the `gpudata` attribute of an existing GpuArray.
 
     .. warning::
         This function is intended for advanced use and will crash the
         interpreter if used improperly.
 
-    .. note::
-        This function might be deprecated in a later relase since the
-        only way to create gpudata pointers is through libgpuarray
-        functions that aren't exposed at the python level. It can be
-        used with the value of the `gpudata` attribute of an existing
-        GpuArray.
+
     """
     cdef size_t *cdims = NULL
     cdef ssize_t *cstrides = NULL
@@ -839,7 +919,7 @@ def from_gpudata(size_t data, offset, dtype, shape, GpuContext context=None,
         else:
             size = gpuarray_get_elsize(typecode)
             for i in range(nd-1, -1, -1):
-                strides[i] = size
+                cstrides[i] = size
                 size *= cdims[i]
 
         return pygpu_fromgpudata(<gpudata *>data, offset, typecode, nd, cdims,
@@ -848,44 +928,49 @@ def from_gpudata(size_t data, offset, dtype, shape, GpuContext context=None,
         free(cdims)
         free(cstrides)
 
-def array(proto, dtype=None, copy=True, order=None, int ndmin=0,
+def array(proto, dtype=None, copy=True, order=None, unsigned int ndmin=0,
           GpuContext context=None, cls=None):
     """
     array(obj, dtype='float64', copy=True, order=None, ndmin=0, context=None, cls=None)
 
     Create a GpuArray from existing data
 
-    :param obj: data to initialize the result
-    :type obj: array-like
-    :param dtype: data type of the result elements
-    :type dtype: string or numpy.dtype or int
-    :param copy: return a copy?
-    :type copy: bool
-    :param order: memory layout of the result
-    :type order: string
-    :param ndmin: minimum number of result dimensions
-    :type ndmin: int
-    :param context: allocation context
-    :type context: GpuContext
-    :param cls: result class (must inherit from GpuArray)
-    :type cls: class
-    :rtype: GpuArray
-
     This function creates a new GpuArray from the data provided in
     `obj` except if `obj` is already a GpuArray and all the parameters
     match its properties and `copy` is False.
 
     The properties of the resulting array depend on the input data
-    except if overriden by other parameters.
+    except if overridden by other parameters.
 
     This function is similar to :meth:`numpy.array` except that it returns
     GpuArrays.
+
+    Parameters
+    ----------
+    obj: array-like
+        data to initialize the result
+    dtype: string or numpy.dtype or int
+        data type of the result elements
+    copy: bool
+        return a copy?
+    order: str
+        memory layout of the result
+    ndmin: int
+        minimum number of result dimensions
+    context: GpuContext
+        allocation context
+    cls: type
+        result class (must inherit from GpuArray)
+
     """
+    return carray(proto, dtype, copy, order, ndmin, context, cls)
+
+cdef carray(proto, dtype, copy, order, unsigned int ndmin,
+            GpuContext context, cls):
     cdef GpuArray res
     cdef GpuArray arg
     cdef GpuArray tmp
     cdef np.ndarray a
-    cdef ga_order ord
 
     if isinstance(proto, GpuArray):
         arg = proto
@@ -929,12 +1014,18 @@ def array(proto, dtype=None, copy=True, order=None, int ndmin=0,
 
     context = ensure_context(context)
 
+    # We need a contiguous array for the copy
+    if order != 'C' and order != 'F':
+        order = 'C'
+
     a = numpy.array(proto, dtype=dtype_to_npdtype(dtype), order=order,
                     ndmin=ndmin, copy=False)
 
-    return pygpu_fromhostdata(np.PyArray_DATA(a), dtype_to_typecode(a.dtype),
-                              np.PyArray_NDIM(a), <size_t *>np.PyArray_DIMS(a),
-                              <ssize_t *>np.PyArray_STRIDES(a), context, cls)
+    res = pygpu_empty(np.PyArray_NDIM(a), <size_t *>np.PyArray_DIMS(a),
+                      dtype_to_typecode(a.dtype), to_ga_order(order),
+                      context, cls)
+    array_write(res, np.PyArray_DATA(a), np.PyArray_NBYTES(a))
+    return res
 
 cdef void (*cuda_enter)(gpucontext *)
 cdef void (*cuda_exit)(gpucontext *)
@@ -946,17 +1037,6 @@ cdef class GpuContext:
     """
     Class that holds all the information pertaining to a context.
 
-    .. code-block:: python
-
-        GpuContext(kind, devno, flags)
-
-    :param kind: module name for the context
-    :type kind: string
-    :param devno: device number
-    :type devno: int
-    :param flags: context flags
-    :type flags: int
-
     The currently implemented modules (for the `kind` parameter) are
     "cuda" and "opencl".  Which are available depends on the build
     options for libgpuarray.
@@ -966,6 +1046,16 @@ cdef class GpuContext:
     one value you must bitwise OR them together.
 
     If you want an alternative interface check :meth:`~pygpu.gpuarray.init`.
+
+    Parameters
+    ----------
+    kind: str
+        module name for the context
+    devno: int
+        device number
+    flags: int
+        context flags
+
     """
     def __dealloc__(self):
         if self.ctx != NULL:
@@ -974,16 +1064,9 @@ cdef class GpuContext:
     def __reduce__(self):
         raise RuntimeError, "Cannot pickle GpuContext object"
 
-    def __cinit__(self, bytes kind, devno, int flags):
-        cdef int err = GA_NO_ERROR
-        cdef gpucontext *ctx
-        self.kind = kind
-        self.ctx = gpucontext_init(<char *>self.kind, devno, flags, &err)
-        if (err != GA_NO_ERROR):
-            if err == GA_VALUE_ERROR:
-                raise get_exc(err), "No device %d"%(devno,)
-            else:
-                raise get_exc(err), gpucontext_error(NULL, err).decode('utf-8') + ": " + str(devno)
+    def __init__(self):
+        if type(self) is GpuContext:
+            raise RuntimeError, "Called raw GpuContext.__init__"
 
     def __enter__(self):
         if cuda_enter == NULL:
@@ -1006,22 +1089,18 @@ cdef class GpuContext:
     property devname:
         "Device name for this context"
         def __get__(self):
-            cdef char *tmp
-            cdef unicode res
+            cdef char tmp[256]
 
-            ctx_property(self, GA_CTX_PROP_DEVNAME, &tmp)
-            try:
-                res = tmp.decode('ascii')
-            finally:
-                free(tmp)
-            return res
+            ctx_property(self, GA_CTX_PROP_DEVNAME, tmp)
+            return tmp.decode('ascii')
 
-    property maxlsize:
-        "Maximum size of thread block (local size) for this context"
+    property unique_id:
+        "Device PCI Bus ID for this context"
         def __get__(self):
-            cdef size_t res
-            ctx_property(self, GA_CTX_PROP_MAXLSIZE, &res)
-            return res
+            cdef char tmp[16]
+
+            ctx_property(self, GA_CTX_PROP_UNIQUE_ID, tmp)
+            return tmp.decode('ascii')
 
     property lmemsize:
         "Size of the local (shared) memory, in bytes, for this context"
@@ -1037,13 +1116,6 @@ cdef class GpuContext:
             ctx_property(self, GA_CTX_PROP_NUMPROCS, &res)
             return res
 
-    property maxgsize:
-        "Maximum group size for kernel calls"
-        def __get__(self):
-            cdef size_t res
-            ctx_property(self, GA_CTX_PROP_MAXGSIZE, &res)
-            return res
-
     property bin_id:
         "Binary compatibility id"
         def __get__(self):
@@ -1107,6 +1179,13 @@ cdef class GpuContext:
             ctx_property(self, GA_CTX_PROP_MAXGSIZE2, &res)
             return res
 
+    property largest_memblock:
+        "Size of the largest memory block you can allocate"
+        def __get__(self):
+            cdef size_t res
+            ctx_property(self, GA_CTX_PROP_LARGEST_MEMBLOCK, &res)
+            return res
+
 
 cdef class flags(object):
     cdef int fl
@@ -1293,8 +1372,8 @@ cdef GpuArray pygpu_empty_like(GpuArray a, ga_order ord, int typecode):
     cdef GpuArray res
 
     if ord == GA_ANY_ORDER:
-        if py_CHKFLAGS(a, GA_F_CONTIGUOUS) and \
-                not py_CHKFLAGS(a, GA_C_CONTIGUOUS):
+        if (py_CHKFLAGS(a, GA_F_CONTIGUOUS) and
+                not py_CHKFLAGS(a, GA_C_CONTIGUOUS)):
             ord = GA_F_ORDER
         else:
             ord = GA_C_ORDER
@@ -1308,14 +1387,20 @@ cdef GpuArray pygpu_empty_like(GpuArray a, ga_order ord, int typecode):
     return res
 
 cdef np.ndarray pygpu_as_ndarray(GpuArray a):
+    return _pygpu_as_ndarray(a, None)
+
+cdef np.ndarray _pygpu_as_ndarray(GpuArray a, np.dtype ldtype):
     cdef np.ndarray res
 
     if not py_ISONESEGMENT(a):
         a = pygpu_copy(a, GA_ANY_ORDER)
 
+    if ldtype is None:
+        ldtype = a.dtype
+
     res = PyArray_Empty(a.ga.nd, <np.npy_intp *>a.ga.dimensions,
-                        a.dtype, (py_CHKFLAGS(a, GA_F_CONTIGUOUS) and
-                                  not py_CHKFLAGS(a, GA_C_CONTIGUOUS)))
+                        ldtype, (py_CHKFLAGS(a, GA_F_CONTIGUOUS) and
+                                 not py_CHKFLAGS(a, GA_C_CONTIGUOUS)))
 
     array_read(np.PyArray_DATA(res), np.PyArray_NBYTES(res), a)
 
@@ -1338,29 +1423,35 @@ cdef GpuArray pygpu_reshape(GpuArray a, unsigned int nd, const size_t *newdims,
     if compute_axis < 0:
         array_reshape(res, a, nd, newdims, ord, nocopy)
         return res
-    if compute_axis >= nd:
-        raise ValueError("You wanted us to compute the shape of a dimensions that don't exist")
+    cdef unsigned int caxis = <unsigned int>compute_axis
+    if caxis >= nd:
+        raise ValueError("compute_axis is out of bounds")
 
     cdef size_t *cdims
     cdef size_t tot = 1
+    cdef unsigned int i
     for i in range(nd):
-        if i != compute_axis:
+        if i != caxis:
             tot *= newdims[i]
     cdims = <size_t *>calloc(nd, sizeof(size_t))
     if cdims == NULL:
         raise MemoryError, "could not allocate cdims"
 
-    for i in range(nd):
-        d = newdims[i]
-        if i == compute_axis:
-            d = a.size // tot
+    cdef size_t d
+    try:
+        for i in range(nd):
+            d = newdims[i]
+            if i == caxis:
+                d = a.size // tot
 
-            if d * tot != a.size:
-                raise GpuArrayException, "..."
-        cdims[i] = d
+                if d * tot != a.size:
+                    raise GpuArrayException, "..."
+            cdims[i] = d
 
-    array_reshape(res, a, nd, cdims, ord, nocopy)
-    return res
+        array_reshape(res, a, nd, cdims, ord, nocopy)
+        return res
+    finally:
+        free(cdims)
 
 
 cdef GpuArray pygpu_transpose(GpuArray a, const unsigned int *newaxes):
@@ -1374,6 +1465,9 @@ cdef int pygpu_transfer(GpuArray res, GpuArray a) except -1:
     return 0
 
 def _split(GpuArray a, ind, unsigned int axis):
+    """
+    _split(a, ind, axis)
+    """
     cdef list r = [None] * (len(ind) + 1)
     cdef Py_ssize_t i
     if not axis < a.ga.nd:
@@ -1410,6 +1504,9 @@ cdef GpuArray pygpu_concatenate(const _GpuArray **a, size_t n,
 
 def _concatenate(list al, unsigned int axis, int restype, object cls,
                  GpuContext context):
+    """
+    _concatenate(al, axis, restype, cls, context)
+    """
     cdef Py_ssize_t i
     context = ensure_context(context)
     cdef const _GpuArray **als = <const _GpuArray **>PyMem_Malloc(sizeof(_GpuArray *) * len(al))
@@ -1424,13 +1521,47 @@ def _concatenate(list al, unsigned int axis, int restype, object cls,
     finally:
         PyMem_Free(als)
 
+cdef int (*cuda_get_ipc_handle)(gpudata *, GpuArrayIpcMemHandle *)
+cdef gpudata *(*cuda_open_ipc_handle)(gpucontext *, GpuArrayIpcMemHandle *, size_t)
+
+cuda_get_ipc_handle = <int (*)(gpudata *, GpuArrayIpcMemHandle *)>gpuarray_get_extension("cuda_get_ipc_handle")
+cuda_open_ipc_handle = <gpudata *(*)(gpucontext *, GpuArrayIpcMemHandle *, size_t)>gpuarray_get_extension("cuda_open_ipc_handle")
+
+def open_ipc_handle(GpuContext c, bytes hpy, size_t l):
+    """
+    open_ipc_handle(c, hpy, l)
+
+    Open an IPC handle and return the gpudata pointer as an integer.
+
+    Parameters
+    ----------
+    c: GpuContext
+        context
+    hpy: bytes
+        binary handle data received
+    l: int
+        size of the referred memory block
+
+    """
+    cdef GpuArrayIpcMemHandle h
+    if cuda_open_ipc_handle is NULL:
+        raise SystemError, "Could not get necessary extension"
+    # Reject short input so memcpy cannot read past the end of hpy.
+    if <size_t>len(hpy) < sizeof(h):
+        raise ValueError, "IPC handle data is too short"
+    memcpy(&h, <char *>hpy, sizeof(h))
+    cdef gpudata *d = cuda_open_ipc_handle(c.ctx, &h, l)
+    if d is NULL:
+        raise GpuArrayException, gpucontext_error(c.ctx, 0)
+    return <size_t>d
+
 cdef class GpuArray:
     """
     Device array
 
     To create instances of this class use
     :meth:`~pygpu.gpuarray.zeros`, :meth:`~pygpu.gpuarray.empty` or
-    :meth:`~pygpu.gpuarray.array`.  It cannot be instanciated
+    :meth:`~pygpu.gpuarray.array`.  It cannot be instantiated
     directly.
 
     You can also subclass this class and make the module create your
@@ -1464,7 +1595,7 @@ cdef class GpuArray:
             k = PyNumber_Index(key)
             if k < 0:
                 k += self.ga.dimensions[i]
-            if k < 0 or k >= self.ga.dimensions[i]:
+            if k < 0 or (<size_t>k) >= self.ga.dimensions[i]:
                 raise IndexError, "index %d out of bounds" % (i,)
             start[0] = k
             step[0] = 0
@@ -1473,9 +1604,7 @@ cdef class GpuArray:
             pass
 
         if isinstance(key, slice):
-            # C compiler complains about argument 1 (key) because it's
-            # declared as a PyObject.  But we know it's a slice so it's ok.
-            PySlice_GetIndicesEx(<slice_object>key, self.ga.dimensions[i],
+            PySlice_GetIndicesEx(key, self.ga.dimensions[i],
                                  start, stop, step, &dummy)
             if stop[0] < start[0] and step[0] > 0:
                 stop[0] = start[0]
@@ -1487,7 +1616,10 @@ cdef class GpuArray:
             raise IndexError, "cannot index with: %s" % (key,)
 
     def write(self, np.ndarray src not None):
-        """Writes host's Numpy array to device's GpuArray.
+        """
+        write(src)
+
+        Writes host's Numpy array to device's GpuArray.
 
         This method is as fast as or even faster than :ref:asarray, because it
         skips possible allocation of a buffer in device's memory. It uses this
@@ -1499,11 +1631,16 @@ cdef class GpuArray:
         to be. It is allowed for this GpuArray and `src` to have different
         shapes.
 
-        :param src: source array in host
-        :type src: np.ndarray
+        Parameters
+        ----------
+        src: numpy.ndarray
+            source array in host
 
-        :raises ValueError: If this GpuArray is not compatible with `src` or
-            if it is not well behaved or contiguous.
+        Raises
+        ------
+        ValueError
+            If this GpuArray is not compatible with `src` or if it is
+            not well behaved or contiguous.
 
         """
         if not self.flags.behaved:
@@ -1526,7 +1663,10 @@ cdef class GpuArray:
         array_write(self, np.PyArray_DATA(src), sz)
 
     def read(self, np.ndarray dst not None):
-        """Reads from this GpuArray into host's Numpy array.
+        """
+        read(dst)
+
+        Reads from this GpuArray into host's Numpy array.
 
         This method is as fast as or even faster than :ref:__array__ method and
         thus :ref:numpy.asarray. This is because it skips allocation of a new
@@ -1538,17 +1678,24 @@ cdef class GpuArray:
         contiguous. It is allowed for this GpuArray and `dst` to have different
         shapes.
 
-        :param dst: destination array in host
-        :type dst: np.ndarray
+        Parameters
+        ----------
+        dst: numpy.ndarray
+            destination array in host
 
-        :raises ValueError: If this GpuArray is not compatible with `src` or
-            if `dst` is not well behaved.
+        Raises
+        ------
+        ValueError
+            If this GpuArray is not compatible with `dst` or if `dst`
+            is not well behaved.
 
         """
         if not np.PyArray_ISBEHAVED(dst):
             raise ValueError, "Destination Numpy array is not well behaved: aligned and writeable"
-        if not ((self.flags.c_contiguous and self.flags.aligned and dst.flags['C_CONTIGUOUS']) or \
-                (self.flags.f_contiguous and self.flags.aligned and dst.flags['F_CONTIGUOUS'])):
+        if (not ((self.flags.c_contiguous and self.flags.aligned and
+                  dst.flags['C_CONTIGUOUS']) or
+                (self.flags.f_contiguous and self.flags.aligned and
+                 dst.flags['F_CONTIGUOUS']))):
             raise ValueError, "GpuArray and Numpy array do not match in contiguity or GpuArray is not aligned"
         if self.dtype != dst.dtype:
             raise ValueError, "GpuArray and Numpy array do not have matching data types"
@@ -1561,15 +1708,42 @@ cdef class GpuArray:
             raise ValueError, "GpuArray and Numpy array do not have the same size in bytes"
         array_read(np.PyArray_DATA(dst), sz, self)
 
-    def __array__(self):
+    def get_ipc_handle(self):
+        """
+        get_ipc_handle()
+
+        Return the raw bytes of the CUDA IPC handle for this array's data.
+        """
+        cdef GpuArrayIpcMemHandle handle
+        if cuda_get_ipc_handle is NULL:
+            raise SystemError, "Could not get necessary extension"
+        if self.context.kind != b'cuda':
+            raise ValueError, "Only works for cuda contexts"
+        cdef int status = cuda_get_ipc_handle(self.ga.data, &handle)
+        if status != GA_NO_ERROR:
+            raise get_exc(status), GpuArray_error(&self.ga, status)
+        return <bytes>(<char *>&handle)[:sizeof(handle)]
+
+    def __array__(self, ldtype=None):
         """
-        __array__()
+        __array__(ldtype=None)
 
         Return a :class:`numpy.ndarray` with the same content.
 
         Automatically used by :meth:`numpy.asarray`.
         """
-        return pygpu_as_ndarray(self)
+        return _pygpu_as_ndarray(self, ldtype)
+
+    def __bool__(self):
+        """
+        __bool__()
+
+        Truth value: False when empty, the lone element's when size is 1.
+        """
+        if self.size not in (0, 1):
+            raise ValueError('The truth value of a multi-element array is ambiguous')
+        # Empty arrays are False without converting anything to numpy.
+        return bool(numpy.asarray(self)) if self.size else False
 
     def _empty_like_me(self, dtype=None, order='C'):
         """
@@ -1594,12 +1768,18 @@ cdef class GpuArray:
 
         Return a copy if this array.
 
-        :param order: memory layout of the copy
-        :type order: string
+        Parameters
+        ----------
+        order: {'C', 'A', 'F'}
+            memory layout of the copy
+
         """
         return pygpu_copy(self, to_ga_order(order))
 
     def transfer(self, GpuContext new_ctx):
+        """
+        transfer(new_ctx)
+        """
         cdef GpuArray r
         if not GpuArray_ISONESEGMENT(&self.ga):
             # For now raise an error, may make it work later
@@ -1636,10 +1816,14 @@ cdef class GpuArray:
 
         Return a view of this array.
 
-        :param cls: class of the view (must inherit from GpuArray)
-
         The returned array shares device data with this one and both
         will reflect changes made to the other.
+
+        Parameters
+        ----------
+        cls: type
+            class of the view (must inherit from GpuArray)
+
         """
         return pygpu_view(self, cls)
 
@@ -1649,18 +1833,21 @@ cdef class GpuArray:
 
         Cast the elements of this array to a new type.
 
-        :param dtype: type of the elements of the result
-        :type dtype: string or numpy.dtype or int
-        :param order: memory layout of the result
-        :type order: string
-        :param copy: Always return a copy?
-        :type copy: bool
-
         This function returns a new array will all elements cast to
         the supplied `dtype`, but otherwise unchanged.
 
         If `copy` is False and the type and order match `self` is
         returned.
+
+        Parameters
+        ----------
+        dtype: str or numpy.dtype or int
+            type of the elements of the result
+        order: {'A', 'C', 'F'}
+            memory layout of the result
+        copy: bool
+            Always return a copy?
+
         """
         cdef GpuArray res
         cdef int typecode = dtype_to_typecode(dtype)
@@ -1712,6 +1899,9 @@ cdef class GpuArray:
             free(newdims)
 
     def transpose(self, *params):
+        """
+        transpose(*params)
+        """
         cdef unsigned int *new_axes
         cdef unsigned int i
         if len(params) is 1 and isinstance(params[0], (tuple, list)):
@@ -1736,6 +1926,85 @@ cdef class GpuArray:
             raise TypeError, "len() of unsized object"
 
     def __getitem__(self, key):
+        cdef unsigned int i
+        # NumPy-style basic indexing; None entries insert length-1 axes.
+        if key is Ellipsis:
+            return self.__cgetitem__(key)
+
+        # A list or a sequence of lists should trigger "fancy" indexing.
+        # This is not implemented yet.
+        # Conversely, if a list contains slice or Ellipsis objects, it behaves
+        # the same as a tuple.
+        if isinstance(key, list):
+            if any(isinstance(k, slice) or k is Ellipsis for k in key):
+                return self.__getitem__(tuple(key))
+            else:
+                raise NotImplementedError, "fancy indexing not supported"
+
+        try:
+            iter(key)
+        except TypeError:
+            key = (key,)
+        else:
+            key = tuple(key)
+            # all() is vacuously true for (); let a[()] through (it is a view)
+            if key and all(isinstance(k, list) for k in key):
+                raise NotImplementedError, "fancy indexing not supported"
+
+        # Need to massage Ellipsis here, to avoid packing it into a tuple.
+        if countis(key, Ellipsis) > 1:
+            raise IndexError, "cannot use more than one Ellipsis"
+
+        # The following code replaces an Ellipsis found in the key by
+        # the corresponding number of slice(None) objects, depending on the
+        # number of dimensions.  As example, this allows indexing on the last
+        # dimension with a[..., 1:] on any array (including 1-dim).  This
+        # is also required for numpy compat.
+        try:
+            ell_idx = key.index(Ellipsis)
+        except ValueError:
+            pass
+        else:
+            # Need number of axes minus missing dimensions extra slice(None)
+            # objects, not counting None entries and the Ellipsis itself
+            num_slcs = self.ga.nd - (len(key) - countis(key, None) - 1)
+            fill_slices = (slice(None),) * num_slcs
+            key = key[:ell_idx] + fill_slices + key[ell_idx + 1:]
+
+        # Remove the None entries for indexing
+        getitem_idcs = tuple(k for k in key if k is not None)
+
+        # For 1 index or less, fill up with slice(None) to the right.
+        # This allows indexing a[1:] in multi-dimensional arrays, where the
+        # slice is applied along the first axis only. It also allows
+        # a[()], which simply is a view in Numpy.
+        if len(getitem_idcs) <= 1:
+            getitem_idcs = (getitem_idcs +
+                            (slice(None),) * (self.ga.nd - len(getitem_idcs)))
+
+        # Slice into array, then reshape, accommodating for None entries in key
+        sliced = self.__cgetitem__(getitem_idcs)
+        if countis(key, None) == 0:
+            # Avoid unnecessary reshaping if there was no None
+            return sliced
+        else:
+            new_shape = []
+            i = 0
+            if sliced.shape:
+                for k in key:
+                    if isinstance(k, int):
+                        continue
+                    elif k is None:
+                        new_shape.append(1)
+                    else:
+                        new_shape.append(sliced.shape[i])
+                        i += 1
+            # Add remaining entries from sliced.shape if existing (happens
+            # for 1 index or less if ndim >= 2).
+            new_shape.extend(sliced.shape[i:])
+            return sliced.reshape(new_shape)
+
+    cdef __cgetitem__(self, key):
         cdef ssize_t *starts
         cdef ssize_t *stops
         cdef ssize_t *steps
@@ -1770,13 +2039,13 @@ cdef class GpuArray:
                     # is also required for numpy compat.
                     el = key.index(Ellipsis)
                     if isinstance(key, tuple):
-                        key = key[:el] + \
-                              (Ellipsis,)*(self.ga.nd - (len(key) - 1)) + \
-                              key[el+1:]
+                        key = (key[:el] +
+                               (Ellipsis,)*(self.ga.nd - (len(key) - 1)) +
+                               key[el+1:])
                     else:
-                        key = key[:el] + \
-                              [Ellipsis,]*(self.ga.nd - (len(key) - 1)) + \
-                              key[el+1:]
+                        key = (key[:el] +
+                               [Ellipsis,]*(self.ga.nd - (len(key) - 1)) +
+                               key[el+1:])
                 if len(key) > self.ga.nd:
                     raise IndexError, "too many indices"
                 for i in range(0, len(key)):
@@ -1793,19 +2062,43 @@ cdef class GpuArray:
                 steps[i] = 1
 
             return pygpu_index(self, starts, stops, steps)
+
         finally:
             free(starts)
             free(stops)
             free(steps)
 
     def __setitem__(self, idx, v):
-        cdef GpuArray tmp = self.__getitem__(idx)
-        cdef GpuArray gv = asarray(v, dtype=self.dtype,
-                                   context=self.context)
+        cdef GpuArray tmp, gv
+        # Basic-indexing assignment: v is written into self[idx].
+        if isinstance(idx, list):
+            if any(isinstance(i, slice) or i is Ellipsis for i in idx):
+                idx = tuple(idx)  # acts like a tuple; assign only once below
+            else:
+                raise NotImplementedError, "fancy indexing not supported"
+        try:
+            iter(idx)
+        except TypeError:
+            idx = (idx,)
+        else:
+            idx = tuple(idx)
+            # all() is vacuously true for (); a[()] = v must not be rejected
+            if idx and all(isinstance(i, list) for i in idx):
+                raise NotImplementedError, "fancy indexing not supported"
 
+        if countis(idx, Ellipsis) > 1:
+            raise IndexError, "cannot use more than one Ellipsis"
+
+        # Remove None entries, they should be ignored (as in Numpy)
+        idx = tuple(i for i in idx if i is not None)
+        tmp = self.__cgetitem__(idx)
+        gv = carray(v, self.ga.typecode, False, 'A', 0, self.context, GpuArray)
         array_setarray(tmp, gv)
 
     def take1(self, GpuArray idx):
+        """
+        take1(idx)
+        """
         cdef GpuArray res
         cdef size_t odim
         if idx.ga.nd != 1:
@@ -1845,7 +2138,7 @@ cdef class GpuArray:
             cdef size_t *newdims
             cdef unsigned int nd
             cdef unsigned int i
-            cdef GpuArray res
+            cdef int err
             nd = <unsigned int>len(newshape)
             newdims = <size_t *>calloc(nd, sizeof(size_t))
             if newdims == NULL:
@@ -1853,20 +2146,11 @@ cdef class GpuArray:
             try:
                 for i in range(nd):
                     newdims[i] = newshape[i]
-                res = new_GpuArray(GpuArray, self.context, None)
-                array_reshape(res, self, nd, newdims, GA_C_ORDER, 1)
+                err = GpuArray_reshape_inplace(&self.ga, nd, newdims, GA_C_ORDER)
+                if err != GA_NO_ERROR:
+                    raise get_exc(err), GpuArray_error(&self.ga, err)
             finally:
                 free(newdims)
-            # This is safe becase the reshape above is a nocopy one
-            free(self.ga.dimensions)
-            free(self.ga.strides)
-            self.ga.dimensions = res.ga.dimensions
-            self.ga.strides = res.ga.strides
-            self.ga.nd = res.ga.nd
-            res.ga.dimensions = NULL
-            res.ga.strides = NULL
-            res.ga.nd = 0
-            array_clear(res)
 
     property T:
         def __get__(self):
@@ -1890,6 +2174,16 @@ cdef class GpuArray:
                 res[i] = self.ga.strides[i]
             return tuple(res)
 
+        def __set__(self, newstrides):
+            cdef unsigned int i
+            if len(newstrides) != self.ga.nd:
+                raise ValueError("new strides are the wrong length")
+            if not strides_ok(self,  newstrides):
+                raise ValueError("new strides go outside of allocated memory")
+            for i in range(self.ga.nd):
+                self.ga.strides[i] = newstrides[i]
+            array_fix_flags(self)
+
     property ndim:
         "The number of dimensions in this object"
         def __get__(self):
@@ -1926,70 +2220,66 @@ cdef class GpuArray:
         def __get__(self):
             return self.ga.offset
 
+    property data:
+        """Return a pointer to the raw OpenCL buffer object.
+
+        This will fail for arrays that have an offset.
+        """
+        def __get__(self):
+            if self.context.kind != b"opencl":
+                raise TypeError("This is for OpenCL arrays.")
+            if self.offset != 0:
+                raise ValueError("This array has an offset.")
+            # This wizardry grabs the actual backend pointer since it's
+            # guaranteed to be the first element of the gpudata
+            # structure.
+            return <size_t>((<void **>self.ga.data)[0])
+
+    property base_data:
+        "Return a pointer to the backing OpenCL object."
+        def __get__(self):
+            if self.context.kind != b"opencl":
+                raise TypeError("This is for OpenCL arrays.")
+            # This wizardry grabs the actual backend pointer since it's
+            # guaranteed to be the first element of the gpudata
+            # structure.
+            return <size_t>((<void **>self.ga.data)[0])
+
     property gpudata:
         "Return a pointer to the raw backend object."
         def __get__(self):
+            if self.context.kind != b"cuda":
+                raise TypeError("This is for CUDA arrays.")
             # This wizadry grabs the actual backend pointer since it's
             # guarenteed to be the first element of the gpudata
             # structure.
-            return <size_t>((<void **>self.ga.data)[0])
+            return <size_t>((<void **>self.ga.data)[0]) + self.offset
 
     def __str__(self):
         return str(numpy.asarray(self))
 
     def __repr__(self):
-        return 'gpuarray.' + repr(numpy.asarray(self))
+        try:
+            return 'gpuarray.' + repr(numpy.asarray(self))
+        except Exception:  # fall back to a placeholder instead of failing
+            return 'gpuarray.array(<content not available>)'
 
 
 
 cdef class GpuKernel:
     """
-    .. code-block:: python
-
-        GpuKernel(source, name, types, context=None, cluda=True, have_double=False, have_small=False, have_complex=False, have_half=False)
+    GpuKernel(source, name, types, context=None, have_double=False, have_small=False, have_complex=False, have_half=False, cuda=False, opencl=False)
 
     Compile a kernel on the device
 
-    :param source: complete kernel source code
-    :type source: string
-    :param name: function name of the kernel
-    :type name: string
-    :param types: list of argument types
-    :type types: list or tuple
-    :param context: device on which the kernel is compiled
-    :type context: GpuContext
-    :param cluda: use cluda layer?
-    :param have_double: ensure working doubles?
-    :param have_small: ensure types smaller than float will work?
-    :param have_complex: ensure complex types will work?
-    :param have_half: ensure half-floats will work?
-    :param binary: kernel is pre-compiled binary blob?
-    :param ptx: kernel is PTX code?
-    :param cuda: kernel is cuda code?
-    :param opencl: kernel is opencl code?
-
     The kernel function is retrieved using the provided `name` which
     must match what you named your kernel in `source`.  You can safely
     reuse the same name multiple times.
 
-    .. note::
-
-        With the cuda backend, unless you use `cluda=True`, you must
-        either pass the mangled name of your kernel or declare the
-        function 'extern "C"', because cuda uses a C++ compiler
-        unconditionally.
-
     The `have_*` parameter are there to tell libgpuarray that we need
     the particular type or feature to work for this kernel.  If the
-    request can't be satified a
-    :class:`~pygpu.gpuarray.UnsupportedException` will be raised in the
-    constructor.
-
-    .. warning::
-
-        If you do not set the `have_` flags properly, you will either
-        get a device-specific error (the good case) or silent
-        completly bogus data (the bad case).
+    request can't be satisfied a :class:`.UnsupportedException` will be
+    raised in the constructor.
 
     Once you have the kernel object you can simply call it like so::
 
@@ -2003,13 +2293,51 @@ cdef class GpuKernel:
     sure to test against the size of your data.
 
     If you want more control over thread allocation you can use the
-    `ls` and `gs` parameters like so::
+    `gs` and `ls` parameters like so::
 
         k = GpuKernel(...)
-        k(param1, param2, ls=ls, gs=gs)
+        k(param1, param2, gs=gs, ls=ls)
 
     If you choose to use this interface, make sure to stay within the
-    limits of `k.maxlsize` and `ctx.maxgsize` or the call will fail.
+    limits of `k.maxlsize` or the call will fail.
+
+    Parameters
+    ----------
+    source: str
+        complete kernel source code
+    name: str
+        function name of the kernel
+    types: list or tuple
+        list of argument types
+    context: GpuContext
+        device on which the kernel is compiled
+    have_double: bool
+        ensure working doubles?
+    have_small: bool
+        ensure types smaller than float will work?
+    have_complex: bool
+        ensure complex types will work?
+    have_half: bool
+        ensure half-floats will work?
+    cuda: bool
+        kernel is cuda code?
+    opencl: bool
+        kernel is opencl code?
+
+    Notes
+    -----
+    With the cuda backend, unless you use the cluda include, you must
+    either pass the mangled name of your kernel or declare the
+    function 'extern "C"', because cuda uses a C++ compiler
+    unconditionally.
+
+    .. warning::
+
+        If you do not set the `have_` flags properly, you will either
+        get a device-specific error (the good case) or silent
+        completely bogus data (the bad case).
+
+
     """
     def __dealloc__(self):
         cdef unsigned int numargs
@@ -2035,9 +2363,8 @@ cdef class GpuKernel:
         raise RuntimeError, "Cannot pickle GpuKernel object"
 
     def __cinit__(self, source, name, types, GpuContext context=None,
-                  cluda=True, have_double=False, have_small=False,
-                  have_complex=False, have_half=False, binary=False,
-                  cuda=False, opencl=False, *a, **kwa):
+                  have_double=False, have_small=False, have_complex=False,
+                  have_half=False, cuda=False, opencl=False, *a, **kwa):
         cdef const char *s[1]
         cdef size_t l
         cdef unsigned int numargs
@@ -2050,8 +2377,6 @@ cdef class GpuKernel:
 
         self.context = ensure_context(context)
 
-        if cluda:
-            flags |= GA_USE_CLUDA
         if have_double:
             flags |= GA_USE_DOUBLE
         if have_small:
@@ -2060,8 +2385,6 @@ cdef class GpuKernel:
             flags |= GA_USE_COMPLEX
         if have_half:
             flags |= GA_USE_HALF
-        if binary:
-            flags |= GA_USE_BINARY
         if cuda:
             flags |= GA_USE_CUDA
         if opencl:
@@ -2090,12 +2413,15 @@ cdef class GpuKernel:
         finally:
             free(_types)
 
-    def __call__(self, *args, n=None, ls=None, gs=None, shared=0):
-        if n == None and (ls == None or gs == None):
+    def __call__(self, *args, n=None, gs=None, ls=None, shared=0):
+        """
+        __call__(*args, n=None, gs=None, ls=None, shared=0)
+        """
+        if n is None and (ls is None or gs is None):
             raise ValueError, "Must specify size (n) or both gs and ls"
-        self.do_call(n, ls, gs, args, shared)
+        self.do_call(n, gs, ls, args, shared)
 
-    cdef do_call(self, py_n, py_ls, py_gs, py_args, size_t shared):
+    cdef do_call(self, py_n, py_gs, py_ls, py_args, size_t shared):
         cdef size_t n
         cdef size_t gs[3]
         cdef size_t ls[3]
@@ -2138,7 +2464,7 @@ cdef class GpuKernel:
                     raise ValueError, "nd mismatch for gs (int)"
                 gs[0] = py_gs
             elif isinstance(py_gs, (list, tuple)):
-                if len(py_gs) < 3:
+                if len(py_gs) > 3:
                     raise ValueError, "gs is not of length 3 or less"
                 if len(py_ls) != nd:
                     raise ValueError, "nd mismatch for gs (tuple)"
@@ -2162,8 +2488,8 @@ cdef class GpuKernel:
             if nd != 1:
                 raise ValueError, "n is specified and nd != 1"
             n = py_n
-            kernel_sched(self, n, &ls[0], &gs[0])
-        kernel_call(self, nd, ls, gs, shared, self.callbuf)
+            kernel_sched(self, n, &gs[0], &ls[0])
+        kernel_call(self, nd, gs, ls, shared, self.callbuf)
 
     cdef _setarg(self, unsigned int index, int typecode, object o):
         if typecode == GA_BUFFER:
@@ -2219,13 +2545,3 @@ cdef class GpuKernel:
             kernel_property(self, GA_KERNEL_PROP_NUMARGS, &res)
             return res
 
-    property _binary:
-        "Kernel compiled binary for the associated context."
-        def __get__(self):
-            cdef size_t sz
-            cdef char *bin
-            kernel_binary(self, &sz, <void **>&bin)
-            try:
-                return <bytes>bin[:sz]
-            finally:
-                free(bin)
diff --git a/pygpu/operations.py b/pygpu/operations.py
index bdb476b506..4908bb7bae 100644
--- a/pygpu/operations.py
+++ b/pygpu/operations.py
@@ -2,7 +2,7 @@
 
 from .gpuarray import _split, _concatenate, dtype_to_typecode
 from .dtypes import upcast
-from . import array, asarray
+from . import asarray
 
 
 def atleast_1d(*arys):
@@ -82,7 +82,8 @@ def array_split(ary, indices_or_sections, axis=0):
         # this madness is to support the numpy interface
         # it is supported by tests, but little else
         divs = (list(range(neach + 1, (neach + 1) * extra + 1, neach + 1)) +
-                list(range((neach + 1) * extra + neach, ary.shape[axis], neach)))
+                list(range((neach + 1) * extra + neach,
+                           ary.shape[axis], neach)))
         res = _split(ary, divs, axis)
     return res
 
diff --git a/pygpu/reduction.py b/pygpu/reduction.py
index 22f5a9c927..2880ff375b 100644
--- a/pygpu/reduction.py
+++ b/pygpu/reduction.py
@@ -29,12 +29,16 @@ def _ceil_log2(x):
     else:
         return 0
 
+
 basic_kernel = Template("""
+#include "cluda.h"
+
 ${preamble}
 
 #define REDUCE(a, b) (${reduce_expr})
 
-KERNEL void ${name}(const unsigned int n, ${out_arg.decltype()} out
+KERNEL void ${name}(const unsigned int n, ${out_arg.decltype()} out,
+                    const unsigned int out_off
 % for d in range(nd):
                     , const unsigned int dim${d}
 % endfor
@@ -61,6 +65,8 @@ def _ceil_log2(x):
   ${arg.name}_data = (${arg.decltype()})tmp;
   % endif
 % endfor
+  tmp = (GLOBAL_MEM char *)out; tmp += out_off;
+  out = (${out_arg.decltype()})tmp;
 
   i = GID_0;
 % for i in range(nd-1, -1, -1):
@@ -122,6 +128,7 @@ def _ceil_log2(x):
       ldata[lid] = REDUCE(ldata[lid], ldata[lid+${cur_size}]);
     }
   % endwhile
+  local_barrier();
   if (lid == 0) out[GID_0] = ldata[0];
 }
 """)
@@ -130,11 +137,6 @@ def _ceil_log2(x):
 class ReductionKernel(object):
     def __init__(self, context, dtype_out, neutral, reduce_expr, redux,
                  map_expr=None, arguments=None, preamble="", init_nd=None):
-        """
-        :param init_nd: used to pre compile the reduction code for
-            this value of nd and the self.init_local_size value.
-
-        """
         self.context = context
         self.neutral = neutral
         self.redux = tuple(redux)
@@ -146,10 +148,17 @@ def __init__(self, context, dtype_out, neutral, reduce_expr, redux,
         if isinstance(arguments, str):
             self.arguments = parse_c_args(arguments)
         elif arguments is None:
-            self.arguments = [ArrayArg(numpy.dtype(self.dtype_out), '_reduce_input')]
+            self.arguments = [ArrayArg(numpy.dtype(self.dtype_out),
+                                       '_reduce_input')]
         else:
             self.arguments = arguments
 
+        if (self.dtype_out == numpy.dtype('float16') or
+                any(ar.dtype == numpy.dtype('float16')
+                    for ar in self.arguments)):
+            raise NotImplementedError('float16 not supported for the '
+                                      'reduction interface')
+
         self.reduce_expr = reduce_expr
         if map_expr is None:
             if len(self.arguments) != 1:
@@ -184,7 +193,7 @@ def __init__(self, context, dtype_out, neutral, reduce_expr, redux,
 
         self.init_local_size = min(context.lmemsize //
                                    self.out_arg.dtype.itemsize,
-                                   context.maxlsize)
+                                   context.maxlsize0)
 
         # this is to prep the cache
         if init_nd is not None:
@@ -219,7 +228,7 @@ def _gen_basic(self, ls, nd):
                                   redux=self.redux,
                                   neutral=self.neutral,
                                   map_expr=self.expression)
-        spec = ['uint32', gpuarray.GpuArray]
+        spec = ['uint32', gpuarray.GpuArray, 'uint32']
         spec.extend('uint32' for _ in range(nd))
         for i, arg in enumerate(self.arguments):
             spec.append(arg.spec())
@@ -227,7 +236,7 @@ def _gen_basic(self, ls, nd):
                 spec.append('uint32')
                 spec.extend('int32' for _ in range(nd))
         k = gpuarray.GpuKernel(src, "reduk", spec, context=self.context,
-                               cluda=True, **self.flags)
+                               **self.flags)
         return k, src, spec
 
     @lru_cache()
@@ -250,7 +259,7 @@ def __call__(self, *args, **kwargs):
         if gs == 0:
             gs = 1
         n /= gs
-        if gs > self.context.maxgsize:
+        if gs > self.context.maxgsize0:
             raise ValueError("Array too big to be reduced along the "
                              "selected axes")
 
@@ -269,7 +278,7 @@ def __call__(self, *args, **kwargs):
         else:
             k, _, _, ls = self._get_basic_kernel(2**_ceil_log2(n), nd)
 
-        kargs = [n, out]
+        kargs = [n, out, out.offset]
         kargs.extend(dims)
         for i, arg in enumerate(args):
             kargs.append(arg)
@@ -277,7 +286,7 @@ def __call__(self, *args, **kwargs):
                 kargs.append(offsets[i])
                 kargs.extend(strs[i])
 
-        k(*kargs, ls=ls, gs=gs)
+        k(*kargs, gs=gs, ls=ls)
 
         return out
 
diff --git a/pygpu/tests/main.py b/pygpu/tests/main.py
index 6e48005215..6d0c7c960f 100644
--- a/pygpu/tests/main.py
+++ b/pygpu/tests/main.py
@@ -3,7 +3,7 @@
 
 from nose.config import Config
 from nose.plugins.manager import PluginManager
-from numpy.testing.nosetester import import_nose, NoseTester
+from numpy.testing.nosetester import NoseTester
 from numpy.testing.noseclasses import KnownFailure, NumpyTestProgram
 
 
@@ -20,14 +20,16 @@ def _test_argv(self, verbose, extra_argv):
         """
         Generate argv for nosetest command
 
-        :type verbose: int
-        :param verbose: Verbosity value for test outputs, in the range 1-10.
-                        Default is 1.
+        Parameters
+        ----------
+        verbose: int
+            Verbosity value for test outputs, in the range 1-10.
+            Default is 1.
+        extra_argv: list
+            List with any extra arguments to pass to nosetests.
 
-        :type extra_argv: list
-        :param extra_argv: List with any extra arguments to pass to nosetests.
         """
-        #self.package_path = os.path.abspath(self.package_path)
+        # self.package_path = os.path.abspath(self.package_path)
         argv = [__file__, self.package_path]
         argv += ['--verbosity', str(verbose)]
         if extra_argv:
@@ -35,32 +37,28 @@ def _test_argv(self, verbose, extra_argv):
         return argv
 
     def _show_system_info(self):
-        nose = import_nose()
-
         import pygpu
-        #print ("pygpu version %s" % pygpu.__version__)
+        # print ("pygpu version %s" % pygpu.__version__)
         pygpu_dir = os.path.dirname(pygpu.__file__)
-        print ("pygpu is installed in %s" % pygpu_dir)
+        print("pygpu is installed in %s" % pygpu_dir)
 
         super(NoseTester, self)._show_system_info()
 
     def prepare_test_args(self, verbose=1, extra_argv=None, coverage=False,
-            capture=True, knownfailure=True):
+                          capture=True, knownfailure=True):
         """
         Prepare arguments for the `test` method.
 
         Takes the same arguments as `test`.
         """
-        # fail with nice error message if nose is not present
-        nose = import_nose()
-
         # compile argv
         argv = self._test_argv(verbose, extra_argv)
 
         # numpy way of doing coverage
         if coverage:
-            argv += ['--cover-package=%s' % self.package_name, '--with-coverage',
-                    '--cover-tests', '--cover-inclusive', '--cover-erase']
+            argv += ['--cover-package=%s' % self.package_name,
+                     '--with-coverage', '--cover-tests', '--cover-inclusive',
+                     '--cover-erase']
 
         # Capture output only if needed
         if not capture:
@@ -75,33 +73,34 @@ def prepare_test_args(self, verbose=1, extra_argv=None, coverage=False,
         return argv, plugins
 
     def test(self, verbose=1, extra_argv=None, coverage=False, capture=True,
-            knownfailure=True):
+             knownfailure=True):
         """
         Run tests for module using nose.
 
-        :type verbose: int
-        :param verbose: Verbosity value for test outputs, in the range 1-10.
-                        Default is 1.
-
-        :type extra_argv: list
-        :param extra_argv: List with any extra arguments to pass to nosetests.
-
-        :type coverage: bool
-        :param coverage: If True, report coverage of pygpu code. Default is False.
+        Parameters
+        ----------
+        verbose: int
+            Verbosity value for test outputs, in the range 1-10.
+            Default is 1.
+        extra_argv: list
+            List with any extra arguments to pass to nosetests.
+        coverage: bool
+            If True, report coverage of pygpu code. Default is False.
+        capture: bool
+            If True, capture the standard output of the tests, like
+            nosetests does in command-line. The output of failing
+            tests will be displayed at the end. Default is True.
+        knownfailure: bool
+            If True, tests raising KnownFailureTest will not be
+            considered Errors nor Failure, but reported as "known
+            failures" and treated quite like skipped tests.  Default
+            is True.
+
+        Returns
+        -------
+        nose.result.TextTestResult
+            The result of running the tests
 
-        :type capture: bool
-        :param capture: If True, capture the standard output of the tests, like
-                        nosetests does in command-line. The output of failing
-                        tests will be displayed at the end. Default is True.
-
-        :type knownfailure: bool
-        :param knownfailure: If True, tests raising KnownFailureTest will
-                not be considered Errors nor Failure, but reported as
-                "known failures" and treated quite like skipped tests.
-                Default is True.
-
-        :returns: Returns the result of running the tests as a
-                  ``nose.result.TextTestResult`` object.
         """
         # cap verbosity at 3 because nose becomes *very* verbose beyond that
         verbose = min(verbose, 3)
@@ -119,7 +118,7 @@ def test(self, verbose=1, extra_argv=None, coverage=False, capture=True,
                 "launch pygpu.test()."))
 
         argv, plugins = self.prepare_test_args(verbose, extra_argv, coverage,
-                capture, knownfailure)
+                                               capture, knownfailure)
 
         # The "plugins" keyword of NumpyTestProgram gets ignored if config is
         # specified. Moreover, using "addplugins" instead can lead to strange
@@ -127,20 +126,3 @@ def test(self, verbose=1, extra_argv=None, coverage=False, capture=True,
         cfg = Config(includeExe=True, plugins=PluginManager(plugins=plugins))
         t = NumpyTestProgram(argv=argv, exit=False, config=cfg)
         return t.result
-
-
-def main(modulename):
-    debug = False
-
-    if 0:
-        unittest.main()
-    elif len(sys.argv)==2 and sys.argv[1]=="--debug":
-        module = __import__(modulename)
-        tests = unittest.TestLoader().loadTestsFromModule(module)
-        tests.debug()
-    elif len(sys.argv)==1:
-        module = __import__(modulename)
-        tests = unittest.TestLoader().loadTestsFromModule(module)
-        unittest.TextTestRunner(verbosity=2).run(tests)
-    else:
-        print ("options: [--debug]")
diff --git a/pygpu/tests/support.py b/pygpu/tests/support.py
index 2b3eb61f23..2eda88737f 100644
--- a/pygpu/tests/support.py
+++ b/pygpu/tests/support.py
@@ -1,6 +1,8 @@
 from __future__ import print_function
 
-import os, sys
+import os
+import sys
+
 import numpy
 from nose.plugins.skip import SkipTest
 
@@ -22,11 +24,14 @@
 dtypes_no_complex_big = ["float32", "float64", "int16", "uint16",
                          "int32", "int64", "uint32", "uint64"]
 
+
 def get_env_dev():
     for name in ['GPUARRAY_TEST_DEVICE', 'DEVICE']:
         if name in os.environ:
             return os.environ[name]
-    raise RuntimeError("No test device specified.  Specify one using the DEVICE or GPUARRAY_TEST_DEVICE environment variables.")
+    raise RuntimeError(
+        "No test device specified.  Specify one using the DEVICE "
+        "or GPUARRAY_TEST_DEVICE environment variables.")
 
 
 context = gpuarray.init(get_env_dev())
@@ -132,10 +137,11 @@ def gen_gpuarray(shape_orig, dtype='float32', offseted_outer=False,
     a += incr
 
     a = numpy.asarray(a, dtype=dtype)
+    b = gpuarray.array(a, context=ctx, cls=cls)
     assert order in ['c', 'f']
     if order == 'f' and len(shape) > 0:
         a = numpy.asfortranarray(a)
-    b = gpuarray.array(a, context=ctx, cls=cls)
+        b = gpuarray.asfortranarray(b)
     if order == 'f' and len(shape) > 0 and b.size > 1:
         assert b.flags['F_CONTIGUOUS']
 
diff --git a/pygpu/tests/test_basic.py b/pygpu/tests/test_basic.py
new file mode 100644
index 0000000000..95869a8a50
--- /dev/null
+++ b/pygpu/tests/test_basic.py
@@ -0,0 +1,84 @@
+import pygpu
+
+from pygpu.basic import (tril, triu)
+from unittest import TestCase
+from .support import (guard_devsup, gen_gpuarray, context)
+import numpy
+
+
+def test_tril():
+    for dtype in ['float32','float64']:
+        for shape in [(10, 5), (5, 10), (10, 10)]:
+            for order in ['c', 'f']:
+                for inplace in [True, False]:
+                    yield run_tril, dtype, shape, order, inplace
+
+@guard_devsup
+def run_tril(dtype, shape, order, inplace):
+    ac, ag = gen_gpuarray(shape, dtype, order=order, ctx=context)
+    result = tril(ag, inplace=inplace)
+    assert numpy.all(numpy.tril(ac) == result)
+    if inplace:
+        assert numpy.all(numpy.tril(ac) == ag)
+    else:
+        assert numpy.all(ac == ag)
+
+
+def test_triu():
+    for dtype in ['float32','float64']:
+        for shape in [(10, 5), (5, 10), (10, 10)]:
+            for order in ['c', 'f']:
+                for inplace in [True, False]:
+                    yield run_triu, dtype, shape, order, inplace
+
+@guard_devsup
+def run_triu(dtype, shape, order, inplace):
+    ac, ag = gen_gpuarray(shape, dtype, order=order, ctx=context)
+    result = triu(ag, inplace=inplace)
+    assert numpy.all(numpy.triu(ac) == result)
+    if inplace:
+        assert numpy.all(numpy.triu(ac) == ag)
+    else:
+        assert numpy.all(ac == ag)
+
+
+class test_errors(TestCase):
+
+    def runTest(self):
+        self.assertRaises(ValueError, self.run_1d_triu)
+        self.assertRaises(ValueError, self.run_3d_triu)
+        self.assertRaises(ValueError, self.run_1d_tril)
+        self.assertRaises(ValueError, self.run_3d_tril)
+
+        self.assertRaises(ValueError, self.run_noncontiguous_tril)
+        self.assertRaises(ValueError, self.run_noncontiguous_triu)
+
+    def run_1d_triu(self):
+        ac, ag = gen_gpuarray((10, ), 'float32', ctx=context)
+        triu(ag)
+
+    def run_3d_triu(self):
+        ac, ag = gen_gpuarray((10, 10, 10), 'float32', ctx=context)
+        triu(ag)
+
+    def run_1d_tril(self):
+        ac, ag = gen_gpuarray((10, ), 'float32', ctx=context)
+        tril(ag)
+
+    def run_3d_tril(self):
+        ac, ag = gen_gpuarray((10, 10, 10), 'float32', ctx=context)
+        tril(ag)
+
+    def run_noncontiguous_tril(self):
+        a = numpy.random.rand(5, 5)
+        b = pygpu.array(a, context=context)
+        b = b[::-1]
+        assert b.flags.c_contiguous is b.flags.f_contiguous is False
+        tril(b)
+
+    def run_noncontiguous_triu(self):
+        a = numpy.random.rand(5, 5)
+        b = pygpu.array(a, context=context)
+        b = b[::-1]
+        assert b.flags.c_contiguous is b.flags.f_contiguous is False
+        triu(b)
diff --git a/pygpu/tests/test_blas.py b/pygpu/tests/test_blas.py
index 119ef8e959..e22d298547 100644
--- a/pygpu/tests/test_blas.py
+++ b/pygpu/tests/test_blas.py
@@ -1,4 +1,5 @@
-﻿import numpy
+﻿from itertools import product
+import numpy
 from nose.plugins.skip import SkipTest
 
 from .support import (guard_devsup, gen_gpuarray, context)
@@ -13,30 +14,68 @@
     raise SkipTest("no scipy blas to compare against")
 
 import pygpu.blas as gblas
+from pygpu.gpuarray import (GpuArrayException, UnsupportedException)
+
+def guard_devsup_blasdouble(func):
+    def f(*args, **kwargs):
+        try:
+            func(*args, **kwargs)
+        except UnsupportedException as e:
+            raise SkipTest("operation not supported")
+        except GpuArrayException as e:
+            if 'float64' in args and "does not support double precision" in str(e):
+                raise SkipTest("double precision not supported")
+            raise
+    return f
+
+
+def test_dot():
+    bools = [True, False]
+    for N, dtype, offseted_i, sliced in product(
+            [1, 256, 1337], ['float32', 'float64'], bools, bools):
+        yield dot, N, dtype, offseted_i, sliced, True, False
+    for overwrite, init_z in product(bools, bools):
+        yield dot, 666, 'float32', False, False, overwrite, init_z
+
+
+@guard_devsup_blasdouble
+def dot(N, dtype, offseted_i, sliced, overwrite, init_z):
+    cX, gX = gen_gpuarray((N,), dtype, offseted_inner=offseted_i,
+                          sliced=sliced, ctx=context)
+    cY, gY = gen_gpuarray((N,), dtype, offseted_inner=offseted_i,
+                          sliced=sliced, ctx=context)
+    if init_z:
+        gZ = gen_gpuarray((), dtype, offseted_inner=offseted_i,
+                          sliced=sliced, ctx=context)[1]
+    else:
+        gZ = None
+
+    # Always check against double precision: scipy's single precision
+    # has enough error that this sometimes fails when we're closer
+    cr = fblas.ddot(cX, cY)
+    gr = gblas.dot(gX, gY, gZ, overwrite_z=overwrite)
+    numpy.testing.assert_allclose(cr, numpy.asarray(gr), rtol=1e-6)
+
 
 def test_gemv():
-    for shape in [(100, 128), (128, 50)]:
-        for order in ['f', 'c']:
-            for trans in [False, True]:
-                for offseted_i in [True, False]:
-                    for sliced in [1, 2, -1, -2]:
-                        yield gemv, shape, 'float32', order, trans, \
-                            offseted_i, sliced, True, False
-    for overwrite in [True, False]:
-        for init_y in [True, False]:
-            yield gemv, (4, 3), 'float32', 'f', False, False, 1, \
-                overwrite, init_y
+    bools = [False, True]
+    for shape, order, trans, offseted_i, sliced in product(
+            [(100, 128), (128, 50)], 'fc', bools, bools, [1, 2, -1, -2]):
+        yield (gemv, shape, 'float32', order, trans,
+               offseted_i, sliced, True, False)
+    for overwrite, init_y in product(bools, bools):
+        yield (gemv, (4, 3), 'float32', 'f', False, False, 1,
+               overwrite, init_y)
     yield gemv, (32, 32), 'float64', 'f', False, False, 1, True, False
-    for alpha in [0, 1, -1, 0.6]:
-        for beta in [0, 1, -1, 0.6]:
-            for overwite in [True, False]:
-                yield gemv, (32, 32), 'float32', 'f', False, False, 1, \
-                    overwrite, True, alpha, beta
+    for alpha, beta, overwrite in product(
+            [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools):
+        yield (gemv, (32, 32), 'float32', 'f', False, False, 1,
+               overwrite, True, alpha, beta)
 
 
-@guard_devsup
+@guard_devsup_blasdouble
 def gemv(shp, dtype, order, trans, offseted_i, sliced,
-          overwrite, init_y, alpha=1.0, beta=0.0):
+         overwrite, init_y, alpha=1.0, beta=0.0):
     cA, gA = gen_gpuarray(shp, dtype, order=order, offseted_inner=offseted_i,
                           sliced=sliced, ctx=context)
     if trans:
@@ -65,40 +104,34 @@ def gemv(shp, dtype, order, trans, offseted_i, sliced,
 
 
 def test_gemm():
-    for m, n, k in [(48, 15, 32), (15, 32, 48)]:
-        for order in [('f', 'f', 'f'), ('c', 'c', 'c'),
-                      ('f', 'f', 'c'), ('f', 'c', 'f'),
-                      ('f', 'c', 'c'), ('c', 'f', 'f'),
-                      ('c', 'f', 'c'), ('c', 'c', 'f')]:
-            for trans in [(False, False), (True, True),
-                          (False, True), (True, False)]:
-                for offseted_o in [False, True]:
-                    yield gemm, m, n, k, 'float32', order, trans, \
-                        offseted_o, 1, False, False
-    for sliced in [1, 2, -1, -2]:
-        for overwrite in [True, False]:
-            for init_res in [True, False]:
-                yield gemm, 4, 3, 2, 'float32', ('f', 'f', 'f'), \
-                    (False, False), False, sliced, overwrite, init_res
-    yield gemm, 32, 32, 32, 'float64', ('f', 'f', 'f'), (False, False), \
-        False, 1, False, False
-    for alpha in [0, 1, -1, 0.6]:
-        for beta in [0, 1, -1, 0.6]:
-            for overwrite in [True, False]:
-                yield gemm, 32, 23, 32, 'float32', ('f', 'f', 'f'), \
-                    (False, False), False, 1, overwrite, True, alpha, beta
-
-@guard_devsup
+    bools = [False, True]
+    for (m, n, k), order, trans, offseted_o in product(
+        [(48, 15, 32), (15, 32, 48)], list(product(*['fc']*3)),
+            list(product(bools, bools)), bools):
+        yield (gemm, m, n, k, 'float32', order, trans,
+               offseted_o, 1, False, False)
+    for sliced, overwrite, init_res in product([1, 2, -1, -2], bools, bools):
+        yield (gemm, 4, 3, 2, 'float32', ('f', 'f', 'f'),
+               (False, False), False, sliced, overwrite, init_res)
+    yield (gemm, 32, 32, 32, 'float64', ('f', 'f', 'f'), (False, False),
+           False, 1, False, False)
+    for alpha, beta, overwrite in product(
+            [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools):
+        yield (gemm, 32, 23, 32, 'float32', ('f', 'f', 'f'),
+               (False, False), False, 1, overwrite, True, alpha, beta)
+
+
+@guard_devsup_blasdouble
 def gemm(m, n, k, dtype, order, trans, offseted_o, sliced, overwrite,
          init_res, alpha=1.0, beta=0.0):
     if trans[0]:
-        shpA = (k,m)
+        shpA = (k, m)
     else:
-        shpA = (m,k)
+        shpA = (m, k)
     if trans[1]:
-        shpB = (n,k)
+        shpB = (n, k)
     else:
-        shpB = (k,n)
+        shpB = (k, n)
 
     cA, gA = gen_gpuarray(shpA, dtype, order=order[0],
                           offseted_outer=offseted_o,
@@ -107,7 +140,7 @@ def gemm(m, n, k, dtype, order, trans, offseted_o, sliced, overwrite,
                           offseted_outer=offseted_o,
                           sliced=sliced, ctx=context)
     if init_res:
-        cC, gC = gen_gpuarray((m,n), dtype, order=order[2], ctx=context)
+        cC, gC = gen_gpuarray((m, n), dtype, order=order[2], ctx=context)
     else:
         cC, gC = None, None
 
@@ -124,20 +157,15 @@ def gemm(m, n, k, dtype, order, trans, offseted_o, sliced, overwrite,
 
 
 def test_ger():
-    for m, n in [(4, 5)]:
-        for order in ['f', 'c']:
-            for sliced_x in [1, 2, -2, -1]:
-                for sliced_y in [1, 2, -2, -1]:
-                    yield ger, m, n, 'float32', order, sliced_x, sliced_y, \
-                        False
-
+    bools = [False, True]
+    for (m, n), order, sliced_x, sliced_y in product(
+            [(4, 5)], 'fc', [1, 2, -2, -1], [1, 2, -2, -1]):
+        yield ger, m, n, 'float32', order, sliced_x, sliced_y, False
     yield ger, 4, 5, 'float64', 'f', 1, 1, False
+    for init_res, overwrite in product(bools, bools):
+        yield ger, 4, 5, 'float32', 'f', 1, 1, init_res, overwrite
 
-    for init_res in [True, False]:
-        for overwrite in [True, False]:
-            yield ger, 4, 5, 'float32', 'f', 1, 1, init_res, overwrite
-
-
+@guard_devsup_blasdouble
 def ger(m, n, dtype, order, sliced_x, sliced_y, init_res, overwrite=False):
     cX, gX = gen_gpuarray((m,), dtype, order, sliced=sliced_x, ctx=context)
     cY, gY = gen_gpuarray((n,), dtype, order, sliced=sliced_y, ctx=context)
@@ -155,3 +183,61 @@ def ger(m, n, dtype, order, sliced_x, sliced_y, init_res, overwrite=False):
     gr = gblas.ger(1.0, gX, gY, gA, overwrite_a=overwrite)
 
     numpy.testing.assert_allclose(cr, numpy.asarray(gr), rtol=1e-6)
+
+
+def test_rgemmBatch_3d():
+    bools = [False, True]
+    for b, (m, n, k), order, trans, offseted_o in product(
+        [1, 17, 31], [(24, 7, 16), (7, 16, 24)],
+        list(product('fc', 'fc', 'c')),
+            list(product(bools, bools)), bools):
+        yield (rgemmBatch_3d, b, m, n, k, 'float32', order, trans,
+               offseted_o, 1, False, False)
+    for sliced, overwrite, init_res in product([1, 2, -1, -2], bools, bools):
+        yield (rgemmBatch_3d, 5, 4, 3, 2, 'float32', ('f', 'f', 'c'),
+               (False, False), False, sliced, overwrite, init_res)
+    yield (rgemmBatch_3d, 16, 16, 16, 16, 'float64', ('f', 'f', 'c'),
+           (False, False), False, 1, False, False)
+    for alpha, beta, overwrite in product(
+            [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools):
+        yield (rgemmBatch_3d, 16, 16, 9, 16, 'float32', ('f', 'f', 'c'),
+               (False, False), False, 1, overwrite, True, alpha, beta)
+
+
+@guard_devsup_blasdouble
+def rgemmBatch_3d(b, m, n, k, dtype, order, trans, offseted_o, sliced,
+                  overwrite, init_res, alpha=1.0, beta=0.0):
+    if trans[0]:
+        shpA = (b, k, m)
+    else:
+        shpA = (b, m, k)
+    if trans[1]:
+        shpB = (b, n, k)
+    else:
+        shpB = (b, k, n)
+
+    cA, gA = gen_gpuarray(shpA, dtype, order=order[0],
+                          offseted_outer=offseted_o,
+                          sliced=sliced, ctx=context)
+    cB, gB = gen_gpuarray(shpB, dtype, order=order[1],
+                          offseted_outer=offseted_o,
+                          sliced=sliced, ctx=context)
+    if init_res:
+        cC, gC = gen_gpuarray((b, m, n), dtype, order=order[2], ctx=context)
+    else:
+        cC, gC = None, None
+
+    cr = numpy.empty((b, m, n), dtype=dtype)
+    if dtype == 'float32':
+        fn_gemm_c = fblas.sgemm
+    else:
+        fn_gemm_c = fblas.dgemm
+    for i in range(b):
+        cCi = cC if cC is None else cC[i]
+        cr[i] = fn_gemm_c(alpha, cA[i], cB[i], beta, cCi, trans_a=trans[0],
+                          trans_b=trans[1], overwrite_c=overwrite)
+
+    gr = gblas.gemmBatch_3d(alpha, gA, gB, beta, gC, trans_a=trans[0],
+                            trans_b=trans[1], overwrite_c=overwrite)
+
+    numpy.testing.assert_allclose(cr, numpy.asarray(gr), rtol=1e-5)
diff --git a/pygpu/tests/collectives/test_collectives.py b/pygpu/tests/test_collectives.py
similarity index 96%
rename from pygpu/tests/collectives/test_collectives.py
rename to pygpu/tests/test_collectives.py
index 9754873aab..4e688793d1 100644
--- a/pygpu/tests/collectives/test_collectives.py
+++ b/pygpu/tests/test_collectives.py
@@ -88,13 +88,6 @@ def test_richcmp(self):
         with self.assertRaises(TypeError):
             a = cid2 > "asdfasfa"
 
-    def test_as_buffer(self):
-        a = np.asarray(self.cid)
-        assert np.allclose(a, self.cid.comm_id)
-        a[:] = [ord(b'a')] * COMM_ID_BYTES
-        assert np.allclose(a, self.cid.comm_id)
-
-
 @unittest.skipUnless(MPI_IMPORTED, "Needs mpi4py module")
 @unittest.skipIf(get_user_gpu_rank() == -1, "Collective operations supported on CUDA devices only")
 class TestGpuComm(unittest.TestCase):
@@ -108,7 +101,7 @@ def setUpClass(cls):
         cls.ctx = gpuarray.init("cuda" + str(cls.rank))
         print("*** Collectives testing for", cls.ctx.devname, file=sys.stderr)
         cls.cid = GpuCommCliqueId(context=cls.ctx)
-        cls.mpicomm.Bcast(cls.cid, root=0)
+        cls.mpicomm.Bcast(cls.cid.comm_id, root=0)
         cls.gpucomm = GpuComm(cls.cid, cls.size, cls.rank)
 
     def test_count(self):
@@ -293,19 +286,19 @@ def test_all_gather(self):
 
         a = cpu.reshape((5, 2), order='F')
         exp = texp.reshape((5, 2 * self.size), order='F')
-        gpu = gpuarray.asarray(a, context=self.ctx)
+        gpu = gpuarray.asarray(a, context=self.ctx, order='F')
         resgpu = self.gpucomm.all_gather(gpu, nd_up=0)
         check_all(resgpu, exp)
 
         a = cpu.reshape((5, 2), order='F')
         exp = texp.reshape((5, 2, self.size), order='F')
-        gpu = gpuarray.asarray(a, context=self.ctx)
+        gpu = gpuarray.asarray(a, context=self.ctx, order='F')
         resgpu = self.gpucomm.all_gather(gpu, nd_up=1)
         check_all(resgpu, exp)
 
         a = cpu.reshape((5, 2), order='F')
         exp = texp.reshape((5, 2, 1, 1, self.size), order='F')
-        gpu = gpuarray.asarray(a, context=self.ctx)
+        gpu = gpuarray.asarray(a, context=self.ctx, order='F')
         resgpu = self.gpucomm.all_gather(gpu, nd_up=3)
         check_all(resgpu, exp)
 
diff --git a/pygpu/tests/test_elemwise.py b/pygpu/tests/test_elemwise.py
index d6ab88d9b3..cd0d8006be 100644
--- a/pygpu/tests/test_elemwise.py
+++ b/pygpu/tests/test_elemwise.py
@@ -1,7 +1,12 @@
 import operator
 import numpy
+from mako.template import Template
 
+from unittest import TestCase
 from pygpu import gpuarray, ndgpuarray as elemary
+from pygpu.dtypes import dtype_to_ctype, get_common_dtype
+from pygpu.elemwise import as_argument, ielemwise2
+from pygpu._elemwise import GpuElemwise, arg
 
 from six import PY2
 
@@ -59,10 +64,63 @@ def test_ielemwise2_ops_array():
                 yield ielemwise2_ops_array, op, dtype1, dtype2, (50,)
 
 
+class test_elemwise_output_not_broadcasted(TestCase):
+    def test_all(self):
+        test_values = [((1, 4), (6, 4)), ((2, 1, 8, 7), (2, 2, 8, 7))]
+        for shapea, shapeb in test_values:
+            # Should fail: dimensions are not all equal.
+            self.assertRaises(ValueError, self.run_ielemwise2, shapea, shapeb,
+                              False)
+            # Should fail: broadcast should not be done on output.
+            self.assertRaises(ValueError, self.run_ielemwise2, shapea, shapeb,
+                              True)
+            # Should fail: dimensions are not all equal.
+            self.assertRaises(ValueError, self.check_elemwise2, shapeb, shapeb,
+                              shapea, False)
+            # Should fail: broadcast should not be done on output.
+            self.assertRaises(ValueError, self.check_elemwise2, shapeb, shapeb,
+                              shapea, True)
+            # Should pass: broadcast would be done on read-only input.
+            self.run_ielemwise2(shapeb, shapea, broadcast=True)
+            # Should pass: broadcast would be done on read-only inputs.
+            self.check_elemwise2(shapea, shapea, shapeb, broadcast=True)
+            self.check_elemwise2(shapea, shapeb, shapeb, broadcast=True)
+            self.check_elemwise2(shapeb, shapea, shapeb, broadcast=True)
+
+    @guard_devsup
+    def run_ielemwise2(self, shapea, shapeb, broadcast=True):
+        na, ga = gen_gpuarray(shapea, ctx=context, cls=elemary)
+        nb, gb = gen_gpuarray(shapeb, ctx=context, cls=elemary)
+        ielemwise2(ga, '+', gb, broadcast=broadcast)
+        na += nb
+        assert numpy.allclose(na, numpy.asarray(ga), atol=1e-6)
+
+    @guard_devsup
+    def check_elemwise2(self, shapea, shapeb, output_shape, broadcast=True):
+        # We rewrite this version of elemwise2 to skip the scaling of output
+        # that is done in the official elemwise2 function.
+        na, ga = gen_gpuarray(shapea, ctx=context, cls=elemary)
+        nb, gb = gen_gpuarray(shapeb, ctx=context, cls=elemary)
+        odtype = get_common_dtype(ga, gb, True)
+        res = gpuarray.empty(output_shape, dtype=odtype, context=ga.context,
+                             cls=ga.__class__)
+        a_arg = as_argument(ga, 'a', read=True)
+        b_arg = as_argument(gb, 'b', read=True)
+        res_arg = as_argument(res, 'res', write=True)
+        args = [res_arg, a_arg, b_arg]
+        oper = "res = (%(out_t)s)a %(op)s (%(out_t)s)b" % {
+            'op': '+', 'out_t': dtype_to_ctype(odtype)}
+        k = GpuElemwise(ga.context, oper, args, convert_f16=True)
+        k(res, ga, gb, broadcast=broadcast)
+        nres = na + nb
+        assert numpy.allclose(nres, numpy.asarray(res), atol=1e-6)
+
+
 @guard_devsup
 def elemwise2_ops_array(op, dtype1, dtype2, shape):
     ac, ag = gen_gpuarray(shape, dtype1, ctx=context, cls=elemary)
-    bc, bg = gen_gpuarray(shape, dtype2, nozeros=True, ctx=context, cls=elemary)
+    bc, bg = gen_gpuarray(shape, dtype2, nozeros=True, ctx=context,
+                          cls=elemary)
 
     out_c = op(ac, bc)
     out_g = op(ag, bg)
@@ -247,3 +305,42 @@ def broadcast(shapea, shapeb):
     rg = ag + bg
 
     check_meta_content(rg, rc)
+
+
+_inf_preamb_tpl = Template('''
+WITHIN_KERNEL ${flt}
+infinity() {return INFINITY;}
+
+WITHIN_KERNEL ${flt}
+neg_infinity() {return -INFINITY;}
+''')
+
+
+def test_infinity():
+    for dtype in ['float32', 'float64']:
+        yield infinity, dtype
+
+
+@guard_devsup
+def infinity(dtype):
+    ac, ag = gen_gpuarray((2,), dtype, ctx=context, cls=elemary)
+    out_g = ag._empty_like_me()
+    flt = 'ga_float' if dtype == 'float32' else 'ga_double'
+    out_arg = arg('out', out_g.dtype, scalar=False, read=False, write=True)
+    preamble = _inf_preamb_tpl.render(flt=flt)
+
+    # +infinity
+    ac[:] = numpy.inf
+    expr_inf = 'out = infinity()'
+    kernel = GpuElemwise(context, expr_inf, [out_arg],
+                         preamble=preamble)
+    kernel(out_g)
+    assert numpy.array_equal(ac, numpy.asarray(out_g))
+
+    # -infinity
+    ac[:] = -numpy.inf
+    expr_neginf = 'out = neg_infinity()'
+    kernel = GpuElemwise(context, expr_neginf, [out_arg],
+                         preamble=preamble)
+    kernel(out_g)
+    assert numpy.array_equal(ac, numpy.asarray(out_g))
diff --git a/pygpu/tests/test_gpu_ndarray.py b/pygpu/tests/test_gpu_ndarray.py
index 6685fc8274..ef44cbcd41 100644
--- a/pygpu/tests/test_gpu_ndarray.py
+++ b/pygpu/tests/test_gpu_ndarray.py
@@ -8,8 +8,9 @@
 
 import numpy
 
+from nose.tools import assert_raises
 import pygpu
-from pygpu.gpuarray import GpuArray, GpuContext, GpuKernel
+from pygpu.gpuarray import GpuArray, GpuKernel
 
 from .support import (guard_devsup, check_meta, check_flags, check_all,
                       check_content, gen_gpuarray, context as ctx, dtypes_all,
@@ -40,12 +41,18 @@ def test_hash():
     g = pygpu.empty((2, 3), context=ctx)
     exc = None
     try:
-        h = hash(g)
+        hash(g)
     except TypeError as e:
         exc = e
     assert exc is not None
 
 
+def test_bool():
+    for data in [numpy.empty((0, 33)), [[1]], [[0]], [], [0], [1], 0, 1]:
+        assert (bool(pygpu.asarray(data, context=ctx)) ==
+                bool(numpy.asarray(data)))
+
+
 def test_transfer():
     for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]:
         for dtype in dtypes_all:
@@ -55,6 +62,8 @@ def test_transfer():
 
 def transfer(shp, dtype, offseted):
     a, b = gen_gpuarray(shp, dtype, offseted, ctx=ctx)
+    # Test that passing dtype doesn't break.
+    c = numpy.asarray(b, dtype=dtype)
     c = numpy.asarray(b)
 
     assert numpy.allclose(c, a)
@@ -63,12 +72,14 @@ def transfer(shp, dtype, offseted):
     assert a.dtype == b.dtype == c.dtype == dtype
     assert c.flags.c_contiguous
 
+
 def test_cast():
     for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]:
         for dtype1 in dtypes_no_complex:
             for dtype2 in dtypes_no_complex:
                     yield cast, shp, dtype1, dtype2
 
+
 @guard_devsup
 def cast(shp, dtype1, dtype2):
     a, b = gen_gpuarray(shp, dtype1, False, ctx=ctx)
@@ -89,8 +100,9 @@ def test_transfer_not_contiguous():
 @guard_devsup
 def transfer_not_contiguous(shp, dtype):
     a = numpy.random.rand(*shp) * 10
-    a = a[::-1]
     b = pygpu.array(a, context=ctx)
+    a = a[::-1]
+    b = b[::-1]
     c = numpy.asarray(b)
 
     assert numpy.allclose(c, a)
@@ -110,11 +122,12 @@ def test_transfer_fortran():
 @guard_devsup
 def transfer_fortran(shp, dtype):
     a = numpy.random.rand(*shp) * 10
+    b = pygpu.array(a, context=ctx)
     a_ = numpy.asfortranarray(a)
     if len(shp) > 1:
         assert a_.strides != a.strides
     a = a_
-    b = pygpu.array(a, context=ctx)
+    b = pygpu.asfortranarray(b)
     c = numpy.asarray(b)
 
     assert a.shape == b.shape == c.shape
@@ -132,8 +145,8 @@ def test_ascontiguousarray():
                 for offseted_i in [True, True]:
                     for sliced in [1, 2, -1, -2]:
                         for order in ['f', 'c']:
-                            yield ascontiguousarray, shp, dtype, offseted_o, \
-                                offseted_i, sliced, order
+                            yield (ascontiguousarray, shp, dtype, offseted_o,
+                                   offseted_i, sliced, order)
 
 
 @guard_devsup
@@ -145,8 +158,7 @@ def ascontiguousarray(shp, dtype, offseted_o, offseted_i, sliced, order):
     b = pygpu.ascontiguousarray(gpu)
 
     # numpy upcast with a view to 1d scalar.
-    if (sliced != 1 or shp == () or
-        (offseted_i and len(shp) > 1)):
+    if (sliced != 1 or shp == () or (offseted_i and len(shp) > 1)):
         assert b is not gpu
         if sliced == 1 and not offseted_i:
             assert (a.data is cpu.data) == (b.bytes is gpu.bytes)
@@ -169,8 +181,8 @@ def test_asfortranarray():
                 for offseted_inner in [True, False]:
                     for sliced in [1, 2, -1, -2]:
                         for order in ['f', 'c']:
-                            yield asfortranarray, shp, dtype, offseted_outer, \
-                                offseted_inner, sliced, order
+                            yield (asfortranarray, shp, dtype, offseted_outer,
+                                   offseted_inner, sliced, order)
 
 
 @guard_devsup
@@ -183,9 +195,9 @@ def asfortranarray(shp, dtype, offseted_outer, offseted_inner, sliced, order):
 
     # numpy upcast with a view to 1d scalar.
     if gpu.flags['F_CONTIGUOUS']:
-        assert b.gpudata == gpu.gpudata
+        assert ctx.kind != b'cuda' or b.gpudata == gpu.gpudata
     elif (sliced != 1 or shp == () or (offseted_outer and len(shp) > 1) or
-        (order != 'f' and len(shp) > 1)):
+          (order != 'f' and len(shp) > 1)):
         assert b is not gpu
     else:
         assert b is gpu
@@ -251,7 +263,7 @@ def empty(shp, order, dtype):
 
 
 def test_empty_no_dtype():
-    x = pygpu.empty((), context=ctx)# no dtype and order param
+    x = pygpu.empty((), context=ctx)  # no dtype and order param
     y = numpy.empty(())
     check_meta(x, y)
 
@@ -265,7 +277,7 @@ def test_empty_no_params():
 
 
 def test_mapping_getitem_ellipsis():
-    for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]:
+    for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]:
         for dtype in dtypes_all:
             for offseted in [True, False]:
                 yield mapping_getitem_ellipsis, shp, dtype, offseted
@@ -274,19 +286,43 @@ def test_mapping_getitem_ellipsis():
 def mapping_getitem_ellipsis(shp, dtype, offseted):
     a, a_gpu = gen_gpuarray(shp, dtype, offseted, ctx=ctx)
     b = a_gpu[...]
-    assert b.gpudata == a_gpu.gpudata
+    if ctx.kind == b'cuda':
+        assert b.gpudata == a_gpu.gpudata
     assert b.strides == a.strides
     assert b.shape == a.shape
     b_cpu = numpy.asarray(b)
     assert numpy.allclose(a, b_cpu)
 
 
-def test_mapping_setitem_ellipsis():
+def test_getitem_none():
+    for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]:
+        yield getitem_none, shp
+
+
+def getitem_none(shp):
+    a, a_gpu = gen_gpuarray(shp, ctx=ctx)
+
+    assert numpy.allclose(a_gpu[..., None], a[..., None])
+
+    for _ in range(5):
+        # Choose something to slice with, always works
+        indcs = tuple(numpy.random.choice([0, slice(None), slice(1, None)],
+                                          size=len(shp)))
+        indcs = indcs[:1] + (None,) + indcs[1:]
+        assert numpy.allclose(a_gpu[indcs], a[indcs])
+
+    if shp:
+        assert numpy.allclose(a_gpu[1:, None], a[1:, None])
+
+
+def test_mapping_setitem():
     for shp in [(9,), (8, 9), (4, 8, 9), (1, 8, 9)]:
         for dtype in dtypes_all:
             for offseted in [True, False]:
                 yield mapping_setitem_ellipsis, shp, dtype, offseted
                 yield mapping_setitem_ellipsis2, shp, dtype, offseted
+                yield mapping_setitem_firstaxis, shp, dtype, offseted
+
 
 @guard_devsup
 def mapping_setitem_ellipsis(shp, dtype, offseted):
@@ -295,13 +331,24 @@ def mapping_setitem_ellipsis(shp, dtype, offseted):
     a_gpu[...] = 2
     assert numpy.allclose(a, numpy.asarray(a_gpu))
 
+
 @guard_devsup
 def mapping_setitem_ellipsis2(shp, dtype, offseted):
     a, a_gpu = gen_gpuarray(shp, dtype, offseted, ctx=ctx)
     b, b_gpu = gen_gpuarray(shp[1:], dtype, False, ctx=ctx)
     a[:] = b
     a_gpu[:] = b_gpu
-    assert numpy.allclose(a, numpy.asarray(b_gpu))
+    assert numpy.allclose(a, numpy.asarray(a_gpu))
+
+
+@guard_devsup
+def mapping_setitem_firstaxis(shp, dtype, offseted):
+    a, a_gpu = gen_gpuarray(shp, dtype, offseted, ctx=ctx)
+    b, b_gpu = gen_gpuarray(shp[1:], dtype, False, ctx=ctx)
+    a[0] = b
+    a_gpu[0] = b_gpu
+    assert numpy.allclose(a, numpy.asarray(a_gpu))
+
 
 class WriteReadTest(unittest.TestCase):
     def setUp(self):
@@ -359,6 +406,7 @@ def test_read(self):
         self.cpu = numpy.ndarray((3, 4, 2, 5), dtype="float32", order='C')
         self.assertRaises(ValueError, self.gpu.read, self.cpu[:, :, 0, :])
 
+
 def test_copy_view():
     for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]:
         for dtype in dtypes_all:
@@ -371,12 +419,13 @@ def test_copy_view():
 
 
 def check_memory_region(a, a_op, b, b_op):
-    assert numpy.may_share_memory(a, a_op) == \
-        pygpu.gpuarray.may_share_memory(b, b_op)
+    assert (numpy.may_share_memory(a, a_op) ==
+            pygpu.gpuarray.may_share_memory(b, b_op))
+
 
 @guard_devsup
 def copy_view(shp, dtype, offseted, order1, order2):
-    #TODO test copy unbroadcast!
+    # TODO test copy unbroadcast!
     a, b = gen_gpuarray(shp, dtype, offseted, order=order1, ctx=ctx)
 
     assert numpy.allclose(a, numpy.asarray(b))
@@ -416,10 +465,10 @@ def test_shape():
                  ((4, 3), (12, -1)), ((4, 3), (-1, 12)),
                  ((5, 4, 3, 2), (2, -1, 12)), ((4, 2), (2, 2, -1)),
                  # ((4, 3), (13, -1)),
-    ]:
+                 ]:
         for offseted in [True, False]:
             for order1 in ['c', 'f']:
-                if not -1 in shps[1]:
+                if -1 not in shps[1]:
                     yield shape_, shps, offseted, order1
                 for order2 in ['a', 'c', 'f']:
                     yield reshape, shps, offseted, order1, order2
@@ -448,6 +497,32 @@ def reshape(shps, offseted, order1, order2):
     assert numpy.allclose(outc, numpy.asarray(outg))
 
 
+def test_strides():
+    yield strides_, (4, 4), 'c', 1, (4, 4)
+    yield strides_, (4, 4), 'c', 1, (4, 16)
+    yield strides_, (4, 4), 'c', 1, (16, 4)
+    yield strides_, (4, 4), 'c', 1, (16, 8)
+    yield strides_, (4, 4), 'c', 1, (16, 0)
+    yield strides_, (4, 4), 'c', -1, (-20, 4)
+    yield strides_, (4, 4), 'c', -1, (-12, 4)
+
+
+def set_strides(a, newstr):
+    a.strides = newstr
+
+
+def strides_(shp, order, sliced, newstr):
+    ac, ag = gen_gpuarray(shp, 'float32', sliced=sliced, order=order, ctx=ctx)
+    try:
+        ac.strides = newstr
+    except ValueError:
+        assert_raises(ValueError, set_strides, ag, newstr)
+        return
+    ag.strides = newstr
+    check_flags(ag, ac)
+    assert numpy.allclose(ac, numpy.asarray(ag))
+
+
 def test_transpose():
     for shp in [(2, 3), (4, 8, 9), (1, 2, 3, 4)]:
         for offseted in [True, False]:
@@ -455,7 +530,8 @@ def test_transpose():
                 for sliced in [1, 2, -2, -1]:
                     yield transpose, shp, offseted, sliced, order
                     for perm in permutations(list(range(len(shp)))):
-                        yield transpose_perm, shp, perm, offseted, sliced, order
+                        yield (transpose_perm, shp, perm, offseted, sliced,
+                               order)
 
 
 def transpose(shp, offseted, sliced, order):
@@ -527,8 +603,6 @@ def mapping_getitem_w_int(dtype, offseted):
     dim = (2,)
     a, _a = gen_gpuarray(dim, dtype, offseted, ctx=ctx)
 
-    import sys
-    init_ref_count = sys.getrefcount(_a)
     _cmp(_a[...], a[...])
     _cmp(_a[...], a[...])
     _cmp(_a[...], a[...])
@@ -560,19 +634,19 @@ def mapping_getitem_w_int(dtype, offseted):
     _cmpf(_a, (10, 0, 0, 0))
     _cmpf(_a, -10)
 
-    #test with integer
+    # test with integer
     _cmp(_a[1], a[1])
     _cmp(_a[-1], a[-1])
     _cmp(_a[numpy.int64(1)], a[numpy.int64(1)])
     _cmp(_a[numpy.int64(-1)], a[numpy.int64(-1)])
 
-    #test with slice
+    # test with slice
     _cmp(_a[1:], a[1:])
     _cmp(_a[1:2], a[1:2])
     _cmp(_a[-1:1], a[-1:1])
     _cmp(_a[6:7:], a[6:7:])
 
-    #test with tuple (mix slice, integer, numpy.int64)
+    # test with tuple (mix slice, integer, numpy.int64)
     _cmpNs(_a[0, 0, ::numpy.int64(-1), ::-1], a[0, 0, ::-1, ::-1])
     _cmpNs(_a[:, :, ::numpy.int64(-1), ::-1], a[:, :, ::-1, ::-1])
     _cmpNs(_a[:, :, numpy.int64(1), -1], a[:, :, 1, -1])
@@ -586,11 +660,11 @@ def mapping_getitem_w_int(dtype, offseted):
     _cmpNs(_a[0, ::-2, -1], a[0, ::-2, -1])
     _cmp(_a[-1, -1, -1, -2], a[-1, -1, -1, -2])
 
-    #test ellipse
+    # test ellipse
     _cmp(_a[...], a[...])
 
 
-def _cmp(x,y):
+def _cmp(x, y):
     assert isinstance(x, GpuArray)
     assert x.shape == y.shape
     assert x.dtype == y.dtype
@@ -691,6 +765,7 @@ def test_flags():
               'carray', 'forc', 'fnc', 'farray']:
         yield flag_prop, p
 
+
 def flag_dict(fl):
     c2, g2 = gen_gpuarray((2, 3), dtype='float32', ctx=ctx, order='c')
     c3, g3 = gen_gpuarray((2, 3), dtype='float32', ctx=ctx, order='f')
@@ -698,6 +773,7 @@ def flag_dict(fl):
     assert c2.flags[fl] == g2.flags[fl]
     assert c3.flags[fl] == g3.flags[fl]
 
+
 def flag_prop(p):
     c2, g2 = gen_gpuarray((2, 3), dtype='float32', ctx=ctx, order='c')
     c3, g3 = gen_gpuarray((2, 3), dtype='float32', ctx=ctx, order='f')
@@ -738,7 +814,9 @@ def test_GpuContext(self):
             pickle.dumps(ctx, protocol=-1)
 
     def test_GpuKernel(self):
-        k = GpuKernel("KERNEL void nothing(GLOBAL_MEM ga_float *in) {in[0] = 0;}", "nothing", [], context=ctx)
+        k = GpuKernel("#include \"cluda.h\"\nKERNEL void "
+                      "k(GLOBAL_MEM ga_float *in)"
+                      "{in[0] = 0;}", "k", [], context=ctx)
         with self.assertRaises(RuntimeError):
             pickle.dumps(k)
         with self.assertRaises(RuntimeError):
diff --git a/pygpu/tests/test_operations.py b/pygpu/tests/test_operations.py
index abf5e65d90..751450f638 100644
--- a/pygpu/tests/test_operations.py
+++ b/pygpu/tests/test_operations.py
@@ -1,7 +1,7 @@
 import numpy
 import pygpu
 
-from .support import (gen_gpuarray, context)
+from .support import (gen_gpuarray, context, SkipTest)
 
 
 def test_array_split():
@@ -21,11 +21,15 @@ def test_array_split():
     for pc, pg in zip(rc, rg):
         numpy.testing.assert_allclose(pc, numpy.asarray(pg))
 
+
 def test_split():
     for spl in (3, [3, 5, 6, 10]):
         yield xsplit, '', (9,), spl
 
+
 def test_xsplit():
+    if tuple(int(v) for v in numpy.version.version.split('.')[:2]) < (1, 11):
+        raise SkipTest("Numpy version too old")
     for l in ('h', 'v'):
         for spl in (2, [3, 6]):
             yield xsplit, l, (4, 4), spl
@@ -33,6 +37,7 @@ def test_xsplit():
     for spl in (2, [3, 6]):
         yield xsplit, 'd', (2, 2, 4), spl
 
+
 def xsplit(l, shp, spl):
     xc, xg = gen_gpuarray(shp, 'float32', ctx=context)
     n = l + 'split'
diff --git a/pygpu/tests/test_reduction.py b/pygpu/tests/test_reduction.py
index e24cf9b11e..fbb6882d94 100644
--- a/pygpu/tests/test_reduction.py
+++ b/pygpu/tests/test_reduction.py
@@ -1,5 +1,7 @@
 import numpy
 
+from nose.tools import assert_raises
+
 from pygpu import gpuarray, ndgpuarray as elemary
 from pygpu.reduction import ReductionKernel
 
@@ -50,8 +52,13 @@ def test_red_big_array():
                   [False, True, False]]:
         yield red_array_sum, 'float32', (2000, 30, 100), redux
 
-
+# this test needs a guard_devsup because Python 'float' is double,
+# and placing one directly on a test_* makes nose not know that it's a test
 def test_red_broadcast():
+    red_broadcast()
+
+@guard_devsup
+def red_broadcast():
     from pygpu.tools import as_argument
 
     dtype = float
@@ -70,7 +77,9 @@ def test_red_broadcast():
         nz = numpy.apply_along_axis(sum, ax, nz).astype(dtype)
 
     args = [as_argument(gx, 'a'), as_argument(gy, 'b')]
-    gz = ReductionKernel(context, dtype, "0", "a+b", redux, map_expr="a[i]*b[i]", arguments=args)(gx, gy, broadcast=True)
+    gz = ReductionKernel(context, dtype, "0", "a+b", redux,
+                         map_expr="a[i]*b[i]", arguments=args)(
+        gx, gy, broadcast=True)
 
     assert numpy.allclose(nz, numpy.asarray(gz))
 
@@ -83,6 +92,7 @@ def test_reduction_ops():
                 yield reduction_op, op, dtype, axis
 
 
+@guard_devsup
 def reduction_op(op, dtype, axis):
     c, g = gen_gpuarray((2, 3), dtype=dtype, ctx=context, cls=elemary)
 
@@ -130,3 +140,9 @@ def test_reduction_0d():
     rg = g.all()
 
     assert numpy.all(rc == numpy.asarray(rg))
+
+
+def test_reduction_f16():
+    c, g = gen_gpuarray((3,), dtype='float16', ctx=context, cls=elemary)
+
+    assert_raises(NotImplementedError, g.sum)
diff --git a/pygpu/tests/test_tools.py b/pygpu/tests/test_tools.py
index fafc835fcf..640b5685a6 100644
--- a/pygpu/tests/test_tools.py
+++ b/pygpu/tests/test_tools.py
@@ -1,27 +1,6 @@
-from pygpu.tools import (as_argument, Argument, ArrayArg, ScalarArg,
-                         check_contig, check_args, Counter, lfu_cache)
+from pygpu.tools import check_args
 
-
-from .support import (guard_devsup, rand, check_flags, check_meta, check_all,
-                      context, gen_gpuarray, dtypes_no_complex)
-
-
-def test_check_contig_1():
-    ac, ag = gen_gpuarray((50, 1, 20), 'float32', ctx=context)
-    bc, bg = gen_gpuarray((50, 1, 20), 'float32', ctx=context)
-    n, offsets, contig = check_contig((ag, bg))
-    assert n == 1000
-    assert offsets == (0, 0)
-    assert contig
-
-
-def test_check_contig_2():
-    ac, ag = gen_gpuarray((50, 1, 20), 'float32', ctx=context)
-    bc, bg = gen_gpuarray((50, 1, 20), 'float32', ctx=context, sliced=2)
-    n, offsets, contig = check_contig((ag, bg))
-    assert n == None
-    assert offsets == None
-    assert not contig
+from .support import context, gen_gpuarray
 
 
 def test_check_args_simple():
@@ -120,7 +99,7 @@ def test_check_args_broadcast_2():
                           offseted_inner=True)
     bc, bg = gen_gpuarray((50, 1, 20), 'float32', ctx=context)
     n, nd, dims, strs, offsets = check_args((ag, bg), collapse=True,
-                                                    broadcast=True)
+                                            broadcast=True)
     assert n == 1000
     assert nd == 2
     assert dims == (50, 20)
diff --git a/pygpu/tools.py b/pygpu/tools.py
index 30a336a42c..5e877e7893 100644
--- a/pygpu/tools.py
+++ b/pygpu/tools.py
@@ -10,7 +10,7 @@
 from .dtypes import dtype_to_ctype, _fill_dtype_registry
 from .gpuarray import GpuArray
 
-_fill_dtype_registry(respect_windows=False)
+_fill_dtype_registry()
 
 
 def as_argument(obj, name):
@@ -65,29 +65,6 @@ def spec(self):
         return self.dtype
 
 
-def check_contig(args):
-    dims = None
-    c_contig = f_contig = True
-    offsets = []
-    for arg in args:
-        if not isinstance(arg, GpuArray):
-            offsets.append(None)
-            continue
-
-        if dims is None:
-            dims = arg.shape
-            n = arg.size
-        elif arg.shape != dims:
-            return None, None, False
-        offsets.append(arg.offset)
-        fl = arg.flags
-        c_contig = c_contig and fl['C_CONTIGUOUS']
-        f_contig = f_contig and fl['F_CONTIGUOUS']
-    if not (c_contig or f_contig):
-        return None, None, False
-    return n, tuple(offsets), True
-
-
 def check_args(args, collapse=False, broadcast=False):
     """
     Returns the properties of arguments and checks if they all match
@@ -191,58 +168,6 @@ def check_args(args, collapse=False, broadcast=False):
     return n, nd, dims, tuple(strs), tuple(offsets)
 
 
-class Counter(dict):
-    'Mapping where default values are zero'
-    def __missing__(self, key):
-        return 0
-
-
-def lfu_cache(maxsize=20):
-    def decorating_function(user_function):
-        cache = {}
-        use_count = Counter()
-
-        @functools.wraps(user_function)
-        def wrapper(*key):
-            use_count[key] += 1
-
-            try:
-                result = cache[key]
-                wrapper.hits += 1
-            except KeyError:
-                result = user_function(*key)
-                cache[key] = result
-                wrapper.misses += 1
-
-                # purge least frequently used cache entry
-                if len(cache) > wrapper.maxsize:
-                    for key, _ in nsmallest(wrapper.maxsize // 10,
-                                            six.iteritems(use_count),
-                                            key=itemgetter(1)):
-                        del cache[key], use_count[key]
-
-            return result
-
-        def clear():
-            cache.clear()
-            use_count.clear()
-            wrapper.hits = wrapper.misses = 0
-
-        @functools.wraps(user_function)
-        def get(*key):
-            result = cache[key]
-            use_count[key] += 1
-            wrapper.hits += 1
-            return result
-
-        wrapper.hits = wrapper.misses = 0
-        wrapper.maxsize = maxsize
-        wrapper.clear = clear
-        wrapper.get = get
-        return wrapper
-    return decorating_function
-
-
 def lru_cache(maxsize=20):
     def decorating_function(user_function):
         cache = {}
@@ -252,7 +177,6 @@ def decorating_function(user_function):
         @functools.wraps(user_function)
         def wrapper(*key):
             time[0] += 1
-            last_use[key] = time[0]
 
             try:
                 result = cache[key]
@@ -264,11 +188,12 @@ def wrapper(*key):
 
                 # purge least recently used cache entries
                 if len(cache) > wrapper.maxsize:
-                    for key, _ in nsmallest(wrapper.maxsize // 10,
+                    for key0, _ in nsmallest(wrapper.maxsize // 10,
                                             six.iteritems(last_use),
                                             key=itemgetter(1)):
-                        del cache[key], last_use[key]
+                        del cache[key0], last_use[key0]
 
+            last_use[key] = time[0]
             return result
 
         def clear():
diff --git a/release.txt b/release.txt
new file mode 100644
index 0000000000..6e52ebfd9d
--- /dev/null
+++ b/release.txt
@@ -0,0 +1,11 @@
+Release process:
+- Make sure you are on the proper release branch
+- Make a git tag
+  git tag vX.Y.Z
+- Push to master the commit and the tag
+    git push --tags central master
+  This push will trigger package builds for windows and linux that will
+  be uploaded to the mila-udem conda channel.
+- Add a release on github with a tag in the form of 'vX.Y.Z'
+  https://github.com/Theano/libgpuarray/releases/new
+ - Make note of the major changes since the last release
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000000..2cc056015a
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,7 @@
+[versioneer]
+VCS=git
+style=pep440
+versionfile_source=pygpu/_version.py
+versionfile_build=pygpu/_version.py
+tag_prefix=v
+parentdir_prefix=libgpuarray-
\ No newline at end of file
diff --git a/setup.py b/setup.py
old mode 100644
new mode 100755
index 2e1f85878c..3de2bed819
--- a/setup.py
+++ b/setup.py
@@ -1,13 +1,16 @@
 import sys
 import os
+import versioneer
+import distutils.command.clean
+import shutil
 
 have_cython = False
 
 try:
     import Cython
-    if Cython.__version__ < '0.21':
+    if Cython.__version__ < '0.25':
         raise Exception('cython is too old or not installed '
-                        '(at least 0.21 required)')
+                        '(at least 0.25 required)')
     from Cython.Build import cythonize
     have_cython = True
 except Exception:
@@ -70,23 +73,44 @@ def __init__(self, *args, **kwargs):
 
 include_dirs = [np.get_include()]
 library_dirs = []
-if sys.platform == 'win32':
+if sys.platform == 'win32' and not os.getenv('CONDA_BUILD'):
     # This is a hack so users don't need to do many steps for windows install
     # Just use the default location.
     current_dir = os.path.abspath(os.path.dirname(__file__))
     include_dirs += [os.path.join(current_dir, 'src')]
 
-    default_bin_dir = os.path.join(current_dir, 'lib', 'Release')
+    default_bin_dir = os.path.join(current_dir, 'lib')
     if not os.path.isdir(default_bin_dir):
-        raise RuntimeError('default binary dir {} does not exist, you may need to build the C library in release mode')
+        raise RuntimeError('default binary dir {} does not exist, you may need to build the C library in release mode'.format(default_bin_dir))
     library_dirs += [default_bin_dir]
 
+class cmd_clean(distutils.command.clean.clean):
+    def run(self):
+        import glob
+        with open('.clean', 'r') as f:
+            ignores = f.read()
+            for wildcard in filter(bool, ignores.split('\n')):
+                for filename in glob.glob(wildcard):
+                    try:
+                        os.remove(filename)
+                    except OSError:
+                        shutil.rmtree(filename, ignore_errors=True)
+
+        # It's an old-style class in Python 2.7...
+        distutils.command.clean.clean.run(self)
+
+
+ea = []
+if sys.platform in ('darwin', 'linux'):
+    # Silence unused stuff warnings
+    ea = ["-Wno-unused-variable", "-Wno-unused-function"]
 
 exts = [Extension('pygpu.gpuarray',
                   sources=['pygpu/gpuarray.pyx'],
                   include_dirs=include_dirs,
                   libraries=['gpuarray'],
                   library_dirs=library_dirs,
+                  extra_compile_args=ea,
                   define_macros=[('GPUARRAY_SHARED', None)]
                   ),
         Extension('pygpu.blas',
@@ -94,6 +118,7 @@ def __init__(self, *args, **kwargs):
                   include_dirs=include_dirs,
                   libraries=['gpuarray'],
                   library_dirs=library_dirs,
+                  extra_compile_args=ea,
                   define_macros=[('GPUARRAY_SHARED', None)]
                   ),
         Extension('pygpu._elemwise',
@@ -101,6 +126,7 @@ def __init__(self, *args, **kwargs):
                   include_dirs=include_dirs,
                   libraries=['gpuarray'],
                   library_dirs=library_dirs,
+                  extra_compile_args=ea,
                   define_macros=[('GPUARRAY_SHARED', None)]
                   ),
         Extension('pygpu.collectives',
@@ -108,16 +134,27 @@ def __init__(self, *args, **kwargs):
                   include_dirs=include_dirs,
                   libraries=['gpuarray'],
                   library_dirs=library_dirs,
+                  extra_compile_args=ea,
                   define_macros=[('GPUARRAY_SHARED', None)]
                   )]
 
+cmds = versioneer.get_cmdclass()
+cmds["clean"] = cmd_clean
+
+version_data = versioneer.get_versions()
+
+if version_data['error'] is not None:
+    raise ValueError("Can't determine version for build: %s\n  Please make sure that your git checkout includes tags." % (version_data['error'],))
+
 setup(name='pygpu',
-      version='0.2.1',
+      version=version_data['version'],
+      cmdclass=cmds,
       description='numpy-like wrapper on libgpuarray for GPU computations',
       packages=['pygpu', 'pygpu/tests'],
-      data_files=[('pygpu', ['pygpu/gpuarray.h', 'pygpu/gpuarray_api.h',
-                             'pygpu/blas_api.h', 'pygpu/numpy_compat.h',
-                             'pygpu/collectives.h', 'pygpu/collectives_api.h'])],
+      include_package_data=True,
+      package_data={'pygpu': ['gpuarray.h', 'gpuarray_api.h',
+                              'blas_api.h', 'numpy_compat.h',
+                              'collectives.h', 'collectives_api.h']},
       ext_modules=cythonize(exts),
-      install_requires=['mako>=0.7'],
+      install_requires=['mako>=0.7', 'six'],
       )
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 3bdf2b6be2..b687e5da1a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -5,15 +5,6 @@ if(CMAKE_COMPILER_IS_GNUCC)
   add_definitions(-Wdeclaration-after-statement)
 endif()
 
-find_package(CUDA)
-find_package(OpenCL)
-if(OpenCL_FOUND)
-find_package(clBLAS)
-endif()
-if(CUDA_FOUND)
-find_package(NCCL)
-endif()
-
 include_directories("${CMAKE_CURRENT_SOURCE_DIR}")
 
 add_custom_command(
@@ -23,6 +14,22 @@ add_custom_command(
   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
   DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/gen_types.py)
 
+add_custom_command(
+  OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/cluda_cuda.h.c
+  COMMAND python head.py cluda_cuda.h
+  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/head.py
+          ${CMAKE_CURRENT_SOURCE_DIR}/cluda_cuda.h
+  )
+
+add_custom_command(
+  OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/cluda_opencl.h.c
+  COMMAND python head.py cluda_opencl.h
+  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/head.py
+          ${CMAKE_CURRENT_SOURCE_DIR}/cluda_opencl.h
+  )
+
 macro (set_rel var)
   file (RELATIVE_PATH _relPath "${CMAKE_SOURCE_DIR}/src" "${CMAKE_CURRENT_SOURCE_DIR}")
   # clear previous list (if any)
@@ -44,6 +51,7 @@ endmacro()
 set(_GPUARRAY_SRC
 cache/lru.c
 cache/twoq.c
+cache/disk.c
 gpuarray_types.c
 gpuarray_error.c
 gpuarray_util.c
@@ -56,8 +64,18 @@ gpuarray_array_collectives.c
 gpuarray_kernel.c
 gpuarray_extension.c
 gpuarray_elemwise.c
+gpuarray_reduction.c
+gpuarray_buffer_cuda.c
+gpuarray_blas_cuda_cublas.c
+gpuarray_collectives_cuda_nccl.c
+gpuarray_buffer_opencl.c
+gpuarray_blas_opencl_clblas.c
+gpuarray_blas_opencl_clblast.c
 )
 
+set_property(SOURCE gpuarray_buffer_cuda.c APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cluda_cuda.h.c)
+set_property(SOURCE gpuarray_buffer_opencl.c APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cluda_opencl.h.c)
+
 check_function_exists(strlcat HAVE_STRL)
 check_function_exists(mkstemp HAVE_MKSTEMP)
 
@@ -73,115 +91,43 @@ if(NOT HAVE_MKSTEMP)
   list(APPEND _GPUARRAY_SRC gpuarray_mkstemp.c)
 endif()
 
-if (CUDA_FOUND)
-  if(NCCL_FOUND)
-    if (CUDA_VERSION_MAJOR LESS 7)
-      message( WARNING "This package requires CUDA 7.0 or more (building with NCCL).  Found version ${CUDA_VERSION_STRING}")
-      set(CUDA_FOUND 0)
-    endif()
-  else(NCCL_FOUND)
-    if (CUDA_VERSION_MAJOR LESS 6 OR
-        (CUDA_VERSION_MAJOR EQUAL 6 AND CUDA_VERSION_MINOR EQUAL 0))
-      message( WARNING "This package requires CUDA 6.5 or more.  Found version ${CUDA_VERSION_STRING}")
-      set(CUDA_FOUND 0)
-    endif()
-  endif(NCCL_FOUND)
-endif()
-
-if (CUDA_FOUND)
-  if (APPLE)
-    FIND_LIBRARY(CUDADRV_LIBRARY CUDA)
-    FIND_PATH(CUDADRV_INCLUDE CUDA/cuda.h)
-    # this is somewhat a hack, but otherwise cublas_v2.h isn't found
-    set(CUDADRV_INCLUDE ${CUDADRV_INCLUDE} ${CUDA_TOOLKIT_INCLUDE})
-  endif()
-  if(NOT CUDADRV_LIBRARY)
-    set(CUDADRV_LIBRARY ${CUDA_CUDA_LIBRARY})
-    set(CUDADRV_INCLUDE ${CUDA_TOOLKIT_INCLUDE})
-  endif()
-
-  find_cuda_helper_libs(nvrtc)
-
-  if(CUDA_nvrtc_LIBRARY)
-    message(STATUS "Building with NVRTC")
-    add_definitions(-DWITH_NVRTC)
-    set(CUDADRV_LIBRARY ${CUDADRV_LIBRARY} ${CUDA_nvrtc_LIBRARY})
-  else()
-    add_definitions(-DNVCC_BIN=${CUDA_NVCC_EXECUTABLE})
-  endif()
-
-  list(APPEND _GPUARRAY_SRC gpuarray_buffer_cuda.c)
-  add_definitions(-DWITH_CUDA)
-  include_directories(${CUDADRV_INCLUDE})
-
-  list(APPEND _GPUARRAY_SRC gpuarray_blas_cuda_cublas.c)
-  add_definitions(-DWITH_CUDA_CUBLAS)
-
-  set(CMAKE_REQUIRED_LIBRARIES ${CUDA_CUBLAS_LIBRARIES})
-
-  check_function_exists(cublasSgemmEx CUBLAS_SGEMMEX)
-  if (CUBLAS_SGEMMEX)
-    add_definitions(-DHAVE_CUBLAS_SGEMMEX)
-  endif()
-
-  if(NCCL_FOUND)
-    message(STATUS "Building with NCCL")
-    set(BUILD_WITH_COLLECTIVES 1 PARENT_SCOPE)
-    add_definitions(-DWITH_CUDA_NCCL)
-    list(APPEND _GPUARRAY_SRC gpuarray_collectives_cuda_nccl.c)
-    include_directories(${NCCL_INCLUDE_DIR})
-  endif()
-endif()
-
-if(OpenCL_FOUND)
-  list(APPEND _GPUARRAY_SRC gpuarray_buffer_opencl.c)
-  add_definitions(-DWITH_OPENCL)
-  include_directories(${OpenCL_INCLUDE_DIRS})
-
-  if(CLBLAS_FOUND)
-    message(STATUS "Building with CLBLAS")
-    list(APPEND _GPUARRAY_SRC gpuarray_blas_opencl_clblas.c)
-    add_definitions(-DWITH_OPENCL_CLBLAS)
-    include_directories(${CLBLAS_INCLUDE_DIRS})
-  endif()
-endif()
-
 configure_file(
   ${CMAKE_CURRENT_SOURCE_DIR}/private_config.h.in
   ${CMAKE_CURRENT_SOURCE_DIR}/private_config.h
   )
 
 add_subdirectory(util)
+add_subdirectory(loaders)
 
 set_rel(GPUARRAY_SRC ${_GPUARRAY_SRC})
-list(APPEND GPUARRAY_SRC ${UTIL_SRC})
+list(APPEND GPUARRAY_SRC ${UTIL_SRC} ${LOADERS_SRC})
 
 add_library(gpuarray SHARED ${GPUARRAY_SRC})
 set_target_properties(gpuarray PROPERTIES
   COMPILE_FLAGS "-DGPUARRAY_BUILDING_DLL -DGPUARRAY_SHARED"
   INSTALL_NAME_DIR ${CMAKE_INSTALL_PREFIX}/lib
   MACOSX_RPATH OFF
+  # This is the shared library version
+  VERSION 3.0
   )
 
 add_library(gpuarray-static STATIC ${GPUARRAY_SRC})
 
-if(CUDA_FOUND)
-  target_link_libraries(gpuarray ${CUDADRV_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
-  target_link_libraries(gpuarray-static ${CUDADRV_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
-  if (NCCL_FOUND)
-    target_link_libraries(gpuarray ${NCCL_LIBRARY})
-    target_link_libraries(gpuarray-static ${NCCL_LIBRARY})
-  endif()
-endif()
+target_link_libraries(gpuarray ${CMAKE_DL_LIBS})
+target_link_libraries(gpuarray-static ${CMAKE_DL_LIBS})
+
+# Generate gpuarray/abi_version.h that contains the ABI version number.
+get_target_property(GPUARRAY_ABI_VERSION gpuarray VERSION)
+string(REPLACE "." ";" GPUARRAY_ABI_VERSION_NUMBERS ${GPUARRAY_ABI_VERSION})
+list(GET GPUARRAY_ABI_VERSION_NUMBERS 0 GPUARRAY_ABI_VERSION_MAJOR)
+list(GET GPUARRAY_ABI_VERSION_NUMBERS 1 GPUARRAY_ABI_VERSION_MINOR)
+math(EXPR GPUARRAY_ABI_NUMBER "1000*${GPUARRAY_ABI_VERSION_MAJOR} + ${GPUARRAY_ABI_VERSION_MINOR}")
+FILE(WRITE gpuarray/abi_version.h
+"\#ifndef GPUARRAY_ABI_VERSION\n\#define GPUARRAY_ABI_VERSION ${GPUARRAY_ABI_NUMBER}\n\#endif\n"
+)
 
-if(OpenCL_FOUND)
-  target_link_libraries(gpuarray ${OpenCL_LIBRARIES})
-  target_link_libraries(gpuarray-static ${OpenCL_LIBRARIES})
-  if (CLBLAS_FOUND)
-    target_link_libraries(gpuarray ${CLBLAS_LIBRARIES})
-    target_link_libraries(gpuarray-static ${CLBLAS_LIBRARIES})
-  endif()
-endif()
+# set SOVERSION and ensure it is the first part of VERSION.
+set_property(TARGET gpuarray PROPERTY SOVERSION ${GPUARRAY_ABI_VERSION_MAJOR})
 
 set(headers
   gpuarray/array.h
@@ -190,6 +136,7 @@ set(headers
   gpuarray/buffer.h
   gpuarray/buffer_blas.h
   gpuarray/buffer_collectives.h
+  gpuarray/abi_version.h
   gpuarray/config.h
   gpuarray/elemwise.h
   gpuarray/error.h
diff --git a/src/cache.h b/src/cache.h
index 800208e04d..47901c2beb 100644
--- a/src/cache.h
+++ b/src/cache.h
@@ -1,9 +1,11 @@
 #ifndef CACHE_H
 #define CACHE_H
 
-#include <stdint.h>
 #include <stdlib.h>
+#include <gpuarray/config.h>
 #include "private_config.h"
+#include "util/strb.h"
+#include "util/error.h"
 
 typedef void *cache_key_t;
 typedef void *cache_value_t;
@@ -13,6 +15,11 @@ typedef uint32_t (*cache_hash_fn)(cache_key_t);
 typedef void (*cache_freek_fn)(cache_key_t);
 typedef void (*cache_freev_fn)(cache_value_t);
 
+typedef int (*kwrite_fn)(strb *res, cache_key_t key);
+typedef int (*vwrite_fn)(strb *res, cache_value_t val);
+typedef cache_key_t (*kread_fn)(const strb *b);
+typedef cache_value_t (*vread_fn)(const strb *b);
+
 typedef struct _cache cache;
 
 struct _cache {
@@ -71,12 +78,19 @@ struct _cache {
 
 cache *cache_lru(size_t max_size, size_t elasticity,
                  cache_eq_fn keq, cache_hash_fn khash,
-                 cache_freek_fn kfree, cache_freev_fn vfree);
+                 cache_freek_fn kfree, cache_freev_fn vfree,
+                 error *e);
 
 cache *cache_twoq(size_t hot_size, size_t warm_size,
                   size_t cold_size, size_t elasticity,
                   cache_eq_fn keq, cache_hash_fn khash,
-                  cache_freek_fn kfree, cache_freev_fn vfree);
+                  cache_freek_fn kfree, cache_freev_fn vfree,
+                  error *e);
+
+cache *cache_disk(const char *dirpath, cache *mem,
+                  kwrite_fn kwrite, vwrite_fn vwrite,
+                  kread_fn kread, vread_fn vread,
+                  error *e);
 
 /* API functions */
 static inline int cache_add(cache *c, cache_key_t k, cache_value_t v) {
diff --git a/src/cache/disk.c b/src/cache/disk.c
new file mode 100644
index 0000000000..0f9de82e0a
--- /dev/null
+++ b/src/cache/disk.c
@@ -0,0 +1,481 @@
+#define _CRT_SECURE_NO_WARNINGS
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+
+#include "private_config.h"
+
+#ifdef _WIN32
+#define PATH_MAX 255
+
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h>
+
+#include <process.h>
+#include <direct.h>
+#include <io.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+struct timezone;
+
+struct timeval {  /* local stand-in; only the type is needed, no object */
+  long tv_sec;    /* whole seconds */
+  long tv_usec;   /* microseconds */
+};
+
+static int gettimeofday(struct timeval *tp, struct timezone *tzp) {
+  /*
+   * Note: some broken versions only have 8 trailing zero's, the
+   * correct epoch has 9 trailing zero's This magic number is the
+   * number of 100 nanosecond intervals since January 1, 1601 (UTC)
+   * until 00:00:00 January 1, 1970
+   */
+  static const uint64_t EPOCH = ((uint64_t)116444736000000000ULL);
+
+  SYSTEMTIME system_time;
+  FILETIME file_time;
+  uint64_t time;
+
+  GetSystemTime(&system_time);
+  SystemTimeToFileTime(&system_time, &file_time);
+  time = ((uint64_t)file_time.dwLowDateTime);
+  time += ((uint64_t)file_time.dwHighDateTime) << 32;
+
+  tp->tv_sec = (long)((time - EPOCH) / 10000000L);
+  tp->tv_usec = (long)(system_time.wMilliseconds * 1000);
+  return 0;
+}
+
+#define open _open
+#define unlink _unlink
+#define mkdir(p, f) _mkdir(p)
+#define close _close
+#define strdup _strdup
+#define lstat _stat64
+#define fstat _fstat64
+#define stat __stat64
+
+#else
+#define PATH_MAX 1024
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/stat.h>
+
+#define O_BINARY 0
+#define _setmode(a, b)
+
+#endif
+
+
+#include "cache.h"
+#include "util/skein.h"
+
+#define HEXP_LEN (128 + 2)
+
+typedef struct _disk_cache {
+  cache c;
+  cache * mem;
+  kwrite_fn kwrite;
+  vwrite_fn vwrite;
+  kread_fn kread;
+  vread_fn vread;
+  const char *dirp;
+} disk_cache;
+
+
+/* Decode 8 big-endian (network order) bytes into a host unsigned long long */
+static unsigned long long ntohull(const char *_in) {
+  const unsigned char *bytes = (const unsigned char *)_in;
+  unsigned long long val = 0;
+  int i;
+  for (i = 0; i < 8; i++) val = (val << 8) | bytes[i];
+  return val;
+}
+
+/* Encode a host unsigned long long as 8 big-endian (network order) bytes.
+   out must have room for 8 bytes; no terminator is written. */
+static void htonull(unsigned long long in, char *out) {
+  int i;
+
+  /* Walk the buffer backwards, peeling off the low byte each time */
+  for (i = 7; i >= 0; i--) {
+    out[i] = (unsigned char)in;
+    in >>= 8;
+  }
+}
+
+/* Join dirp and rpath into path, which must hold PATH_MAX bytes.
+   Returns 0 on success; if the result would not fit, returns -1 and
+   sets errno to ENAMETOOLONG. */
+static int catp(char *path, const char *dirp, const char *rpath) {
+  /* strlcpy/strlcat return the length they tried to create, so a
+     result >= PATH_MAX means the output was truncated */
+  if (strlcpy(path, dirp, PATH_MAX) < PATH_MAX &&
+      strlcat(path, rpath, PATH_MAX) < PATH_MAX)
+    return 0;
+
+  errno = ENAMETOOLONG;
+  return -1;
+}
+
+/* open() for a path specifed by the concatenation of dirp and rpath */
+static int openp(const char *dirp, const char *rpath, int flags, int mode) {
+  char path[PATH_MAX];
+
+  if (catp(path, dirp, rpath))
+    return -1;
+
+  return open(path, flags, mode);
+}
+
+static int mkstempp(const char *dirp, char *template) {
+  char path[PATH_MAX];
+  int res;
+
+  if (catp(path, dirp, template))
+    return -1;
+
+  res = mkstemp(path);
+
+  /* We need to copy the result path back and set binary mode (for windows) */
+  if (res != -1) {
+    _setmode(res, O_BINARY);
+    memcpy(template, &path[strlen(dirp)], strlen(template));
+  }
+
+  return res;
+}
+
+static int unlinkp(const char *dirp, const char *rpath) {
+  char path[PATH_MAX];
+
+  if (catp(path, dirp, rpath))
+    return -1;
+
+  return unlink(path);
+}
+
+static int renamep(const char *dirp, const char *ropath, const char *rnpath) {
+  char opath[PATH_MAX];
+  char npath[PATH_MAX];
+
+  if (catp(opath, dirp, ropath))
+    return -1;
+  if (catp(npath, dirp, rnpath))
+    return -1;
+
+  return rename(opath, npath);
+}
+
+/* Create every intermediate directory of a path; the last component (after
+   the final separator) is not created.  A NULL dirp makes rpath the whole
+   path; otherwise dirp is assumed to exist and only rpath is walked.
+   Returns 0 on success, -1 with errno set on failure. */
+int ensurep(const char *dirp, const char *rpath) {
+  char path[PATH_MAX];
+  char *pp;
+  char sep;
+
+  if (dirp == NULL) {
+    if (strlcpy(path, rpath, PATH_MAX) >= PATH_MAX) {
+      errno = ENAMETOOLONG;
+      return -1;
+    }
+#ifdef _WIN32
+    /* Skip root dir (windows) */
+    pp = strchr(path, '\\');
+    if (pp)
+      while (*pp == '\\') pp++;
+    else
+      pp = path;
+#else
+    pp = path;
+    /* Skip root dir (unix) */
+    while (*pp == '/') pp++;
+#endif
+  } else {
+    if (catp(path, dirp, rpath))
+      return -1;
+    pp = path + strlen(dirp);
+  }
+  /* Nothing left to walk; pp + 1 below would step past the terminator */
+  if (*pp == '\0') return 0;
+  while ((pp = strpbrk(pp + 1, "\\/")) != NULL) {
+    sep = *pp;
+    *pp = '\0';
+    /* EEXIST is taken to mean that the directory is already there */
+    if (mkdir(path, 0777) && errno != EEXIST) return -1;
+    *pp = sep;
+  }
+  return 0;
+}
+
+static int key_path(disk_cache *c, const cache_key_t key, char *out) {
+  strb kb = STRB_STATIC_INIT;
+  unsigned char hash[64];
+  int i;
+
+  if (c->kwrite(&kb, key)) {
+    strb_clear(&kb);
+    return -1;
+  }
+  if (Skein_512((unsigned char *)kb.s, kb.l, hash)) {
+    strb_clear(&kb);
+    return -1;
+  }
+  strb_clear(&kb);
+  if (snprintf(out, 10, "%02x%02x/%02x%02x",
+               hash[0], hash[1], hash[2], hash[3]) != 9)
+    return -1;
+  for (i = 4; i < 64; i += 4) {
+    if (snprintf(out+(i * 2 + 1), 9, "%02x%02x%02x%02x",
+                 hash[i], hash[i+1], hash[i+2], hash[i+3]) != 8)
+      return -1;
+  }
+  return 0;
+}
+
+/* Serialize (k, v) into the entry file for k: two 8-byte big-endian lengths
+   followed by the key bytes and the value bytes.  The data is written to a
+   temporary file which is then renamed.  Returns 0 on success, else -1. */
+static int write_entry(disk_cache *c, const cache_key_t k,
+                       const cache_value_t v) {
+  char hexp[HEXP_LEN];
+  char tmp_path[] = "tmp.XXXXXXXX";
+  strb b = STRB_STATIC_INIT;
+  size_t kl, vl;
+  int fd, err;
+
+  if (key_path(c, k, hexp)) return -1;
+  if (ensurep(c->dirp, hexp)) return -1;
+  /* Reserve room for the two length fields, filled in once known */
+  if (strb_ensure(&b, 16)) return -1;
+  b.l = 16;
+  c->kwrite(&b, k);
+  kl = b.l - 16;
+  c->vwrite(&b, v);
+  vl = b.l - kl - 16;
+  if (strb_error(&b)) {
+    strb_clear(&b);
+    return -1;
+  }
+  htonull(kl, b.s);
+  htonull(vl, b.s + 8);
+  fd = mkstempp(c->dirp, tmp_path);
+  if (fd == -1) {
+    strb_clear(&b);
+    return -1;
+  }
+  err = strb_write(fd, &b);
+  strb_clear(&b);
+  /* close() can report a deferred write failure, so check it too */
+  if (close(fd)) err = -1;
+  if (err) {
+    unlinkp(c->dirp, tmp_path);
+    return -1;
+  }
+  if (renamep(c->dirp, tmp_path, hexp)) {
+    err = errno;  /* save: the cleanup below may overwrite errno */
+    unlinkp(c->dirp, tmp_path);
+#ifdef _WIN32
+    /* On windows we can't rename over an existing file */
+    return (err != EACCES) ? -1 : 0;
+#else
+    return -1;
+#endif
+  }
+  return 0;
+}
+
+/* Look up key in the on-disk store.  An entry file holds two 8-byte
+   big-endian lengths (key, value) followed by the serialized key and
+   value; the stored key is compared with keq() before a hit is reported.
+   Returns 1 on a hit, 0 otherwise (errors count as misses).  On a hit
+   *_k and *_v receive the decoded key and value when non-NULL. */
+static int find_entry(disk_cache *c, const cache_key_t key,
+                      cache_key_t *_k, cache_value_t *_v) {
+  struct stat st;
+  strb b = STRB_STATIC_INIT;
+  char *ts;
+  unsigned long long kl, vl;
+  cache_key_t k;
+  char hexp[HEXP_LEN];
+  int fd;
+
+  if (key_path(c, key, hexp)) return 0;
+  fd = openp(c->dirp, hexp, O_RDONLY|O_BINARY, 0);
+  if (fd == -1) return 0;
+
+  /* Only regular files are valid entries; compare the whole type field */
+  if (fstat(fd, &st) || (st.st_mode & S_IFMT) != S_IFREG) {
+    close(fd);
+    return 0;
+  }
+
+  strb_read(&b, fd, st.st_size);
+  close(fd);
+  if (strb_error(&b) || b.l < 16) {
+    strb_clear(&b);
+    return 0;
+  }
+
+  kl = ntohull(b.s);
+  vl = ntohull(b.s + 8);
+  /* The lengths come from the file and may be corrupt: check them by
+     subtraction so that huge values cannot wrap 16 + kl + vl around */
+  if (kl > b.l - 16 || vl > b.l - 16 - kl) {
+    strb_clear(&b);
+    return 0;
+  }
+
+  /* Point b at the key bytes; ts keeps the real buffer for strb_clear() */
+  ts = b.s;
+  b.s += 16;
+  b.l = (size_t)kl;
+
+  k = c->kread(&b);
+  if (k && c->c.keq(key, k)) {
+    if (_v) {
+      /* The value bytes start right after the key */
+      b.s += kl;
+      b.l = (size_t)vl;
+      *_v = c->vread(&b);
+      if (*_v == NULL)
+        goto error_find_entry;
+    }
+    if (_k)
+      *_k = k;
+    else
+      c->c.kfree(k);
+    b.s = ts;
+    strb_clear(&b);
+    return 1;
+  }
+ error_find_entry:
+  if (k)
+    c->c.kfree(k);
+  b.s = ts;
+  strb_clear(&b);
+  return 0;
+}
+
+static int disk_add(cache *_c, cache_key_t k, cache_value_t v) {
+  disk_cache *c = (disk_cache *)_c;
+
+  /* Ignore write errors */
+  write_entry(c, k, v);
+
+  return cache_add(c->mem, k, v);
+}
+
+/* Remove key from the memory cache and delete its file; 1 if unlinked */
+static int disk_del(cache *_c, const cache_key_t key) {
+  disk_cache *c = (disk_cache *)_c;
+  char hexp[HEXP_LEN] = {0};
+
+  cache_del(c->mem, key);
+  /* No entry name means no file to remove; never unlink the bare dirp */
+  if (key_path(c, key, hexp)) return 0;
+  return (unlinkp(c->dirp, hexp) == 0);
+}
+
+static cache_value_t disk_get(cache *_c, const cache_key_t key) {
+  disk_cache *c = (disk_cache *)_c;
+  cache_key_t k;
+  cache_value_t v;
+
+  v = cache_get(c->mem, key);
+  if (v != NULL)
+    return v;
+
+  if (find_entry(c, key, &k, &v)) {
+    if (cache_add(c->mem, k, v)) return NULL;
+    return v;
+  }
+  return NULL;
+}
+
+static void disk_destroy(cache *_c) {
+  disk_cache *c = (disk_cache *)_c;
+  cache_destroy(c->mem);
+  free((void *)c->dirp);
+}
+
+/* Create a disk-backed cache stored under dirpath and layered over mem;
+   the k/v write and read callbacks convert entries to and from bytes.
+   Returns the new cache, or NULL with e set on failure. */
+cache *cache_disk(const char *dirpath, cache *mem,
+                  kwrite_fn kwrite, vwrite_fn vwrite,
+                  kread_fn kread, vread_fn vread, error *e) {
+  struct stat st;
+  disk_cache *res;
+  char *dirp;
+  size_t dirl;
+  char sep = '/';
+
+  /* dirpath[dirl - 1] below needs at least one character */
+  if (dirpath == NULL || (dirl = strlen(dirpath)) == 0) {
+    error_set(e, GA_SYS_ERROR, "Cache path is empty");
+    return NULL;
+  }
+  /* This trickery is to make sure the path ends with a separator */
+#ifdef _WIN32
+  if (dirpath[dirl - 1] == '\\') sep = '\\';
+#endif
+  if (dirpath[dirl - 1] != sep) dirl++;
+
+  dirp = malloc(dirl + 1);  /* With the NUL */
+  if (dirp == NULL) {
+    error_sys(e, "malloc");
+    return NULL;
+  }
+  strlcpy(dirp, dirpath, dirl + 1);
+  if (dirp[dirl - 1] != sep) {
+    dirp[dirl - 1] = sep;
+    dirp[dirl] = '\0';
+  }
+  if (ensurep(NULL, dirp) != 0) {
+    error_sys(e, "ensurep");  /* before free() can disturb errno */
+    free(dirp);
+    return NULL;
+  }
+  /* For Windows mkdir and lstat which can't handle trailing separator */
+  dirp[dirl - 1] = '\0';
+  mkdir(dirp, 0777); /* This may fail, but it's ok */
+  if (lstat(dirp, &st) != 0) {
+    error_sys(e, "lstat");
+    free(dirp);
+    return NULL;
+  }
+  /* Restore the good path at the end */
+  dirp[dirl - 1] = sep;
+  /* Test the whole file-type field rather than a single bit of it */
+  if ((st.st_mode & S_IFMT) != S_IFDIR) {
+    error_set(e, GA_SYS_ERROR, "Cache path exists but is not a directory");
+    free(dirp);
+    return NULL;
+  }
+
+  res = calloc(sizeof(*res), 1);
+  if (res == NULL) {
+    error_sys(e, "calloc");
+    free(dirp);
+    return NULL;
+  }
+  res->dirp = dirp;
+  res->mem = mem;
+  res->kwrite = kwrite;
+  res->vwrite = vwrite;
+  res->kread = kread;
+  res->vread = vread;
+  res->c.add = disk_add;
+  res->c.del = disk_del;
+  res->c.get = disk_get;
+  res->c.destroy = disk_destroy;
+  res->c.keq = mem->keq;
+  res->c.khash = mem->khash;
+  res->c.kfree = mem->kfree;
+  res->c.vfree = mem->vfree;
+  return (cache *)res;
+}
diff --git a/src/cache/lru.c b/src/cache/lru.c
index 597bbea999..a17cf5cf6c 100644
--- a/src/cache/lru.c
+++ b/src/cache/lru.c
@@ -114,23 +114,23 @@ struct _hash {
   size_t size;
 };
 
-static inline size_t roundup2(size_t s) {
+static inline unsigned long long roundup2(unsigned long long s) {  /* round s up to the next power of two (unchanged if it already is one) */
   s--;
   s |= s >> 1;
   s |= s >> 2;
   s |= s >> 4;
   s |= s >> 8;
   s |= s >> 16;
-  if (sizeof(size_t) >= 8)
-    s |= s >> 32;
+  s |= s >> 32;  /* always a valid shift: unsigned long long is at least 64 bits wide */
   s++;
   return s;
 }
 
-static inline int hash_init(hash *h, size_t size) {
+static inline int hash_init(hash *h, size_t size, error *e) {
   h->nbuckets = roundup2(size + (size/6));
   h->keyval = calloc(h->nbuckets, sizeof(*h->keyval));
   if (h->keyval == NULL) {
+    error_sys(e, "calloc");
     return -1;
   }
   h->size = 0;
@@ -276,11 +276,15 @@ static void lru_destroy(cache *_c) {
 
 cache *cache_lru(size_t max_size, size_t elasticity,
                  cache_eq_fn keq, cache_hash_fn khash,
-                 cache_freek_fn kfree, cache_freev_fn vfree) {
+                 cache_freek_fn kfree, cache_freev_fn vfree,
+                 error *e) {
   lru_cache *res = malloc(sizeof(*res));
-  if (res == NULL) return NULL;
+  if (res == NULL) {
+    error_sys(e, "malloc");
+    return NULL;
+  }
 
-  if (hash_init(&res->data, max_size+elasticity)) {
+  if (hash_init(&res->data, max_size+elasticity, e)) {
     free(res);
     return NULL;
   }
diff --git a/src/cache/twoq.c b/src/cache/twoq.c
index 0309484f08..ea33be0b63 100644
--- a/src/cache/twoq.c
+++ b/src/cache/twoq.c
@@ -1,6 +1,8 @@
 #include <assert.h>
 #include <stdlib.h>
 
+#include <gpuarray/error.h>
+
 #include "cache.h"
 #include "private_config.h"
 
@@ -122,23 +124,23 @@ struct _hash {
   size_t size;
 };
 
-static inline size_t roundup2(size_t s) {
+static inline unsigned long long roundup2(unsigned long long s) {  /* round s up to the next power of two (unchanged if it already is one) */
   s--;
   s |= s >> 1;
   s |= s >> 2;
   s |= s >> 4;
   s |= s >> 8;
   s |= s >> 16;
-  if (sizeof(size_t) >= 8)
-    s |= s >> 32;
+  s |= s >> 32;  /* always a valid shift: unsigned long long is at least 64 bits wide */
   s++;
   return s;
 }
 
-static inline int hash_init(hash *h, size_t size) {
+static inline int hash_init(hash *h, size_t size, error *e) {
   h->nbuckets = roundup2(size + (size/6));
   h->keyval = calloc(h->nbuckets, sizeof(*h->keyval));
   if (h->keyval == NULL) {
+    error_sys(e, "calloc");
     return -1;
   }
   h->size = 0;
@@ -322,16 +324,21 @@ static void twoq_destroy(cache *_c) {
 }
 
 cache *cache_twoq(size_t hot_size, size_t warm_size, size_t cold_size,
-                 size_t elasticity, cache_eq_fn keq, cache_hash_fn khash,
-                 cache_freek_fn kfree, cache_freev_fn vfree) {
+                  size_t elasticity, cache_eq_fn keq, cache_hash_fn khash,
+                  cache_freek_fn kfree, cache_freev_fn vfree, error *e) {
   twoq_cache *res;
-  if (hot_size == 0 || warm_size == 0 || cold_size == 0)
+  if (hot_size == 0 || warm_size == 0 || cold_size == 0) {
+    error_set(e, GA_VALUE_ERROR, "cache_twoq: section size is 0");
     return NULL;
+  }
 
   res = malloc(sizeof(*res));
-  if (res == NULL) return NULL;
+  if (res == NULL) {
+    error_sys(e, "malloc");
+    return NULL;
+  }
 
-  if (hash_init(&res->data, hot_size+warm_size+cold_size+elasticity)) {
+  if (hash_init(&res->data, hot_size+warm_size+cold_size+elasticity, e)) {
     free(res);
     return NULL;
   }
diff --git a/src/cluda_cuda.h b/src/cluda_cuda.h
new file mode 100644
index 0000000000..ed20a8eb1c
--- /dev/null
+++ b/src/cluda_cuda.h
@@ -0,0 +1,172 @@
+#ifndef CLUDA_H
+#define CLUDA_H
+#define local_barrier() __syncthreads()  /* barrier across the threads of one block */
+#define WITHIN_KERNEL extern "C" __device__  /* device-side helper with C linkage */
+#define KERNEL extern "C" __global__  /* kernel entry point with C linkage */
+#define GLOBAL_MEM /* empty */
+#define LOCAL_MEM __shared__
+#define LOCAL_MEM_ARG /* empty */
+#define MAXFLOAT        3.402823466E+38F  /* largest finite float (the FLT_MAX value) */
+#ifdef NAN
+#undef NAN
+#endif
+#define NAN __int_as_float(0x7fffffff)  /* exponent and mantissa bits all set: a quiet NaN */
+/* NULL */
+#ifdef INFINITY
+#undef INFINITY
+#endif
+#define INFINITY __int_as_float(0x7f800000)  /* exponent all ones, mantissa zero: +infinity as float */
+#define HUGE_VALF INFINITY
+#define HUGE_VAL __longlong_as_double(0x7ff0000000000000)  /* the same +infinity pattern for double */
+
+#define M_E            2.7182818284590452354  /* e */
+#define M_LOG2E        1.4426950408889634074  /* log2(e) */
+#define M_LOG10E       0.43429448190325182765  /* log10(e) */
+#define M_LN2          0.69314718055994530942  /* ln(2) */
+#define M_LN10         2.30258509299404568402  /* ln(10) */
+#define M_PI           3.14159265358979323846  /* pi */
+#define M_PI_2         1.57079632679489661923  /* pi / 2 */
+#define M_PI_4         0.78539816339744830962  /* pi / 4 */
+#define M_1_PI         0.31830988618379067154  /* 1 / pi */
+#define M_2_PI         0.63661977236758134308  /* 2 / pi */
+#define M_2_SQRTPI     1.12837916709551257390  /* 2 / sqrt(pi) */
+#define M_SQRT2        1.41421356237309504880  /* sqrt(2) */
+#define M_SQRT1_2      0.70710678118654752440  /* 1 / sqrt(2) */
+#define LID_0 threadIdx.x  /* LID_n: index of the thread inside its block, per dimension */
+#define LID_1 threadIdx.y
+#define LID_2 threadIdx.z
+#define LDIM_0 blockDim.x  /* LDIM_n: number of threads per block, per dimension */
+#define LDIM_1 blockDim.y
+#define LDIM_2 blockDim.z
+#define GID_0 blockIdx.x  /* GID_n: index of the block inside the grid, per dimension */
+#define GID_1 blockIdx.y
+#define GID_2 blockIdx.z
+#define GDIM_0 gridDim.x  /* GDIM_n: number of blocks in the grid, per dimension */
+#define GDIM_1 gridDim.y
+#define GDIM_2 gridDim.z
+#define ga_bool unsigned char  /* ga_* names: the C types kernels use for each element type */
+#define ga_byte signed char
+#define ga_ubyte unsigned char
+#define ga_short short
+#define ga_ushort unsigned short
+#define ga_int int
+#define ga_uint unsigned int
+#define ga_long long long
+#define ga_ulong unsigned long long
+#define ga_float float
+#define ga_double double
+#define ga_size size_t
+#define ga_ssize ptrdiff_t
+#define GA_DECL_SHARED_PARAM(type, name)  /* expands to nothing in this header */
+#define GA_DECL_SHARED_BODY(type, name) extern __shared__ type name[];  /* declares name as an unsized extern __shared__ array */
+#define GA_WARP_SIZE warpSize  /* the CUDA built-in warpSize variable */
+
+struct ga_half {  /* half-precision value kept as its raw 16 bits; see ga_half2float() / ga_float2half() */
+  ga_ushort data;
+};
+
+static __device__ inline float ga_half2float(ga_half h) {  /* convert the f16 bit pattern in h.data to a float */
+  float r;
+  asm("{ cvt.f32.f16 %0, %1; }\n" : "=f"(r) : "h"(h.data));  /* PTX cvt; "h" binds a 16-bit register, "f" a 32-bit float register */
+  return r;
+}
+static __device__ inline ga_half ga_float2half(float f) {  /* convert f to an f16 bit pattern stored in the result's data field */
+  ga_half r;
+  asm("{ cvt.rn.f16.f32 %0, %1; }\n" : "=h"(r.data) : "f"(f));  /* PTX cvt; the .rn modifier rounds to nearest even */
+  return r;
+}
+
+/* ga_int -- the ..g and ..l spellings expand to the same intrinsic here (g/l presumably name global vs local memory; TODO confirm) */
+#define atom_add_ig(a, b) atomicAdd(a, b)
+#define atom_add_il(a, b) atomicAdd(a, b)
+#define atom_xchg_ig(a, b) atomicExch(a, b)
+#define atom_xchg_il(a, b) atomicExch(a, b)
+/* ga_uint -- likewise forwarded unchanged to atomicAdd() / atomicExch() */
+#define atom_add_Ig(a, b) atomicAdd(a, b)
+#define atom_add_Il(a, b) atomicAdd(a, b)
+#define atom_xchg_Ig(a, b) atomicExch(a, b)
+#define atom_xchg_Il(a, b) atomicExch(a, b)
+/* ga_long */
+/* Atomically add val to *addr and return the value *addr held before the
+ * addition. Two's-complement addition produces the same bits for signed and
+ * unsigned operands, so the native unsigned 64-bit atomicAdd() is used
+ * directly on the reinterpreted address; the sum is thereby computed in
+ * unsigned arithmetic, where wrap-around is well defined. */
+__device__ ga_long atom_add_lg(ga_long *addr, ga_long val) {
+  unsigned long long *waddr = (unsigned long long *)addr;
+  unsigned long long old = atomicAdd(waddr, (unsigned long long)val);
+  return (ga_long)old;
+}
+#define atom_add_ll(a, b) atom_add_lg(a, b)
+/* Atomically store val into *addr and return the value it replaced. */
+__device__ ga_long atom_xchg_lg(ga_long *addr, ga_long val) {
+  unsigned long long *waddr = (unsigned long long *)addr;
+  return (ga_long)atomicExch(waddr, val);
+}
+#define atom_xchg_ll(a, b) atom_xchg_lg(a, b)
+/* ga_ulong: forwarded unchanged to the atomicAdd() / atomicExch() intrinsics */
+#define atom_add_Lg(a, b) atomicAdd(a, b)
+#define atom_add_Ll(a, b) atomicAdd(a, b)
+#define atom_xchg_Lg(a, b) atomicExch(a, b)
+#define atom_xchg_Ll(a, b) atomicExch(a, b)
+/* ga_float: forwarded unchanged to the atomicAdd() / atomicExch() intrinsics */
+#define atom_add_fg(a, b) atomicAdd(a, b)
+#define atom_add_fl(a, b) atomicAdd(a, b)
+#define atom_xchg_fg(a, b) atomicExch(a, b)
+#define atom_xchg_fl(a, b) atomicExch(a, b)
+/* ga_double: add is a compare-and-swap loop when __CUDA_ARCH__ < 600, plain atomicAdd() otherwise */
+#if __CUDA_ARCH__ < 600
+__device__ ga_double atom_add_dg(ga_double *addr, ga_double val) {
+  unsigned long long *bits = (unsigned long long *)addr;
+  unsigned long long expected, seen = *bits;
+  do {  /* retry until the swap observes the value the sum was computed from */
+    expected = seen;
+    ga_double sum = val + __longlong_as_double(expected);
+    seen = atomicCAS(bits, expected, __double_as_longlong(sum));
+  } while (seen != expected);
+  return __longlong_as_double(seen);
+}
+#define atom_add_dl(a, b) atom_add_dg(a, b)
+#else
+#define atom_add_dg(a, b) atomicAdd(a, b)
+#define atom_add_dl(a, b) atomicAdd(a, b)
+#endif
+/* Atomically replace *addr with val by exchanging the raw 64-bit patterns; returns the previous value. */
+__device__ ga_double atom_xchg_dg(ga_double *addr, ga_double val) {
+  unsigned long long *bits = (unsigned long long *)addr;
+  return __longlong_as_double(atomicExch(bits, __double_as_longlong(val)));
+}
+#define atom_xchg_dl(a, b) atom_xchg_dg(a, b)
+/* ga_half: updated through atomicCAS() on the 32-bit word that contains the half; returns the previous half */
+__device__ ga_half atom_add_eg(ga_half *addr, ga_half val) {
+  const int upper = ((ga_size)addr & 2) != 0;  /* selects bits 16..31 of the word */
+  ga_uint *word = (ga_uint *)((ga_size)addr & ~2);
+  ga_uint seen = *word, expected, sum, merged;
+  ga_half cur;
+  do {
+    expected = seen;
+    cur.data = __byte_perm(seen, 0, upper ? 0x4432 : 0x4410);  /* pull out our half */
+    sum = ga_float2half(ga_half2float(val) + ga_half2float(cur)).data;
+    merged = __byte_perm(seen, sum, upper ? 0x5410 : 0x3254);  /* splice sum in, keep the other half */
+    seen = atomicCAS(word, expected, merged);
+  } while (seen != expected);
+  cur.data = __byte_perm(seen, 0, upper ? 0x4432 : 0x4410);
+  return cur;
+}
+#define atom_add_el(a, b) atom_add_eg(a, b)
+
+__device__ ga_half atom_xchg_eg(ga_half *addr, ga_half val) {
+  /* Swap val into the half at addr via atomicCAS() on its 32-bit word; returns the previous half. */
+  const int upper = ((ga_size)addr & 2) != 0;  /* selects bits 16..31 of the word */
+  ga_uint *word = (ga_uint *)((ga_size)addr & ~2);
+  ga_uint seen = *word, expected;
+  ga_half prev;
+  do {
+    expected = seen;
+    seen = atomicCAS(word, expected, __byte_perm(seen, val.data, upper ? 0x5410 : 0x3254));
+  } while (seen != expected);
+  prev.data = __byte_perm(seen, 0, upper ? 0x4432 : 0x4410);
+  return prev;
+}
+#define atom_xchg_el(a, b) atom_xchg_eg(a, b)
+#endif
diff --git a/src/cluda_cuda.h.c b/src/cluda_cuda.h.c
new file mode 100644
index 0000000000..ba3f88cadc
--- /dev/null
+++ b/src/cluda_cuda.h.c
@@ -0,0 +1,506 @@
+static const char cluda_cuda_h[] = {
+0x23, 0x69, 0x66, 0x6e, 0x64, 0x65, 0x66, 0x20, 0x43, 0x4c, 0x55,
+0x44, 0x41, 0x5f, 0x48, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e,
+0x65, 0x20, 0x43, 0x4c, 0x55, 0x44, 0x41, 0x5f, 0x48, 0x0a, 0x23,
+0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x63, 0x61,
+0x6c, 0x5f, 0x62, 0x61, 0x72, 0x72, 0x69, 0x65, 0x72, 0x28, 0x29,
+0x20, 0x5f, 0x5f, 0x73, 0x79, 0x6e, 0x63, 0x74, 0x68, 0x72, 0x65,
+0x61, 0x64, 0x73, 0x28, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69,
+0x6e, 0x65, 0x20, 0x57, 0x49, 0x54, 0x48, 0x49, 0x4e, 0x5f, 0x4b,
+0x45, 0x52, 0x4e, 0x45, 0x4c, 0x20, 0x65, 0x78, 0x74, 0x65, 0x72,
+0x6e, 0x20, 0x22, 0x43, 0x22, 0x20, 0x5f, 0x5f, 0x64, 0x65, 0x76,
+0x69, 0x63, 0x65, 0x5f, 0x5f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69,
+0x6e, 0x65, 0x20, 0x4b, 0x45, 0x52, 0x4e, 0x45, 0x4c, 0x20, 0x65,
+0x78, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x22, 0x43, 0x22, 0x20, 0x5f,
+0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x5f, 0x0a, 0x23,
+0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x4c, 0x4f, 0x42,
+0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x20, 0x2f, 0x2a, 0x20, 0x65,
+0x6d, 0x70, 0x74, 0x79, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65,
+0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f,
+0x4d, 0x45, 0x4d, 0x20, 0x5f, 0x5f, 0x73, 0x68, 0x61, 0x72, 0x65,
+0x64, 0x5f, 0x5f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65,
+0x20, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x5f,
+0x41, 0x52, 0x47, 0x20, 0x2f, 0x2a, 0x20, 0x65, 0x6d, 0x70, 0x74,
+0x79, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e,
+0x65, 0x20, 0x4d, 0x41, 0x58, 0x46, 0x4c, 0x4f, 0x41, 0x54, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x33, 0x2e, 0x34, 0x30,
+0x32, 0x38, 0x32, 0x33, 0x34, 0x36, 0x36, 0x45, 0x2b, 0x33, 0x38,
+0x46, 0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, 0x66, 0x20, 0x4e, 0x41,
+0x4e, 0x0a, 0x23, 0x75, 0x6e, 0x64, 0x65, 0x66, 0x20, 0x4e, 0x41,
+0x4e, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x23, 0x64,
+0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4e, 0x41, 0x4e, 0x20, 0x5f,
+0x5f, 0x69, 0x6e, 0x74, 0x5f, 0x61, 0x73, 0x5f, 0x66, 0x6c, 0x6f,
+0x61, 0x74, 0x28, 0x30, 0x78, 0x37, 0x66, 0x66, 0x66, 0x66, 0x66,
+0x66, 0x66, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x4e, 0x55, 0x4c, 0x4c,
+0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, 0x66, 0x20,
+0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x59, 0x0a, 0x23, 0x75,
+0x6e, 0x64, 0x65, 0x66, 0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49,
+0x54, 0x59, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x23,
+0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x49, 0x4e, 0x46, 0x49,
+0x4e, 0x49, 0x54, 0x59, 0x20, 0x5f, 0x5f, 0x69, 0x6e, 0x74, 0x5f,
+0x61, 0x73, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x30, 0x78,
+0x37, 0x66, 0x38, 0x30, 0x30, 0x30, 0x30, 0x30, 0x29, 0x0a, 0x23,
+0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x48, 0x55, 0x47, 0x45,
+0x5f, 0x56, 0x41, 0x4c, 0x46, 0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e,
+0x49, 0x54, 0x59, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65,
+0x20, 0x48, 0x55, 0x47, 0x45, 0x5f, 0x56, 0x41, 0x4c, 0x20, 0x5f,
+0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61,
+0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x30, 0x78,
+0x37, 0x66, 0x66, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
+0x30, 0x30, 0x30, 0x30, 0x30, 0x29, 0x0a, 0x0a, 0x23, 0x64, 0x65,
+0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x45, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x32, 0x2e,
+0x37, 0x31, 0x38, 0x32, 0x38, 0x31, 0x38, 0x32, 0x38, 0x34, 0x35,
+0x39, 0x30, 0x34, 0x35, 0x32, 0x33, 0x35, 0x34, 0x0a, 0x23, 0x64,
+0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x4c, 0x4f, 0x47,
+0x32, 0x45, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31,
+0x2e, 0x34, 0x34, 0x32, 0x36, 0x39, 0x35, 0x30, 0x34, 0x30, 0x38,
+0x38, 0x38, 0x39, 0x36, 0x33, 0x34, 0x30, 0x37, 0x34, 0x0a, 0x23,
+0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x4c, 0x4f,
+0x47, 0x31, 0x30, 0x45, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x30, 0x2e, 0x34, 0x33, 0x34, 0x32, 0x39, 0x34, 0x34, 0x38, 0x31,
+0x39, 0x30, 0x33, 0x32, 0x35, 0x31, 0x38, 0x32, 0x37, 0x36, 0x35,
+0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f,
+0x4c, 0x4e, 0x32, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x30, 0x2e, 0x36, 0x39, 0x33, 0x31, 0x34, 0x37, 0x31,
+0x38, 0x30, 0x35, 0x35, 0x39, 0x39, 0x34, 0x35, 0x33, 0x30, 0x39,
+0x34, 0x32, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20,
+0x4d, 0x5f, 0x4c, 0x4e, 0x31, 0x30, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x32, 0x2e, 0x33, 0x30, 0x32, 0x35, 0x38,
+0x35, 0x30, 0x39, 0x32, 0x39, 0x39, 0x34, 0x30, 0x34, 0x35, 0x36,
+0x38, 0x34, 0x30, 0x32, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e,
+0x65, 0x20, 0x4d, 0x5f, 0x50, 0x49, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x33, 0x2e, 0x31, 0x34, 0x31,
+0x35, 0x39, 0x32, 0x36, 0x35, 0x33, 0x35, 0x38, 0x39, 0x37, 0x39,
+0x33, 0x32, 0x33, 0x38, 0x34, 0x36, 0x0a, 0x23, 0x64, 0x65, 0x66,
+0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x50, 0x49, 0x5f, 0x32, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x2e, 0x35,
+0x37, 0x30, 0x37, 0x39, 0x36, 0x33, 0x32, 0x36, 0x37, 0x39, 0x34,
+0x38, 0x39, 0x36, 0x36, 0x31, 0x39, 0x32, 0x33, 0x0a, 0x23, 0x64,
+0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x50, 0x49, 0x5f,
+0x34, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30,
+0x2e, 0x37, 0x38, 0x35, 0x33, 0x39, 0x38, 0x31, 0x36, 0x33, 0x33,
+0x39, 0x37, 0x34, 0x34, 0x38, 0x33, 0x30, 0x39, 0x36, 0x32, 0x0a,
+0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x31,
+0x5f, 0x50, 0x49, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x30, 0x2e, 0x33, 0x31, 0x38, 0x33, 0x30, 0x39, 0x38, 0x38,
+0x36, 0x31, 0x38, 0x33, 0x37, 0x39, 0x30, 0x36, 0x37, 0x31, 0x35,
+0x34, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d,
+0x5f, 0x32, 0x5f, 0x50, 0x49, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x30, 0x2e, 0x36, 0x33, 0x36, 0x36, 0x31, 0x39,
+0x37, 0x37, 0x32, 0x33, 0x36, 0x37, 0x35, 0x38, 0x31, 0x33, 0x34,
+0x33, 0x30, 0x38, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65,
+0x20, 0x4d, 0x5f, 0x32, 0x5f, 0x53, 0x51, 0x52, 0x54, 0x50, 0x49,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x2e, 0x31, 0x32, 0x38, 0x33,
+0x37, 0x39, 0x31, 0x36, 0x37, 0x30, 0x39, 0x35, 0x35, 0x31, 0x32,
+0x35, 0x37, 0x33, 0x39, 0x30, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69,
+0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x53, 0x51, 0x52, 0x54, 0x32, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x2e, 0x34, 0x31,
+0x34, 0x32, 0x31, 0x33, 0x35, 0x36, 0x32, 0x33, 0x37, 0x33, 0x30,
+0x39, 0x35, 0x30, 0x34, 0x38, 0x38, 0x30, 0x0a, 0x23, 0x64, 0x65,
+0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x53, 0x51, 0x52, 0x54,
+0x31, 0x5f, 0x32, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x2e,
+0x37, 0x30, 0x37, 0x31, 0x30, 0x36, 0x37, 0x38, 0x31, 0x31, 0x38,
+0x36, 0x35, 0x34, 0x37, 0x35, 0x32, 0x34, 0x34, 0x30, 0x0a, 0x23,
+0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f,
+0x30, 0x20, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x49, 0x64, 0x78,
+0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20,
+0x4c, 0x49, 0x44, 0x5f, 0x31, 0x20, 0x74, 0x68, 0x72, 0x65, 0x61,
+0x64, 0x49, 0x64, 0x78, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, 0x66,
+0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, 0x32, 0x20, 0x74,
+0x68, 0x72, 0x65, 0x61, 0x64, 0x49, 0x64, 0x78, 0x2e, 0x7a, 0x0a,
+0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, 0x49,
+0x4d, 0x5f, 0x30, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x44, 0x69,
+0x6d, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65,
+0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x62, 0x6c, 0x6f,
+0x63, 0x6b, 0x44, 0x69, 0x6d, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65,
+0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x32,
+0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x44, 0x69, 0x6d, 0x2e, 0x7a,
+0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49,
+0x44, 0x5f, 0x30, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x49, 0x64,
+0x78, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65,
+0x20, 0x47, 0x49, 0x44, 0x5f, 0x31, 0x20, 0x62, 0x6c, 0x6f, 0x63,
+0x6b, 0x49, 0x64, 0x78, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, 0x66,
+0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x32, 0x20, 0x62,
+0x6c, 0x6f, 0x63, 0x6b, 0x49, 0x64, 0x78, 0x2e, 0x7a, 0x0a, 0x23,
+0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d,
+0x5f, 0x30, 0x20, 0x67, 0x72, 0x69, 0x64, 0x44, 0x69, 0x6d, 0x2e,
+0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47,
+0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x67, 0x72, 0x69, 0x64, 0x44,
+0x69, 0x6d, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e,
+0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 0x67, 0x72,
+0x69, 0x64, 0x44, 0x69, 0x6d, 0x2e, 0x7a, 0x0a, 0x23, 0x64, 0x65,
+0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x6f, 0x6f,
+0x6c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20,
+0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e,
+0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x20, 0x73,
+0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x63, 0x68, 0x61, 0x72, 0x0a,
+0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f,
+0x75, 0x62, 0x79, 0x74, 0x65, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67,
+0x6e, 0x65, 0x64, 0x20, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64,
+0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x68,
+0x6f, 0x72, 0x74, 0x20, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23,
+0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75,
+0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67,
+0x6e, 0x65, 0x64, 0x20, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23,
+0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x69,
+0x6e, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66,
+0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74,
+0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x69,
+0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20,
+0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e,
+0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66,
+0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e,
+0x67, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20,
+0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23,
+0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x66,
+0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x0a,
+0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f,
+0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x64, 0x6f, 0x75, 0x62,
+0x6c, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20,
+0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x73, 0x69, 0x7a,
+0x65, 0x5f, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65,
+0x20, 0x67, 0x61, 0x5f, 0x73, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x70,
+0x74, 0x72, 0x64, 0x69, 0x66, 0x66, 0x5f, 0x74, 0x0a, 0x23, 0x64,
+0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45,
+0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x50,
+0x41, 0x52, 0x41, 0x4d, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20,
+0x6e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69,
+0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f,
+0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x42, 0x4f, 0x44, 0x59,
+0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65,
+0x29, 0x20, 0x65, 0x78, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x5f, 0x5f,
+0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x5f, 0x5f, 0x20, 0x74, 0x79,
+0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x5b, 0x5d, 0x3b, 0x0a,
+0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f,
+0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x20, 0x77,
+0x61, 0x72, 0x70, 0x53, 0x69, 0x7a, 0x65, 0x0a, 0x0a, 0x73, 0x74,
+0x72, 0x75, 0x63, 0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c,
+0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73,
+0x68, 0x6f, 0x72, 0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a,
+0x7d, 0x3b, 0x0a, 0x0a, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20,
+0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20,
+0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x66, 0x6c, 0x6f, 0x61,
+0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66,
+0x6c, 0x6f, 0x61, 0x74, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c,
+0x66, 0x20, 0x68, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x66, 0x6c,
+0x6f, 0x61, 0x74, 0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x61, 0x73,
+0x6d, 0x28, 0x22, 0x7b, 0x20, 0x63, 0x76, 0x74, 0x2e, 0x66, 0x33,
+0x32, 0x2e, 0x66, 0x31, 0x36, 0x20, 0x25, 0x30, 0x2c, 0x20, 0x25,
+0x31, 0x3b, 0x20, 0x7d, 0x5c, 0x6e, 0x22, 0x20, 0x3a, 0x20, 0x22,
+0x3d, 0x66, 0x22, 0x28, 0x72, 0x29, 0x20, 0x3a, 0x20, 0x22, 0x68,
+0x22, 0x28, 0x68, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x3b,
+0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72,
+0x3b, 0x0a, 0x7d, 0x0a, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20,
+0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20,
+0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68,
+0x61, 0x6c, 0x66, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61,
+0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x66, 0x6c, 0x6f, 0x61,
+0x74, 0x20, 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61,
+0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20,
+0x61, 0x73, 0x6d, 0x28, 0x22, 0x7b, 0x20, 0x63, 0x76, 0x74, 0x2e,
+0x72, 0x6e, 0x2e, 0x66, 0x31, 0x36, 0x2e, 0x66, 0x33, 0x32, 0x20,
+0x25, 0x30, 0x2c, 0x20, 0x25, 0x31, 0x3b, 0x20, 0x7d, 0x5c, 0x6e,
+0x22, 0x20, 0x3a, 0x20, 0x22, 0x3d, 0x68, 0x22, 0x28, 0x72, 0x2e,
+0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x3a, 0x20, 0x22, 0x66, 0x22,
+0x28, 0x66, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74,
+0x75, 0x72, 0x6e, 0x20, 0x72, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f,
+0x2a, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f,
+0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74,
+0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x67, 0x28, 0x61,
+0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63,
+0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23,
+0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d,
+0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20,
+0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64,
+0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65,
+0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78,
+0x63, 0x68, 0x67, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62,
+0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63,
+0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65,
+0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78,
+0x63, 0x68, 0x67, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62,
+0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63,
+0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20,
+0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a,
+0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f,
+0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c,
+0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41,
+0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64,
+0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f,
+0x61, 0x64, 0x64, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62,
+0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64,
+0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66,
+0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63,
+0x68, 0x67, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29,
+0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68,
+0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66,
+0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63,
+0x68, 0x67, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29,
+0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68,
+0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67,
+0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x5f,
+0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67,
+0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x74, 0x6f, 0x6d,
+0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x67, 0x61, 0x5f,
+0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c,
+0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61,
+0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69,
+0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c,
+0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, 0x20,
+0x3d, 0x20, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64,
+0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20,
+0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75,
+0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e,
+0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6f, 0x6c, 0x64, 0x20,
+0x3d, 0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20,
+0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c,
+0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x73,
+0x73, 0x75, 0x6d, 0x65, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f,
+0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75,
+0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a,
+0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61,
+0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x77, 0x61,
+0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65,
+0x64, 0x2c, 0x20, 0x28, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x28,
+0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x28, 0x61, 0x73,
+0x73, 0x75, 0x6d, 0x65, 0x64, 0x29, 0x29, 0x29, 0x3b, 0x0a, 0x20,
+0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61,
+0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f,
+0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75,
+0x72, 0x6e, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67,
+0x29, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65,
+0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61,
+0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29,
+0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c,
+0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x5f, 0x5f, 0x64,
+0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f,
+0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78,
+0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x6c,
+0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20,
+0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61, 0x6c,
+0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67,
+0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f,
+0x6e, 0x67, 0x20, 0x72, 0x65, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x72,
+0x65, 0x73, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63,
+0x45, 0x78, 0x63, 0x68, 0x28, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67,
+0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f,
+0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20,
+0x76, 0x61, 0x6c, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74,
+0x75, 0x72, 0x6e, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e,
+0x67, 0x29, 0x72, 0x65, 0x73, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64,
+0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f,
+0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20,
+0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68,
+0x67, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a,
+0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67,
+0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65,
+0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c,
+0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f,
+0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62,
+0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61,
+0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28,
+0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69,
+0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a,
+0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f,
+0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61,
+0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63,
+0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a,
+0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f,
+0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61,
+0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63,
+0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a,
+0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74,
+0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65,
+0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66,
+0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f,
+0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62,
+0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61,
+0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x28,
+0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69,
+0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a,
+0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f,
+0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61,
+0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63,
+0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a,
+0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f,
+0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61,
+0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63,
+0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a,
+0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c,
+0x65, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x20, 0x5f, 0x5f,
+0x43, 0x55, 0x44, 0x41, 0x5f, 0x41, 0x52, 0x43, 0x48, 0x5f, 0x5f,
+0x20, 0x3c, 0x20, 0x36, 0x30, 0x30, 0x0a, 0x5f, 0x5f, 0x64, 0x65,
+0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x64,
+0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f,
+0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x64,
+0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72,
+0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65,
+0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75,
+0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e,
+0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x77, 0x61, 0x64,
+0x64, 0x72, 0x20, 0x3d, 0x20, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67,
+0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f,
+0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a,
+0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20,
+0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6f,
+0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72,
+0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65,
+0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67,
+0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x3b, 0x0a, 0x20,
+0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61,
+0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c,
+0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20,
+0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53,
+0x28, 0x77, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x73, 0x73,
+0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x5f, 0x5f, 0x64, 0x6f, 0x75,
+0x62, 0x6c, 0x65, 0x5f, 0x61, 0x73, 0x5f, 0x6c, 0x6f, 0x6e, 0x67,
+0x6c, 0x6f, 0x6e, 0x67, 0x28, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20,
+0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f,
+0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x61,
+0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x29, 0x29, 0x29, 0x3b, 0x0a,
+0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28,
+0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20,
+0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74,
+0x75, 0x72, 0x6e, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c,
+0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62,
+0x6c, 0x65, 0x28, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x7d, 0x0a,
+0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f,
+0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c,
+0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64,
+0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a,
+0x23, 0x65, 0x6c, 0x73, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69,
+0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64,
+0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61,
+0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c,
+0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65,
+0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64,
+0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f,
+0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62,
+0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x5f, 0x5f,
+0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61,
+0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x61, 0x74, 0x6f,
+0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x28, 0x67,
+0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x61,
+0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75,
+0x62, 0x6c, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a,
+0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20,
+0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x72,
+0x65, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x73, 0x20, 0x3d,
+0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68,
+0x28, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20,
+0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a,
+0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x5f, 0x5f, 0x64, 0x6f,
+0x75, 0x62, 0x6c, 0x65, 0x5f, 0x61, 0x73, 0x5f, 0x6c, 0x6f, 0x6e,
+0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x28, 0x76, 0x61, 0x6c, 0x29, 0x29,
+0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
+0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f,
+0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x72,
+0x65, 0x73, 0x29, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66,
+0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63,
+0x68, 0x67, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29,
+0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f,
+0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a,
+0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f,
+0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f,
+0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74,
+0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x67, 0x28, 0x67,
+0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64,
+0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20,
+0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61,
+0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65,
+0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74,
+0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a,
+0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32,
+0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e,
+0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75,
+0x6d, 0x65, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x6e,
+0x65, 0x77, 0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68,
+0x61, 0x6c, 0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20,
+0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65,
+0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d,
+0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74,
+0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f,
+0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28,
+0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67,
+0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72,
+0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34,
+0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31,
+0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x6d,
+0x20, 0x3d, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74,
+0x32, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61,
+0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x76, 0x61,
+0x6c, 0x29, 0x20, 0x2b, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c,
+0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x74, 0x6d, 0x70,
+0x29, 0x29, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x20, 0x20,
+0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f,
+0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f,
+0x6c, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x28, 0x28,
+0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64,
+0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78,
+0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32,
+0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c,
+0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43,
+0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73,
+0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f,
+0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c,
+0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20,
+0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20,
+0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d,
+0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28,
+0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64,
+0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78,
+0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34,
+0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75,
+0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23,
+0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d,
+0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20,
+0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64,
+0x5f, 0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x0a,
+0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20,
+0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f,
+0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x28, 0x67,
+0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64,
+0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20,
+0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61,
+0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65,
+0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74,
+0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a,
+0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32,
+0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e,
+0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75,
+0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a,
+0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74,
+0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d,
+0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64,
+0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73,
+0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b,
+0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d,
+0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72,
+0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x2e,
+0x64, 0x61, 0x74, 0x61, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f,
+0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26,
+0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31,
+0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29,
+0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d,
+0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28,
+0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d,
+0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a,
+0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28,
+0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20,
+0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70,
+0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62,
+0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c,
+0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f,
+0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26,
+0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33,
+0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29,
+0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
+0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66,
+0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63,
+0x68, 0x67, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29,
+0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f,
+0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65,
+0x6e, 0x64, 0x69, 0x66, 0x0a, 0x00};
diff --git a/src/cluda_opencl.h b/src/cluda_opencl.h
new file mode 100644
index 0000000000..8997fcb465
--- /dev/null
+++ b/src/cluda_opencl.h
@@ -0,0 +1,189 @@
+#ifndef CLUDA_H  /* OpenCL definitions of the CLUDA kernel macros and ga_* types */
+#define CLUDA_H
+#define local_barrier() barrier(CLK_LOCAL_MEM_FENCE)  /* work-group barrier with a local-memory fence */
+#define WITHIN_KERNEL /* expands to nothing */
+#define KERNEL __kernel  /* kernel entry-point qualifier */
+#define GLOBAL_MEM __global  /* global address space qualifier */
+#define LOCAL_MEM __local  /* local address space qualifier */
+#define LOCAL_MEM_ARG __local  /* same qualifier as LOCAL_MEM */
+/* NAN: nothing is defined here (presumably the OpenCL C built-in is used; confirm) */
+#ifndef NULL
+  #define NULL ((void*)0)
+#endif
+/* INFINITY: nothing is defined here (presumably the OpenCL C built-in is used; confirm) */
+#define LID_0 get_local_id(0)  /* LID_n: work-item id inside its work-group, dimension n */
+#define LID_1 get_local_id(1)
+#define LID_2 get_local_id(2)
+#define LDIM_0 get_local_size(0)  /* LDIM_n: work-group size, dimension n */
+#define LDIM_1 get_local_size(1)
+#define LDIM_2 get_local_size(2)
+#define GID_0 get_group_id(0)  /* GID_n: work-group id, dimension n */
+#define GID_1 get_group_id(1)
+#define GID_2 get_group_id(2)
+#define GDIM_0 get_num_groups(0)  /* GDIM_n: number of work-groups, dimension n */
+#define GDIM_1 get_num_groups(1)
+#define GDIM_2 get_num_groups(2)
+#define ga_bool uchar  /* the ga_* scalar names map onto OpenCL C built-in types */
+#define ga_byte char
+#define ga_ubyte uchar
+#define ga_short short
+#define ga_ushort ushort
+#define ga_int int
+#define ga_uint uint
+#define ga_long long
+#define ga_ulong ulong
+#define ga_float float
+#ifdef cl_khr_fp64  /* ga_double is only defined when the compiler advertises fp64 */
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#define ga_double double
+#endif  /* cl_khr_fp64 */
+#define ga_size ulong  /* ulong is 64-bit in OpenCL C */
+#define ga_ssize long
+#define GA_DECL_SHARED_PARAM(type, name) , __local type *name  /* leading comma: appends a __local pointer parameter */
+#define GA_DECL_SHARED_BODY(type, name)  /* expands to nothing */
+#define GA_WARP_SIZE __GA_WARP_SIZE  /* __GA_WARP_SIZE is not defined in this header; presumably supplied at build time (confirm) */
+
+typedef struct _ga_half {  /* struct wrapper around a single OpenCL half */
+  half data;
+} ga_half;
+
+#define ga_half2float(p) vload_half(0, &((p).data))  /* takes the address of (p).data, so p must be an lvalue */
+static inline ga_half ga_float2half(ga_float f) {  /* float -> ga_half, rounding to nearest even (_rte) */
+  ga_half r;
+  vstore_half_rte(f, 0, &r.data);  /* writes the converted value into r.data */
+  return r;
+}
+
+/* int64 atomics are enabled further down, inside #ifdef cl_khr_int64_base_atomics */
+
+#define gen_atom32_add(name, argtype, aspace) /* atomic add on a 32-bit argtype, built on a compare-and-swap loop */ \
+  argtype name(volatile aspace argtype *, argtype);               \
+  argtype name(volatile aspace argtype *addr, argtype val) {      \
+    union {                     /* the same 32 bits viewed as value and as int */ \
+      argtype a;                                                  \
+      int w;                                                      \
+    } p, n;                     /* p: value seen in memory, n: value to store */ \
+    int a;                      /* bit pattern the swap expects to find */ \
+    p.a = *addr;                                                  \
+    do {                                                          \
+      a = p.w;                                                    \
+      n.a = p.a + val;                                            \
+      p.w = atomic_cmpxchg((volatile aspace int *)addr, a, n.w);  \
+    } while (p.w != a);         /* retry when *addr changed under us */ \
+    return p.a;                 /* previous value, matching the atomic_add built-in */ \
+  }
+
+#define gen_atom64_add(name, argtype, aspace) /* atomic add on a 64-bit argtype, built on a compare-and-swap loop */ \
+  argtype name(volatile aspace argtype *, argtype);               \
+  argtype name(volatile aspace argtype *addr, argtype val) {      \
+    union {                     /* the same 64 bits viewed as value and as long */ \
+      argtype a;                                                  \
+      long w;                                                     \
+    } p, n;                     /* p: value seen in memory, n: value to store */ \
+    long a;                     /* bit pattern the swap expects to find */ \
+    p.a = *addr;                                                  \
+    do {                                                          \
+      a = p.w;                                                    \
+      n.a = p.a + val;                                            \
+      p.w = atom_cmpxchg((volatile aspace long *)addr, a, n.w);   \
+    } while (p.w != a);         /* retry when *addr changed under us */ \
+    return p.a;                 /* previous value, matching the atom_add built-in */ \
+  }
+
+#define gen_atom64_xchg(name, argtype, aspace) /* atomic exchange of a 64-bit argtype through atom_xchg on long */ \
+  argtype name(volatile aspace argtype *, argtype);                 \
+  argtype name(volatile aspace argtype *addr, argtype val) {        \
+    union {                     /* the same 64 bits viewed as value and as long */ \
+      argtype value;                                                \
+      long bits;                                                    \
+    } prev, next;                                                   \
+    next.value = val;                                               \
+    prev.bits = atom_xchg((volatile aspace long *)addr, next.bits); \
+    return prev.value;          /* what atom_xchg handed back */    \
+  }
+
+/* ga_int (suffix i; the trailing g/l names the global/local variant): 32-bit built-ins */
+#define atom_add_ig(a, b) atomic_add(a, b)
+#define atom_add_il(a, b) atomic_add(a, b)
+#define atom_xchg_ig(a, b) atomic_xchg(a, b)
+#define atom_xchg_il(a, b) atomic_xchg(a, b)
+/* ga_uint (suffix I): same 32-bit built-ins */
+#define atom_add_Ig(a, b) atomic_add(a, b)
+#define atom_add_Il(a, b) atomic_add(a, b)
+#define atom_xchg_Ig(a, b) atomic_xchg(a, b)
+#define atom_xchg_Il(a, b) atomic_xchg(a, b)
+/* ga_float (suffix f): add is generated by gen_atom32_add, exchange uses the atomic_xchg built-in */
+gen_atom32_add(atom_add_fg, ga_float, global)
+gen_atom32_add(atom_add_fl, ga_float, local)
+#define atom_xchg_fg(a, b) atomic_xchg(a, b)
+#define atom_xchg_fl(a, b) atomic_xchg(a, b)
+
+#ifdef cl_khr_int64_base_atomics  /* everything 64-bit is defined only when this extension is advertised */
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable
+/* ga_long (suffix l): atom_* functions */
+#define atom_add_lg(a, b) atom_add(a, b)
+#define atom_add_ll(a, b) atom_add(a, b)
+#define atom_xchg_lg(a, b) atom_xchg(a, b)
+#define atom_xchg_ll(a, b) atom_xchg(a, b)
+/* ga_ulong (suffix L): same atom_* functions */
+#define atom_add_Lg(a, b) atom_add(a, b)
+#define atom_add_Ll(a, b) atom_add(a, b)
+#define atom_xchg_Lg(a, b) atom_xchg(a, b)
+#define atom_xchg_Ll(a, b) atom_xchg(a, b)
+/* ga_double (suffix d): additionally requires cl_khr_fp64; built from the gen_atom64_* generators */
+#ifdef cl_khr_fp64
+gen_atom64_add(atom_add_dg, ga_double, global)
+gen_atom64_add(atom_add_dl, ga_double, local)
+gen_atom64_xchg(atom_xchg_dg, ga_double, global)
+gen_atom64_xchg(atom_xchg_dl, ga_double, local)
+#endif  /* cl_khr_fp64 */
+#endif  /* cl_khr_int64_base_atomics */
+/* ga_half (suffix e): 16-bit atomics emulated with a 32-bit compare-and-swap on the int that contains the half */
+#define gen_atomh_add(name, aspace) /* atomic add on a ga_half; the sum is computed in float */ \
+  ga_half name(volatile aspace ga_half *addr, ga_half val);             \
+  ga_half name(volatile aspace ga_half *addr, ga_half val) {            \
+    ga_uint idx = ((ga_size)addr & 2) >> 1; /* which half of the 32-bit word addr points at */ \
+    volatile aspace int *base = (volatile aspace int *)((ga_size)addr & ~2); /* addr with bit 1 cleared */ \
+    union {                                                             \
+      int i;                                                            \
+      ga_half h[2];                                                     \
+    } o, a, n;                  /* o: observed word, a: expected word, n: word to store */ \
+    float fo;                                                           \
+    float fval;                                                         \
+    fval = ga_half2float(val);                                          \
+    o.i = *base;                                                        \
+    do {                                                                \
+      a.i = o.i;                                                        \
+      fo = ga_half2float(o.h[idx]);                                     \
+      n.i = o.i;                /* keep the neighbouring half untouched */ \
+      n.h[idx] = ga_float2half(fval + fo);                              \
+      o.i = atomic_cmpxchg(base, a.i, n.i);                             \
+    } while (o.i != a.i);       /* retry when the word changed under us */ \
+    return o.h[idx];            /* previous value, as the built-in atomic adds return */ \
+  }
+
+#define gen_atomh_xchg(name, aspace) /* atomic exchange of a ga_half via a 32-bit compare-and-swap loop */ \
+  ga_half name(volatile aspace ga_half *addr, ga_half val);             \
+  ga_half name(volatile aspace ga_half *addr, ga_half val) {            \
+    ga_uint slot = ((ga_size)addr & 2) >> 1; /* which half of the 32-bit word addr points at */ \
+    volatile aspace int *word = (volatile aspace int *)((ga_size)addr & ~2); \
+    union {                                                             \
+      int bits;                                                         \
+      ga_half halves[2];                                                \
+    } seen, expect, want;                                               \
+    seen.bits = *word;                                                  \
+    do {                                                                \
+      expect.bits = seen.bits;                                          \
+      want.bits = seen.bits;    /* keep the neighbouring half untouched */ \
+      want.halves[slot] = val;                                          \
+      seen.bits = atomic_cmpxchg(word, expect.bits, want.bits);         \
+    } while (seen.bits != expect.bits);                                 \
+    return seen.halves[slot];   /* the half that was stored before the swap */ \
+  }
+
+gen_atomh_add(atom_add_eg, global)    /* ga_half add, global address space */
+gen_atomh_add(atom_add_el, local)     /* ga_half add, local address space */
+gen_atomh_xchg(atom_xchg_eg, global)  /* ga_half exchange, global address space */
+gen_atomh_xchg(atom_xchg_el, local)   /* ga_half exchange, local address space */
+
+#endif  /* CLUDA_H */
diff --git a/src/cluda_opencl.h.c b/src/cluda_opencl.h.c
new file mode 100644
index 0000000000..4d6785c40c
--- /dev/null
+++ b/src/cluda_opencl.h.c
@@ -0,0 +1,768 @@
+static const char cluda_opencl_h[] = {
+0x23, 0x69, 0x66, 0x6e, 0x64, 0x65, 0x66, 0x20, 0x43, 0x4c, 0x55,
+0x44, 0x41, 0x5f, 0x48, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e,
+0x65, 0x20, 0x43, 0x4c, 0x55, 0x44, 0x41, 0x5f, 0x48, 0x0a, 0x23,
+0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x63, 0x61,
+0x6c, 0x5f, 0x62, 0x61, 0x72, 0x72, 0x69, 0x65, 0x72, 0x28, 0x29,
+0x20, 0x62, 0x61, 0x72, 0x72, 0x69, 0x65, 0x72, 0x28, 0x43, 0x4c,
+0x4b, 0x5f, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d,
+0x5f, 0x46, 0x45, 0x4e, 0x43, 0x45, 0x29, 0x0a, 0x23, 0x64, 0x65,
+0x66, 0x69, 0x6e, 0x65, 0x20, 0x57, 0x49, 0x54, 0x48, 0x49, 0x4e,
+0x5f, 0x4b, 0x45, 0x52, 0x4e, 0x45, 0x4c, 0x20, 0x2f, 0x2a, 0x20,
+0x65, 0x6d, 0x70, 0x74, 0x79, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64,
+0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4b, 0x45, 0x52, 0x4e, 0x45,
+0x4c, 0x20, 0x5f, 0x5f, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x0a,
+0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x4c, 0x4f,
+0x42, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x20, 0x5f, 0x5f, 0x67,
+0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69,
+0x6e, 0x65, 0x20, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, 0x45,
+0x4d, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x0a, 0x23,
+0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x4f, 0x43, 0x41,
+0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x5f, 0x41, 0x52, 0x47, 0x20, 0x5f,
+0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x0a, 0x2f, 0x2a, 0x20, 0x4e,
+0x41, 0x4e, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x6e, 0x64,
+0x65, 0x66, 0x20, 0x4e, 0x55, 0x4c, 0x4c, 0x0a, 0x20, 0x20, 0x23,
+0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4e, 0x55, 0x4c, 0x4c,
+0x20, 0x28, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x2a, 0x29, 0x30, 0x29,
+0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x2f, 0x2a, 0x20,
+0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x59, 0x20, 0x2a, 0x2f,
+0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49,
+0x44, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, 0x63,
+0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x0a, 0x23, 0x64,
+0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, 0x31,
+0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f,
+0x69, 0x64, 0x28, 0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69,
+0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, 0x32, 0x20, 0x67, 0x65,
+0x74, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28,
+0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20,
+0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, 0x5f,
+0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x28,
+0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20,
+0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x67, 0x65, 0x74, 0x5f,
+0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x28,
+0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20,
+0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 0x67, 0x65, 0x74, 0x5f,
+0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x28,
+0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20,
+0x47, 0x49, 0x44, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67,
+0x72, 0x6f, 0x75, 0x70, 0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x0a,
+0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44,
+0x5f, 0x31, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x72, 0x6f, 0x75,
+0x70, 0x5f, 0x69, 0x64, 0x28, 0x31, 0x29, 0x0a, 0x23, 0x64, 0x65,
+0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x32, 0x20,
+0x67, 0x65, 0x74, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x5f, 0x69,
+0x64, 0x28, 0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e,
+0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x30, 0x20, 0x67, 0x65,
+0x74, 0x5f, 0x6e, 0x75, 0x6d, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70,
+0x73, 0x28, 0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e,
+0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x67, 0x65,
+0x74, 0x5f, 0x6e, 0x75, 0x6d, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70,
+0x73, 0x28, 0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e,
+0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 0x67, 0x65,
+0x74, 0x5f, 0x6e, 0x75, 0x6d, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70,
+0x73, 0x28, 0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e,
+0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x6f, 0x6f, 0x6c, 0x20, 0x75,
+0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e,
+0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x20, 0x63,
+0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65,
+0x20, 0x67, 0x61, 0x5f, 0x75, 0x62, 0x79, 0x74, 0x65, 0x20, 0x75,
+0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e,
+0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20,
+0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69,
+0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, 0x72,
+0x74, 0x20, 0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23, 0x64,
+0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e,
+0x74, 0x20, 0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69,
+0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20,
+0x75, 0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e,
+0x65, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c,
+0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65,
+0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x75,
+0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e,
+0x65, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20,
+0x66, 0x6c, 0x6f, 0x61, 0x74, 0x0a, 0x23, 0x69, 0x66, 0x64, 0x65,
+0x66, 0x20, 0x63, 0x6c, 0x5f, 0x6b, 0x68, 0x72, 0x5f, 0x66, 0x70,
+0x36, 0x34, 0x0a, 0x23, 0x70, 0x72, 0x61, 0x67, 0x6d, 0x61, 0x20,
+0x4f, 0x50, 0x45, 0x4e, 0x43, 0x4c, 0x20, 0x45, 0x58, 0x54, 0x45,
+0x4e, 0x53, 0x49, 0x4f, 0x4e, 0x20, 0x63, 0x6c, 0x5f, 0x6b, 0x68,
+0x72, 0x5f, 0x66, 0x70, 0x36, 0x34, 0x3a, 0x20, 0x65, 0x6e, 0x61,
+0x62, 0x6c, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65,
+0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20,
+0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x0a, 0x23, 0x65, 0x6e, 0x64,
+0x69, 0x66, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20,
+0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x75, 0x6c, 0x6f,
+0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20,
+0x67, 0x61, 0x5f, 0x73, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x6c, 0x6f,
+0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20,
+0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41,
+0x52, 0x45, 0x44, 0x5f, 0x50, 0x41, 0x52, 0x41, 0x4d, 0x28, 0x74,
+0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x20,
+0x2c, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x20, 0x74,
+0x79, 0x70, 0x65, 0x20, 0x2a, 0x6e, 0x61, 0x6d, 0x65, 0x0a, 0x23,
+0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44,
+0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f,
+0x42, 0x4f, 0x44, 0x59, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20,
+0x6e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69,
+0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f,
+0x53, 0x49, 0x5a, 0x45, 0x20, 0x5f, 0x5f, 0x47, 0x41, 0x5f, 0x57,
+0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x0a, 0x0a, 0x74,
+0x79, 0x70, 0x65, 0x64, 0x65, 0x66, 0x20, 0x73, 0x74, 0x72, 0x75,
+0x63, 0x74, 0x20, 0x5f, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66,
+0x20, 0x7b, 0x0a, 0x20, 0x20, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x64,
+0x61, 0x74, 0x61, 0x3b, 0x0a, 0x7d, 0x20, 0x67, 0x61, 0x5f, 0x68,
+0x61, 0x6c, 0x66, 0x3b, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69,
+0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32,
+0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x70, 0x29, 0x20, 0x76, 0x6c,
+0x6f, 0x61, 0x64, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x30, 0x2c,
+0x20, 0x26, 0x28, 0x28, 0x70, 0x29, 0x2e, 0x64, 0x61, 0x74, 0x61,
+0x29, 0x29, 0x0a, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20, 0x69,
+0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61,
+0x6c, 0x66, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74,
+0x32, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x67, 0x61, 0x5f, 0x66, 0x6c,
+0x6f, 0x61, 0x74, 0x20, 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x72, 0x3b, 0x0a,
+0x20, 0x20, 0x76, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61,
+0x6c, 0x66, 0x5f, 0x72, 0x74, 0x65, 0x28, 0x66, 0x2c, 0x20, 0x30,
+0x2c, 0x20, 0x26, 0x72, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b,
+0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72,
+0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x23, 0x70, 0x72, 0x61, 0x67, 0x6d,
+0x61, 0x20, 0x4f, 0x50, 0x45, 0x4e, 0x43, 0x4c, 0x5f, 0x45, 0x58,
+0x54, 0x45, 0x4e, 0x53, 0x49, 0x4f, 0x4e, 0x20, 0x63, 0x6c, 0x5f,
+0x6b, 0x68, 0x72, 0x5f, 0x69, 0x6e, 0x74, 0x36, 0x34, 0x5f, 0x62,
+0x61, 0x73, 0x65, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x73,
+0x3a, 0x20, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x0a, 0x0a, 0x23,
+0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f,
+0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28,
+0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79,
+0x70, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c,
+0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20,
+0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69,
+0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61,
+0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61,
+0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70,
+0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61,
+0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65,
+0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61,
+0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70,
+0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e,
+0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61,
+0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x77, 0x3b, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a,
+0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x61,
+0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x61,
+0x20, 0x3d, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64,
+0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x77, 0x3b,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20,
+0x70, 0x2e, 0x61, 0x20, 0x2b, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c,
+0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20,
+0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d,
+0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61,
+0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65,
+0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72,
+0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20,
+0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68,
+0x69, 0x6c, 0x65, 0x20, 0x28, 0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d,
+0x20, 0x61, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
+0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a,
+0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65,
+0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64,
+0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67,
+0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63,
+0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70,
+0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61,
+0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65,
+0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c,
+0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x3b, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74,
+0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f,
+0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61,
+0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20,
+0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74,
+0x79, 0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20,
+0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65,
+0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20,
+0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20,
+0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e,
+0x67, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70,
+0x2e, 0x61, 0x20, 0x3d, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x3b,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20,
+0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e,
+0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c,
+0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20,
+0x3d, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x2b, 0x20, 0x76, 0x61, 0x6c,
+0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e,
+0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x63, 0x6d,
+0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61,
+0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65,
+0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64,
+0x72, 0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b,
+0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20,
+0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x70, 0x2e, 0x77, 0x20,
+0x21, 0x3d, 0x20, 0x61, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20,
+0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x61, 0x3b,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20,
+0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20,
+0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f,
+0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20,
+0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x61, 0x73,
+0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70,
+0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61,
+0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65,
+0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c,
+0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x3b, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70,
+0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61,
+0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65,
+0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61,
+0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70,
+0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20,
+0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f,
+0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72,
+0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f,
+0x6e, 0x67, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c,
+0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20,
+0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20,
+0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67,
+0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20,
+0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67,
+0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x6e, 0x2e,
+0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75,
+0x72, 0x6e, 0x20, 0x70, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x2f, 0x2a, 0x20,
+0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23,
+0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d,
+0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20,
+0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61,
+0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64,
+0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f,
+0x61, 0x64, 0x64, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62,
+0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64,
+0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65,
+0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78,
+0x63, 0x68, 0x67, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62,
+0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63,
+0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64,
+0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f,
+0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20,
+0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78,
+0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f,
+0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a,
+0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61,
+0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x67, 0x28,
+0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69,
+0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29,
+0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74,
+0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x6c, 0x28, 0x61,
+0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63,
+0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a,
+0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f,
+0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x67, 0x28, 0x61,
+0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63,
+0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29,
+0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74,
+0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x6c, 0x28,
+0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69,
+0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62,
+0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f,
+0x61, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61,
+0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61,
+0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x2c,
+0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2c, 0x20,
+0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e,
+0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64,
+0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66,
+0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74,
+0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x23, 0x64,
+0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f,
+0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20,
+0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78,
+0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23,
+0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d,
+0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c,
+0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f,
+0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a,
+0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, 0x66, 0x20, 0x63, 0x6c, 0x5f,
+0x6b, 0x68, 0x72, 0x5f, 0x69, 0x6e, 0x74, 0x36, 0x34, 0x5f, 0x62,
+0x61, 0x73, 0x65, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x73,
+0x0a, 0x23, 0x70, 0x72, 0x61, 0x67, 0x6d, 0x61, 0x20, 0x4f, 0x50,
+0x45, 0x4e, 0x43, 0x4c, 0x20, 0x45, 0x58, 0x54, 0x45, 0x4e, 0x53,
+0x49, 0x4f, 0x4e, 0x20, 0x63, 0x6c, 0x5f, 0x6b, 0x68, 0x72, 0x5f,
+0x69, 0x6e, 0x74, 0x36, 0x34, 0x5f, 0x62, 0x61, 0x73, 0x65, 0x5f,
+0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x73, 0x3a, 0x20, 0x65, 0x6e,
+0x61, 0x62, 0x6c, 0x65, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f,
+0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65,
+0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61,
+0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29,
+0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61,
+0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e,
+0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f,
+0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74,
+0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62,
+0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61,
+0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67,
+0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d,
+0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29,
+0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74,
+0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x6c, 0x28,
+0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f,
+0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a,
+0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67,
+0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65,
+0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c,
+0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f,
+0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29,
+0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74,
+0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61,
+0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61,
+0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64,
+0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f,
+0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20,
+0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68,
+0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65,
+0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78,
+0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62,
+0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67,
+0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67,
+0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x2f,
+0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, 0x66, 0x20, 0x63, 0x6c, 0x5f,
+0x6b, 0x68, 0x72, 0x5f, 0x66, 0x70, 0x36, 0x34, 0x0a, 0x67, 0x65,
+0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64,
+0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f,
+0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62,
+0x6c, 0x65, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29,
+0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34,
+0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61,
+0x64, 0x64, 0x5f, 0x64, 0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64,
+0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61,
+0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d,
+0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f,
+0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x2c, 0x20,
+0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20,
+0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e,
+0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68,
+0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67,
+0x5f, 0x64, 0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75,
+0x62, 0x6c, 0x65, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29,
+0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x23, 0x65, 0x6e,
+0x64, 0x69, 0x66, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x68,
+0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66,
+0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f,
+0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65,
+0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61,
+0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c,
+0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63,
+0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a,
+0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61,
+0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c,
+0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20,
+0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69,
+0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x67,
+0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64,
+0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20,
+0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20,
+0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x69,
+0x64, 0x78, 0x20, 0x3d, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73,
+0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20,
+0x32, 0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, 0x3b, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x76,
+0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70,
+0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61,
+0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74,
+0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20,
+0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f,
+0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26,
+0x20, 0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20,
+0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f,
+0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, 0x32, 0x5d, 0x3b, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20,
+0x20, 0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, 0x20, 0x61, 0x2c, 0x20,
+0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20,
+0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6f, 0x3b, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f,
+0x61, 0x74, 0x20, 0x66, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c,
+0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x3d,
+0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c,
+0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20,
+0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61,
+0x73, 0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64,
+0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x2e,
+0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x20, 0x3d, 0x20,
+0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f,
+0x61, 0x74, 0x28, 0x6f, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d,
+0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x6e, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e,
+0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x20, 0x3d, 0x20,
+0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61,
+0x6c, 0x66, 0x28, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x66,
+0x6f, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69,
+0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x63,
+0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x62, 0x61, 0x73, 0x65,
+0x2c, 0x20, 0x61, 0x2e, 0x69, 0x2c, 0x20, 0x6e, 0x2e, 0x69, 0x29,
+0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20,
+0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20,
+0x28, 0x6f, 0x2e, 0x69, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x2e, 0x69,
+0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20,
+0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x68, 0x5b,
+0x69, 0x64, 0x78, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64,
+0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61,
+0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e,
+0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65,
+0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f,
+0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76,
+0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70,
+0x61, 0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66,
+0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f,
+0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c,
+0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61,
+0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65,
+0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61,
+0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c,
+0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a,
+0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74,
+0x20, 0x69, 0x64, 0x78, 0x20, 0x3d, 0x20, 0x28, 0x28, 0x67, 0x61,
+0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20,
+0x26, 0x20, 0x32, 0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, 0x3b, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20,
+0x20, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61,
+0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a,
+0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, 0x6c,
+0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63,
+0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67,
+0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72,
+0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, 0x0a, 0x20,
+0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67,
+0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, 0x32, 0x5d,
+0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c,
+0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, 0x20, 0x61,
+0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20,
+0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61,
+0x73, 0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64,
+0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x2e,
+0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x69, 0x20, 0x3d,
+0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d,
+0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69,
+0x63, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x62,
+0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x2e, 0x69, 0x2c, 0x20, 0x6e,
+0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69,
+0x6c, 0x65, 0x20, 0x28, 0x6f, 0x2e, 0x69, 0x20, 0x21, 0x3d, 0x20,
+0x61, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20,
+0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6f,
+0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x3b, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a,
+0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f,
+0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64,
+0x64, 0x5f, 0x65, 0x67, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61,
+0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d,
+0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f,
+0x61, 0x64, 0x64, 0x5f, 0x65, 0x6c, 0x2c, 0x20, 0x6c, 0x6f, 0x63,
+0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f,
+0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f,
+0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x2c, 0x20,
+0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e,
+0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67,
+0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f,
+0x65, 0x6c, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a,
+0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x00};
diff --git a/src/gen_types.py b/src/gen_types.py
index c9e356e9a9..0e87fe23f3 100644
--- a/src/gen_types.py
+++ b/src/gen_types.py
@@ -73,7 +73,7 @@ def add_type(name, C, sz):
       int16_t exp;
       uint16_t hi;
       uint32_t lo;
-    };
+    } s;
     uint128_t raw;
   } u;
 } ga_quad;
@@ -165,7 +165,6 @@ def add_type(name, sz):
  * List of all built-in types.
  */
 enum GPUARRAY_TYPES {
-  GA_POINTER = -2,
   GA_BUFFER = -1,
 % for i, v in sorted(TYPEMAP.items()):
   GA_${v[1].upper()} = ${i},
diff --git a/src/gpuarray/array.h b/src/gpuarray/array.h
index 8cbad94098..a99366a7c4 100644
--- a/src/gpuarray/array.h
+++ b/src/gpuarray/array.h
@@ -6,12 +6,7 @@
  */
 
 #include <gpuarray/buffer.h>
-
-#ifdef _MSC_VER
-#ifndef inline
-#define inline __inline
-#endif
-#endif
+#include <gpuarray/util.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -196,6 +191,13 @@ static inline int GpuArray_CHKFLAGS(const GpuArray *a, int flags) {
  */
 #define GpuArray_ITEMSIZE(a) gpuarray_get_elsize((a)->typecode)
 
+/**
+ * Fix the flags of an array using the current strides and shape.
+ *
+ * \param a GpuArray to fix flags for
+ */
+GPUARRAY_PUBLIC void GpuArray_fix_flags(GpuArray *a);
+
 /**
  * Initialize and allocate a new empty (uninitialized data) array.
  *
@@ -266,11 +268,6 @@ GPUARRAY_PUBLIC int GpuArray_fromdata(GpuArray *a,
                                       const size_t *dims,
                                       const ssize_t *strides, int writeable);
 
-GPUARRAY_PUBLIC int GpuArray_copy_from_host(GpuArray *a,
-                                            gpucontext *ctx, void *buf, int typecode,
-                                            unsigned int nd, const size_t *dims,
-                                            const ssize_t *strides);
-
 /**
  * Initialize an array structure to provide a view of another.
  *
@@ -529,7 +526,7 @@ GPUARRAY_PUBLIC int GpuArray_copy(GpuArray *res, const GpuArray *a,
  * Source and target arrays must be contiguous.  This restriction may
  * be lifted in the future.
  *
- * \param r result array
+ * \param res result array
  * \param a array to transfer
  *
  * \return GA_NO_ERROR if the operation was succesful.
@@ -607,6 +604,45 @@ GPUARRAY_PUBLIC void GpuArray_fprintf(FILE *fd, const GpuArray *a);
 
 GPUARRAY_PUBLIC int GpuArray_fdump(FILE *fd, const GpuArray *a);
 
+/**
+ * @brief Computes simultaneously the maxima and the arguments of maxima over
+ * specified axes of the tensor.
+ *
+ * Returns two tensors of identical shape. Both tensors' axes are a subset of
+ * the axes of the original tensor. The axes to be reduced are specified by
+ * the caller, and the maxima and arguments of maxima are computed over them.
+ *
+ * @param [out] dstMax     The resulting tensor of maxima
+ * @param [out] dstArgmax  The resulting tensor of arguments at maxima
+ * @param [in]  src        The source tensor.
+ * @param [in]  reduxLen   The number of axes reduced. Must be >= 1 and
+ *                         <= src->nd.
+ * @param [in]  reduxList  A list of integers of length reduxLen, indicating
+ *                         the axes to be reduced. The order of the axes
+ *                         matters for dstArgmax index calculations. All
+ *                         entries in the list must be unique, >= 0 and
+ *                         < src->nd.
+ *                         
+ *                         For example, if a 5D-tensor is reduced with an axis
+ *                         list of [3,4,1], then reduxLen shall be 3, and the
+ *                         index calculation in every point shall take the form
+ *                         
+ *                             dstArgmax[i0,i2] = i3 * src.shape[4] * src.shape[1] +
+ *                                                i4 * src.shape[1]                +
+ *                                                i1
+ *                         
+ *                         where (i3,i4,i1) are the coordinates of the maximum-
+ *                         valued element within subtensor [i0,:,i2,:,:] of src.
+ * @return GA_NO_ERROR if the operation was successful, or a non-zero error
+ *         code otherwise.
+ */
+
+GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray*       dstMax,
+                                          GpuArray*       dstArgmax,
+                                          const GpuArray* src,
+                                          unsigned        reduxLen,
+                                          const unsigned* reduxList);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gpuarray/blas.h b/src/gpuarray/blas.h
index d43d07b348..f76b209a97 100644
--- a/src/gpuarray/blas.h
+++ b/src/gpuarray/blas.h
@@ -8,6 +8,12 @@
 extern "C" {
 #endif
 
+// only for vector-vector dot
+GPUARRAY_PUBLIC int GpuArray_rdot(GpuArray *X, GpuArray *Y,
+                                  GpuArray *Z, int nocopy);
+#define GpuArray_hdot GpuArray_rdot
+#define GpuArray_sdot GpuArray_rdot
+#define GpuArray_ddot GpuArray_rdot
 GPUARRAY_PUBLIC int GpuArray_rgemv(cb_transpose transA, double alpha,
                                    GpuArray *A, GpuArray *X, double beta,
                                    GpuArray *Y, int nocopy);
@@ -28,6 +34,7 @@ GPUARRAY_PUBLIC int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y,
 GPUARRAY_PUBLIC int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB,
                                            double alpha, GpuArray *A, GpuArray *B,
                                            double beta, GpuArray *C, int nocopy);
+#define GpuArray_hgemmBatch_3d GpuArray_rgemmBatch_3d
 #define GpuArray_sgemmBatch_3d GpuArray_rgemmBatch_3d
 #define GpuArray_dgemmBatch_3d GpuArray_rgemmBatch_3d
 
diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h
index ee83a80f67..e8b06cca97 100644
--- a/src/gpuarray/buffer.h
+++ b/src/gpuarray/buffer.h
@@ -44,99 +44,184 @@ struct _gpukernel;
 typedef struct _gpukernel gpukernel;
 
 /**
- * \brief Gets information about the number of available platforms for the
+ * Gets information about the number of available platforms for the
  * backend specified in `name`.
- * \param name [const char*] the backend name
- * \param platcount [unsigned int*] will contain number of compatible platforms in host
- * \return int GA_NO_ERROR, if success
+ *
+ * \param name the backend name
+ * \param platcount will contain number of compatible
+ *                  platforms in host
+ *
+ * \return #GA_NO_ERROR, if success
  */
 GPUARRAY_PUBLIC int gpu_get_platform_count(const char* name,
                                            unsigned int* platcount);
 
 /**
- * \brief Gets information about the number of compatible devices on a specific
- * host's `platform` for the backend specified in `name`.
- * \param name [const char*] the backend name
- * \param platform [unsigned int] number for a platform in host
- * \param devcount [unsigned int*] will contain number of compatible devices in
- * `platform`
- * \return int GA_NO_ERROR, if success
+ * Gets information about the number of compatible devices on a
+ * specific host's `platform` for the backend specified in `name`.
+ *
+ * \param name the backend name
+ * \param platform number for a platform in host
+ * \param devcount will contain number of compatible devices in
+ *                 `platform`
+ *
+ * \return #GA_NO_ERROR, if success
  */
 GPUARRAY_PUBLIC int gpu_get_device_count(const char* name,
                                          unsigned int platform,
                                          unsigned int* devcount);
 
 
+/**
+ * Opaque structure that holds properties for the context.
+ */
+typedef struct _gpucontext_props gpucontext_props;
 
 /**
- * Create a context on the specified device.
+ * Allocate and initialize an instance of gpucontext_props.
  *
- * \warning This function is not thread-safe.
+ * Initialization is done with default values.
  *
- * \param name the backend name.
- * \param dev the device number.  The precise meaning of the device
- *            number is backend-dependent
- * \param flags see \ref context_flags "Context flags"
- * \param ret error return location.  Will be ignored if set to NULL.
+ * \param res pointer to storage space for the created object
  *
- * \returns An opaque pointer to the created context or NULL if an
- * error occured.
+ * \returns GA_NO_ERROR or an error code if an error occurred.
  */
-GPUARRAY_PUBLIC gpucontext *gpucontext_init(const char *name, int dev,
-                                            int flags, int *ret);
+GPUARRAY_PUBLIC int gpucontext_props_new(gpucontext_props **res);
 
 /**
- * \defgroup context_flags Context flags
- * @{
+ * Set the device number for a CUDA device.
+ *
+ * \param p properties object
+ * \param devno device number
+ *
+ * \returns GA_NO_ERROR or an error code if an error occurred.
  */
+GPUARRAY_PUBLIC int gpucontext_props_cuda_dev(gpucontext_props *p, int devno);
+
 
 /**
- * Let the backend decide on optimal parameters, using backend-defined
- * heuristics and defaults.
+ * Set the platform and device for OpenCL.
  *
- * This is the default (0) value.
+ * \param p properties object
+ * \param platno platform number
+ * \param devno device number
+ *
+ * \returns GA_NO_ERROR or an error code if an error occurred.
  */
-#define GA_CTX_DEFAULT       0x00
+GPUARRAY_PUBLIC int gpucontext_props_opencl_dev(gpucontext_props *p,
+                                                int platno, int devno);
 
 /**
- * Optimize parameters for multi-thread performance.
+ * Set the scheduling mode for the device.
  *
- * May decrease overall performance in single-thread scenarios.
+ * \param p properties object
+ * \param sched scheduling mode.  One of \ref sched_modes "these".
+ *
+ * \returns GA_NO_ERROR or an error code if an error occurred.
+ */
+GPUARRAY_PUBLIC int gpucontext_props_sched(gpucontext_props *p, int sched);
+
+/** \defgroup sched_modes Scheduling modes
+ * @{
  */
-#define GA_CTX_MULTI_THREAD  0x01
 
 /**
- * Optimize parameters for single-thread performance.
+ * Automatic scheduling, decide what to do depending on the workload,
+ * number of cores in the computer and other relevant factors. (default)
+ */
+#define GA_CTX_SCHED_AUTO   0
+
+/**
+ * Single-work scheduling.  Optimize for speed in a single process,
+ * with a single thread.  This is the fastest mode, but it may keep
+ * the CPU busy more than necessary.
+ */
+#define GA_CTX_SCHED_SINGLE 1
+
+/**
+ * Multi-work scheduling.  Try to not keep the CPU busy more than
+ * necessary and give other threads a chance at some CPU time.  This
+ * may increase the latency when waiting for GPU operations.
+ */
+#define GA_CTX_SCHED_MULTI  2
+
+/** @}*/
+
+/**
+ * Set single-stream mode.
+ *
+ * All operations on the device will be serialized on a single stream.
+ * This will also disable most of the interlocking normally done
+ * between multiple streams to keep everything in order.
  *
- * May decrease overall performace in multithread scenarios.
+ * This mode can be faster if you don't have a lot of device-level
+ * parallelism in your workload.
+ *
+ * \param p properties object
+ *
+ * \returns GA_NO_ERROR or an error code if an error occurred.
  */
-#define GA_CTX_SINGLE_THREAD 0x02
+GPUARRAY_PUBLIC int gpucontext_props_set_single_stream(gpucontext_props *p);
 
 /**
- * Allocate a single stream per context, performing all operations in order.
+ * Set the path for the kernel cache.
  *
- * This will remove any attempt at exploiting parallelism in the
- * underlying device by performing unrelated operations concurrently
- * and/or out of order.
+ * The cache can be shared with other running instances, even on
+ * shared drives.
  *
- * This can help performance by removing the small cost paid for each
- * operation to keep everything coherent in the face of parallelism.
- * It can also hinder performance by not exploiting concurrency.
+ * \param p properties object
+ * \param path desired location of the kernel cache
+ *
+ * \returns GA_NO_ERROR or an error code if an error occurred.
  */
-#define GA_CTX_SINGLE_STREAM 0x4
+GPUARRAY_PUBLIC int gpucontext_props_kernel_cache(gpucontext_props *p,
+                                                  const char *path);
 
 /**
- * Disable allocations cache (if any).
+ * Configure the allocation cache.
+ *
+ * The maximum size is also a limit on the total amount of memory
+ * allocated on the device.
  *
- * This will usually decrease performance by quite a bit, but will
- * enable better debugging of kernels that perform out of bounds
- * access.
+ * \param p properties object
+ * \param initial initial size of the cache
+ * \param max maximum size of the cache
+ *
+ * \returns GA_NO_ERROR or an error code if an error occurred.
  */
-#define GA_CTX_DISABLE_ALLOCATION_CACHE 0x10
+GPUARRAY_PUBLIC int gpucontext_props_alloc_cache(gpucontext_props *p,
+                                                 size_t initial, size_t max);
 
 /**
- * @}
+ * Free a properties object.
+ *
+ * This should not be called on a properties object that has been
+ * passed to gpucontext_init(), which takes over management of the
+ * object.
+ *
+ * \param p properties object
+ *
  */
+GPUARRAY_PUBLIC void gpucontext_props_del(gpucontext_props *p);
+
+/**
+ * Create a context on the specified device.
+ *
+ * \warning This function is not thread-safe.
+ *
+ * The passed-in properties pointer will be managed by this function
+ * and need not be freed.  This means that you shouldn't touch the
+ * properties object after passing it to this function.
+ *
+ * \param res pointer to storage space for the created context
+ * \param name the backend name.
+ * \param props a properties object for the context.  Can be NULL for
+ *              defaults.
+ *
+ * \returns GA_NO_ERROR or an error code if an error occurred.
+ */
+GPUARRAY_PUBLIC int gpucontext_init(gpucontext **res, const char *name,
+                                    gpucontext_props *props);
 
 /**
  * Dereference a context.
@@ -328,9 +413,9 @@ GPUARRAY_PUBLIC int gpudata_move(gpudata *dst, size_t dstoff,
  * \returns the new buffer in dst_ctx or NULL if no efficient way to
  *          transfer could be found.
  */
-GPUARRAY_LOCAL int gpudata_transfer(gpudata *dst, size_t dstoff,
-                                    gpudata *src, size_t srcoff,
-                                    size_t sz);
+GPUARRAY_PUBLIC int gpudata_transfer(gpudata *dst, size_t dstoff,
+                                     gpudata *src, size_t srcoff,
+                                     size_t sz);
 
 /**
  * Transfer data from a buffer to memory.
@@ -422,6 +507,8 @@ GPUARRAY_PUBLIC gpucontext *gpudata_context(gpudata *b);
  * \param strings table of string pointers
  * \param lengths (optional) length for each string in the table
  * \param fname name of the kernel function (as defined in the code)
+ * \param numargs number of kernel arguments
+ * \param typecodes the type of each argument
  * \param flags flags for compilation (see #ga_usefl)
  * \param ret error return pointer
  * \param err_str returns pointer to debug message from GPU backend
@@ -482,35 +569,17 @@ GPUARRAY_PUBLIC int gpukernel_setarg(gpukernel *k, unsigned int i, void *a);
  *
  * \param k kernel
  * \param n number of dimensions of grid/block
- * \param bs block sizes for this call (also known as local size)
  * \param gs grid sizes for this call (also known as global size)
+ * \param ls block sizes for this call (also known as local size)
  * \param shared amount of dynamic shared memory to reserve
  * \param args table of pointers to each argument (optional).
  *
  * \returns GA_NO_ERROR or an error code if an error occurred.
  */
 GPUARRAY_PUBLIC int gpukernel_call(gpukernel *k, unsigned int n,
-                                   const size_t *ls, const size_t *gs,
+                                   const size_t *gs, const size_t *ls,
                                    size_t shared, void **args);
 
-/**
- * Get the kernel binary.
- *
- * This can be use to cache kernel binaries after compilation of a
- * specific device.  The kernel can be recreated by calling
- * kernel_alloc with the binary and size and passing `GA_USE_BINARY`
- * as the use flags.
- *
- * The returned pointer is allocated and must be freed by the caller.
- *
- * \param k kernel
- * \param sz size of the returned binary
- * \param obj pointer to the binary for the kernel.
- *
- * \returns GA_NO_ERROR or an error code if an error occurred.
- */
-GPUARRAY_PUBLIC int gpukernel_binary(gpukernel *k, size_t *sz, void **obj);
-
 /**
  * Fetch a property.
  *
@@ -537,19 +606,11 @@ GPUARRAY_PUBLIC gpucontext *gpukernel_context(gpukernel *k);
 /**
  * Get the device name for the context.
  *
- * \note The returned string is allocated and must be freed by the caller.
- *
- * Type: `char *`
+ * Type: `char [256]`
  */
 #define GA_CTX_PROP_DEVNAME  1
 
-/**
- * Get the maximum block size (also known as local size) for a kernel
- * call in the context.
- *
- * Type: `size_t`
- */
-#define GA_CTX_PROP_MAXLSIZE 2
+/* UNUSED: 2 */
 
 /**
  * Get the local memory size available for a call in the context.
@@ -569,23 +630,9 @@ GPUARRAY_PUBLIC gpucontext *gpukernel_context(gpukernel *k);
  */
 #define GA_CTX_PROP_NUMPROCS 4
 
-/**
- * Get the maximum group size for a kernel call in this context.
- *
- * Type: `size_t`
- */
-#define GA_CTX_PROP_MAXGSIZE  5
+/* UNUSED: 5 */
 
-/**
- * Get the vector of blas ops for the context.
- *
- * This may differ from one context to the other in the same backend
- * depending of the availability and performance of various BLAS
- * libraries.
- *
- * Type: `const gpuarray_blas_ops *`
- */
-#define GA_CTX_PROP_BLAS_OPS  6
+/* UNUSED: 6 */
 
 /**
  * Get the compatibility ID for the binaries generated with this context.
@@ -673,12 +720,21 @@ GPUARRAY_PUBLIC gpucontext *gpukernel_context(gpukernel *k);
  */
 #define GA_CTX_PROP_MAXLSIZE2 17
 
+/* UNUSED: 18 */
+
 /**
- * Get the vector of collective ops for the context.
+ * Get a unique ID for the device behind the context.
  *
- * Type: `const gpuarray_comm_ops *`
+ * Type: `char [16]`
  */
-#define GA_CTX_PROP_COMM_OPS  18
+#define GA_CTX_PROP_UNIQUE_ID 19
+
+/**
+ * Get the largest single block of memory that can be allocated.
+ *
+ * Type: `size_t`
+ */
+#define GA_CTX_PROP_LARGEST_MEMBLOCK 20
 
 /* Start at 512 for GA_BUFFER_PROP_ */
 #define GA_BUFFER_PROP_START  512
@@ -766,10 +822,7 @@ GPUARRAY_PUBLIC gpucontext *gpukernel_context(gpukernel *k);
  * cases result in silent data corruption (especially on ATI cards).
  */
 typedef enum _ga_usefl {
-  /**
-   * The kernel source uses CLUDA unified language.
-   */
-  GA_USE_CLUDA =      0x01,
+  /* UNUSED: 0x01 */
   /**
    * The kernel makes use of small (size is smaller than 4 bytes) types.
    */
@@ -786,12 +839,6 @@ typedef enum _ga_usefl {
    * The kernel makes use of half-floats (also known as float16)
    */
   GA_USE_HALF =       0x10,
-  /**
-   * The source code passed is actually a kernel binary.
-   *
-   * For the cuda backend this can also be a PTX module.
-   */
-  GA_USE_BINARY =     0x20,
   /* If you add a new flag, don't forget to update both
      gpuarray_buffer_{cuda,opencl}.c with the implementation of your flag */
   /**
diff --git a/src/gpuarray/buffer_blas.h b/src/gpuarray/buffer_blas.h
index 6e36c33f37..f7af64c478 100644
--- a/src/gpuarray/buffer_blas.h
+++ b/src/gpuarray/buffer_blas.h
@@ -38,6 +38,24 @@ GPUARRAY_PUBLIC void gpublas_teardown(gpucontext *ctx);
 
 GPUARRAY_PUBLIC const char *gpublas_error(gpucontext *ctx);
 
+GPUARRAY_PUBLIC int gpublas_hdot(
+        size_t N,
+        gpudata *X, size_t offX, size_t incX,
+        gpudata *Y, size_t offY, size_t incY,
+        gpudata *Z, size_t offZ);
+
+GPUARRAY_PUBLIC int gpublas_sdot(
+        size_t N,
+        gpudata *X, size_t offX, size_t incX,
+        gpudata *Y, size_t offY, size_t incY,
+        gpudata *Z, size_t offZ);
+
+GPUARRAY_PUBLIC int gpublas_ddot(
+        size_t N,
+        gpudata *X, size_t offX, size_t incX,
+        gpudata *Y, size_t offY, size_t incY,
+        gpudata *Z, size_t offZ);
+
 GPUARRAY_PUBLIC int gpublas_hgemv(
   cb_order order, cb_transpose transA, size_t M, size_t N, float alpha,
   gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX,
@@ -97,6 +115,30 @@ GPUARRAY_PUBLIC int gpublas_hgemmBatch(
   float beta, gpudata **C, size_t *offC, size_t ldc,
   size_t batchCount, int flags);
 
+GPUARRAY_PUBLIC int gpublas_hgemm3D(
+  cb_order order, cb_transpose transA, cb_transpose transB,
+  size_t M, size_t N, size_t K, float alpha,
+  gpudata *A, size_t offA, size_t lda, ssize_t strideA,
+  gpudata *B, size_t offB, size_t ldb, ssize_t strideB,
+  float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC,
+  size_t batchCount, int flags);
+
+GPUARRAY_PUBLIC int gpublas_sgemm3D(
+  cb_order order, cb_transpose transA, cb_transpose transB,
+  size_t M, size_t N, size_t K, float alpha,
+  gpudata *A, size_t offA, size_t lda, ssize_t strideA,
+  gpudata *B, size_t offB, size_t ldb, ssize_t strideB,
+  float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC,
+  size_t batchCount, int flags);
+
+GPUARRAY_PUBLIC int gpublas_dgemm3D(
+  cb_order order, cb_transpose transA, cb_transpose transB,
+  size_t M, size_t N, size_t K, double alpha,
+  gpudata *A, size_t offA, size_t lda, ssize_t strideA,
+  gpudata *B, size_t offB, size_t ldb, ssize_t strideB,
+  double beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC,
+  size_t batchCount, int flags);
+
 GPUARRAY_PUBLIC int gpublas_sgemmBatch(
   cb_order order, cb_transpose transA, cb_transpose transB,
   size_t M, size_t N, size_t K, float alpha,
diff --git a/src/gpuarray/buffer_collectives.h b/src/gpuarray/buffer_collectives.h
index a7b10b3d19..bad5561814 100644
--- a/src/gpuarray/buffer_collectives.h
+++ b/src/gpuarray/buffer_collectives.h
@@ -11,23 +11,23 @@ extern "C" {
 }
 #endif  // CONFUSE_EMACS
 
-/*******************************************************************************
-*                   Multi-gpu collectives buffer interface                    *
-*******************************************************************************/
+/*****************************************************************************
+*                   Multi-gpu collectives buffer interface                   *
+******************************************************************************/
 
 /**
  * Multi-gpu communicator structure.
- *
- * \note The contents are private.
  */
 struct _gpucomm;
 
 typedef struct _gpucomm gpucomm;
 
-/**
- * Enum for reduce ops of gpucomm
+/**
+ * \enum gpucomm_reduce_ops
+ *
+ * \brief Reduction operations
  */
-enum _gpucomm_reduce_ops {
+enum gpucomm_reduce_ops {
   GA_SUM = 0,   //!< to sum (elemwise) arrays across ranks
   GA_PROD = 1,  //!< to multiply (elemwise) arrays across ranks
   GA_MAX = 2,   //!< to find max (elemwise) of arrays across ranks
@@ -44,125 +44,133 @@ typedef struct _gpucommCliqueId {
 } gpucommCliqueId;
 
 /**
- * \brief Create a new gpu communicator instance.
- * \param comm [gpucomm**] pointer to get a new gpu communicator
- * \param ctx [gpucontext*] gpu context in which `comm` will be used (contains
- * device
- * information)
- * \param comm_id [gpucommCliqueId] id unique to communicators consisting a
- * world
- * \param ndev [int] number of communicators/devices participating in the world
- * \param rank [int] user-defined rank, from 0 to `ndev`-1, of `comm` in the
- * world
- * \note `rank` is defined to be unique for each new `comm` participating in the
- * same
- * world.
- * \note Must be called in parallel by all separate new `comm`, which will
- * consist a
- * new world (failing will lead to deadlock).
- * \return int error code, \ref GA_NO_ERROR if success
+ * Create a new gpu communicator instance.
+ *
+ * This must be called in parallel by all participants in the same
+ * world.  The call will block until all participants have joined in.
+ * The world is defined by a shared comm_id.
+ *
+ * \param comm pointer to get a new gpu communicator
+ * \param ctx gpu context in which `comm` will be used
+ *            (contains device information)
+ * \param comm_id id shared by the communicators that make up a world
+ * \param ndev number of communicators/devices participating in the world
+ * \param rank user-defined rank, from 0 to `ndev`-1.  Must be unique
+ *             for the world.
+ *
+ * \returns error code or #GA_NO_ERROR if success
  */
 GPUARRAY_PUBLIC int gpucomm_new(gpucomm** comm, gpucontext* ctx,
                                 gpucommCliqueId comm_id, int ndev, int rank);
 
 /**
- * \brief Destroy a gpu communicator instance.
- * \param comm [gpucomm*] gpu communicator to be destroyed
- * \return void
+ * Destroy a gpu communicator instance.
+ *
+ * \param comm gpu communicator to be destroyed
  */
 GPUARRAY_PUBLIC void gpucomm_free(gpucomm* comm);
 
 /**
- * \brief Returns nice error message concerning \ref GA_COMM_ERROR.
- * \param ctx [gpucontext*] gpu context in which communicator was used
- * \return const char* useful backend error message
+ * Returns nice error message concerning \ref GA_COMM_ERROR.
+ *
+ * \param ctx gpu context in which communicator was used
+ *
+ * \returns useful backend error message
  */
 GPUARRAY_PUBLIC const char* gpucomm_error(gpucontext* ctx);
 
 /**
- * \brief Returns gpu context in which `comm` is used.
- * \param comm [gpucomm*] gpu communicator
- * \return gpucontext* gpu context
+ * Returns gpu context in which `comm` is used.
+ *
+ * \param comm gpu communicator
+ *
+ * \returns gpu context
  */
 GPUARRAY_PUBLIC gpucontext* gpucomm_context(gpucomm* comm);
 
 /**
- * \brief Creates a unique `comm_id` to be shared in a world of communicators.
- * \param ctx [gpucontext*] gpu context
- * \param comm_id [gpucommCliqueId*] pointer to instance containing id
- * \note Id is guaranteed to be unique across callers in a single host.
- * \return int error code, \ref GA_NO_ERROR if success
+ * Creates a unique `comm_id`.
+ *
+ * The id is guaranteed to be unique in the same host, but not
+ * necessarily across hosts.
+ *
+ * \param ctx gpu context
+ * \param comm_id pointer to instance containing id
+ *
+ * \return error code or #GA_NO_ERROR if success
  */
 GPUARRAY_PUBLIC int gpucomm_gen_clique_id(gpucontext* ctx,
                                           gpucommCliqueId* comm_id);
 
 /**
- * \brief Returns total number of device/communicators participating in `comm`'s
- * world.
- * \param comm [gpucomm*] gpu communicator
- * \param gpucount [int*] pointer to number of gpus in `comm`'s world
- * \return int error code, \ref GA_NO_ERROR if success
+ * Returns total number of devices participating in `comm`'s world.
+ *
+ * \param comm gpu communicator
+ * \param devcount pointer to store the number of devices
+ *
+ * \return error code or #GA_NO_ERROR if success
  */
-GPUARRAY_PUBLIC int gpucomm_get_count(gpucomm* comm, int* gpucount);
+GPUARRAY_PUBLIC int gpucomm_get_count(gpucomm* comm, int* devcount);
 
 /**
- * \brief Returns rank of `comm` inside its world as defined by user upon
- * creation.
- * \param comm [gpucomm*] gpu communicator
- * \param rank [int*] pointer to `comm`'s rank
- * \return int error code, \ref GA_NO_ERROR if success
+ * Returns the rank of `comm` inside its world.
+ *
+ * \param comm gpu communicator
+ * \param rank pointer to store the rank
+ *
+ * \return error code or #GA_NO_ERROR if success
  */
 GPUARRAY_PUBLIC int gpucomm_get_rank(gpucomm* comm, int* rank);
 
 /**
- * \brief Reduce collective operation for ranks in a communicator world [buffer
- * level].
- * \param src [gpudata*] data in device's buffer to be reduced
- * \param offsrc [size_t] memory offset after which data is saved in buffer
- * `src`
- * \param dest [gpudata*] data in device's buffer to collect result
- * \param offdest [size_t] memory offset after which data will be saved in
- * buffer
- * `dest`
- * \param count [size_t] number of elements to be reduced in each array
- * \param typecode [int] code for elements' data type, see \ref enum
- * GPUARRAY_TYPES
- * \param opcode [int] reduce operation code, see \ref enum _gpucomm_reduce_ops
- * \param root [int] rank in `comm` which will collect result
- * \param comm [gpucomm*] gpu communicator
- * \note Non root ranks can call this, using a NULL `dest`. In this case,
- * `offdest`
- * will not be used.
+ * Reduce collective operation for ranks in a communicator world
+ * [buffer level].
+ *
+ * \param src data in device's buffer to be reduced
+ * \param offsrc memory offset after which data is saved in buffer
+ *               `src`
+ * \param dest data in device's buffer to collect result
+ * \param offdest memory offset after which data will be saved in
+ *                buffer `dest`
+ * \param count number of elements to be reduced in each array
+ * \param typecode elements' data type
+ * \param opcode reduce operation code (see #gpucomm_reduce_ops)
+ * \param root rank in `comm` which will collect result
+ * \param comm gpu communicator
+ *
+ * \note Non root ranks can call this, using a NULL `dest`. In this
+ *       case, `offdest` will not be used.
+ *
  * \note Must be called separately for each rank in `comm`.
- * \return int error code, \ref GA_NO_ERROR if success
+ *
+ * \return error code or #GA_NO_ERROR if success
  */
 GPUARRAY_PUBLIC int gpucomm_reduce(gpudata* src, size_t offsrc, gpudata* dest,
                                    size_t offdest, size_t count, int typecode,
                                    int opcode, int root, gpucomm* comm);
 
 /**
- * \brief AllReduce collective operation for ranks in a communicator world
- * [buffer
- * level].
- *
- * Reduces data pointed by `src` using op operation and leaves identical copies
- * of
- * result in data pointed by `dest` on each rank of `comm`.
- *
- * \param src [gpudata*] data in device's buffer to be reduced
- * \param offsrc [size_t] memory offset after which data is saved in buffer
- * `src`
- * \param dest [gpudata*] data in device's buffer to collect result
- * \param offdest [size_t] memory offset after which data will be saved in
- * buffer
- * `dest`
- * \param count [size_t] number of elements to be reduced in each array
- * \param typecode [int] code for elements' data type, see \ref enum
- * GPUARRAY_TYPES
- * \param opcode [int] reduce operation code, see \ref enum _gpucomm_reduce_ops
- * \param comm [gpucomm*] gpu communicator
+ * AllReduce collective operation for ranks in a communicator world
+ * [buffer level].
+ *
+ * Reduces data pointed by `src` using op operation and leaves
+ * identical copies of result in data pointed by `dest` on each rank
+ * of `comm`.
+ *
+ * \param src data in device's buffer to be reduced
+ * \param offsrc memory offset after which data is saved in buffer
+ *               `src`
+ * \param dest data in device's buffer to collect result
+ * \param offdest memory offset after which data will be saved in
+ *                buffer `dest`
+ * \param count number of elements to be reduced in each array
+ * \param typecode elements' data type
+ * \param opcode reduce operation code (see #gpucomm_reduce_ops)
+ * \param comm gpu communicator
+ *
  * \note Must be called separately for each rank in `comm`.
- * \return int error code, \ref GA_NO_ERROR if success
+ *
+ * \return error code or #GA_NO_ERROR if success
  */
 GPUARRAY_PUBLIC int gpucomm_all_reduce(gpudata* src, size_t offsrc,
                                        gpudata* dest, size_t offdest,
@@ -170,28 +178,27 @@ GPUARRAY_PUBLIC int gpucomm_all_reduce(gpudata* src, size_t offsrc,
                                        gpucomm* comm);
 
 /**
- * \brief ReduceScatter collective operation for ranks in a communicator world
- * [buffer level].
+ * ReduceScatter collective operation for ranks in a communicator
+ * world [buffer level].
+ *
+ * Reduces data pointed by `src` using `opcode` operation and leaves
+ * reduced result scattered over data pointed by `dest` in the
+ * user-defined rank order in `comm`.
+ *
+ * \param src data in device's buffer to be reduced
+ * \param offsrc memory offset after which data is saved in buffer
+ *               `src`
+ * \param dest data in device's buffer to collect scattered result
+ * \param offdest memory offset after which data will be saved in
+ *                buffer `dest`
+ * \param count number of elements to be contained in result `dest`
+ * \param typecode elements' data type
+ * \param opcode reduce operation code (see #gpucomm_reduce_ops)
+ * \param comm gpu communicator
  *
- * Reduces data pointed by `src` using `opcode` operation and leaves reduced
- * result
- * scattered over data pointed by `dest` in the user-defined rank order in
- * `comm`.
- *
- * \param src [gpudata*] data in device's buffer to be reduced
- * \param offsrc [size_t] memory offset after which data is saved in buffer
- * `src`
- * \param dest [gpudata*] data in device's buffer to collect scattered result
- * \param offdest [size_t] memory offset after which data will be saved in
- * buffer
- * `dest`
- * \param count [size_t] number of elements to be contained in result `dest`
- * \param typecode [int] code for elements' data type, see \ref enum
- * GPUARRAY_TYPES
- * \param opcode [int] reduce operation code, see \ref enum _gpucomm_reduce_ops
- * \param comm [gpucomm*] gpu communicator
  * \note Must be called separately for each rank in `comm`.
- * \return int error code, \ref GA_NO_ERROR if success
+ *
+ * \return error code or #GA_NO_ERROR if success
  */
 GPUARRAY_PUBLIC int gpucomm_reduce_scatter(gpudata* src, size_t offsrc,
                                            gpudata* dest, size_t offdest,
@@ -199,44 +206,44 @@ GPUARRAY_PUBLIC int gpucomm_reduce_scatter(gpudata* src, size_t offsrc,
                                            int opcode, gpucomm* comm);
 
 /**
- * \brief Broadcast collective operation for ranks in a communicator world
- * [buffer
- * level].
+ * Broadcast collective operation for ranks in a communicator world
+ * [buffer level].
  *
  * Copies data pointed by `array` to all ranks in `comm`.
  *
- * \param array [gpudata*] data in device's buffer to get copied or be received
- * \param offset [size_t] memory offset after which data in `array` begin
- * \param count [size_t] number of elements to be contained in `array`
- * \param typecode [int] code for elements' data type, see \ref enum
- * GPUARRAY_TYPES
- * \param root [int] rank in `comm` which broadcasts its array
- * \param comm [gpucomm*] gpu communicator
+ * \param array data in device's buffer to get copied or be received
+ * \param offset memory offset after which data in `array` begins
+ * \param count number of elements to be contained in `array`
+ * \param typecode elements' data type
+ * \param root rank in `comm` which broadcasts its array
+ * \param comm gpu communicator
+ *
  * \note Must be called separately for each rank in `comm`.
- * \return int error code, \ref GA_NO_ERROR if success
+ *
+ * \return error code or #GA_NO_ERROR if success
  */
 GPUARRAY_PUBLIC int gpucomm_broadcast(gpudata* array, size_t offset,
                                       size_t count, int typecode, int root,
                                       gpucomm* comm);
 
 /**
- * \brief AllGather collective operation for ranks in a communicator world.
+ * AllGather collective operation for ranks in a communicator world.
  *
  * Each rank receives all data pointed by `src` of every rank in the
- * user-defined
- * rank order in `comm`.
- *
- * \param src [gpudata*] data in device's buffer to be gathered
- * \param offsrc [size_t] memory offset after which data in `src` begin
- * \param dest [gpudata*] data in device's buffer to gather from all ranks
- * \param offdest [size_t] memory offset after which data in `dest` begin
- * \param count [size_t] number of elements to be gathered from each rank in
- * `src`
- * \param typecode [int] code for elements' data type, see \ref enum
- * GPUARRAY_TYPES
- * \param comm [gpucomm*] gpu communicator
+ * user-defined rank order in `comm`.
+ *
+ * \param src data in device's buffer to be gathered
+ * \param offsrc memory offset after which data in `src` begins
+ * \param dest data in device's buffer to gather from all ranks
+ * \param offdest memory offset after which data in `dest` begins
+ * \param count number of elements to be gathered from each rank in
+ *              `src`
+ * \param typecode elements' data type
+ * \param comm gpu communicator
+ *
  * \note Must be called separately for each rank in `comm`.
- * \return int error code, \ref GA_NO_ERROR if success
+ *
+ * \return error code or #GA_NO_ERROR if success
  */
 GPUARRAY_PUBLIC int gpucomm_all_gather(gpudata* src, size_t offsrc,
                                        gpudata* dest, size_t offdest,
diff --git a/src/gpuarray/collectives.h b/src/gpuarray/collectives.h
index e9410c4c92..e1b776b68f 100644
--- a/src/gpuarray/collectives.h
+++ b/src/gpuarray/collectives.h
@@ -12,103 +12,114 @@ extern "C" {
 }
 #endif  // CONFUSE_EMACS
 
-/*******************************************************************************
-*                       Multi-gpu collectives interface                       *
-*******************************************************************************/
+/*****************************************************************************
+*                       Multi-gpu collectives interface                      *
+******************************************************************************/
 
 /**
- * \brief Reduce collective operation for non root participant ranks in a
+ * Reduce collective operation for non root participant ranks in a
  * communicator world.
- * \param src [const GpuArray*] array to be reduced
- * \param opcode [int] reduce operation code, see \ref enum _gpucomm_reduce_ops
- * \param root [int] rank in `comm` which will collect result
- * \param comm [gpucomm*] gpu communicator
- * \note Root rank of reduce operation must call \ref GpuArray_reduce.
+ *
+ * \param src array to be reduced
+ * \param opcode reduce operation code, see #gpucomm_reduce_ops
+ * \param root rank in `comm` which will collect result
+ * \param comm gpu communicator
+ *
+ * \note Root rank of reduce operation must call GpuArray_reduce().
  * \note Must be called separately for each rank in `comm`, except root rank.
- * \return int error code, \ref GA_NO_ERROR if success
+ *
+ * \return error code or #GA_NO_ERROR if success
  */
 GPUARRAY_PUBLIC int GpuArray_reduce_from(const GpuArray* src, int opcode,
                                          int root, gpucomm* comm);
 
 /**
- * \brief Reduce collective operation for ranks in a communicator world.
- * \param src [const GpuArray*] array to be reduced
- * \param dest [GpuArray*] array to collect reduce operation result
- * \param opcode [int] reduce operation code, see \ref enum _gpucomm_reduce_ops
- * \param root [int] rank in `comm` which will collect result
- * \param comm [gpucomm*] gpu communicator
+ * Reduce collective operation for ranks in a communicator world.
+ *
+ * \param src array to be reduced
+ * \param dest array to collect reduce operation result
+ * \param opcode reduce operation code, see #gpucomm_reduce_ops
+ * \param root rank in `comm` which will collect result
+ * \param comm gpu communicator
+ *
  * \note Can be used by root and non root ranks alike.
+ *
  * \note Non root ranks can call this, using a NULL `dest`.
- * \note Must be called separately for each rank in `comm` (non root can call
- * \ref
- * GpuArray_reduce_from instead).
- * \return int error code, \ref GA_NO_ERROR if success
+ * \note Must be called separately for each rank in `comm` (non root
+ *       can call GpuArray_reduce_from() instead).
+ *
+ * \return error code or #GA_NO_ERROR if success
  */
 GPUARRAY_PUBLIC int GpuArray_reduce(const GpuArray* src, GpuArray* dest,
                                     int opcode, int root, gpucomm* comm);
 
 /**
- * \brief AllReduce collective operation for ranks in a communicator world.
+ * AllReduce collective operation for ranks in a communicator world.
+ *
+ * Reduces `src` using op operation and leaves identical copies of
+ * result in `dest` on each rank of `comm`.
  *
- * Reduces `src` using op operation and leaves identical copies of result in
- * `dest`
- * on each rank of `comm`.
+ * \param src array to be reduced
+ * \param dest array to collect reduce operation result
+ * \param opcode reduce operation code, see #gpucomm_reduce_ops
+ * \param comm gpu communicator
  *
- * \param src [const GpuArray*] array to be reduced
- * \param dest [GpuArray*] array to collect reduce operation result
- * \param opcode [int] reduce operation code, see \ref enum _gpucomm_reduce_ops
- * \param comm [gpucomm*] gpu communicator
  * \note Must be called separately for each rank in `comm`.
- * \return int error code, \ref GA_NO_ERROR if success
+ *
+ * \return error code or #GA_NO_ERROR if success
  */
 GPUARRAY_PUBLIC int GpuArray_all_reduce(const GpuArray* src, GpuArray* dest,
                                         int opcode, gpucomm* comm);
 
 /**
- * \brief ReduceScatter collective operation for ranks in a communicator world.
+ * ReduceScatter collective operation for ranks in a communicator world.
+ *
+ * Reduces data in `src` using `opcode` operation and leaves reduced
+ * result scattered over `dest` in the user-defined rank order in
+ * `comm`.
  *
- * Reduces data in `src` using `opcode` operation and leaves reduced result
- * scattered
- * over `dest` in the user-defined rank order in `comm`.
+ * \param src array to be reduced
+ * \param dest array to collect reduce operation scattered result
+ * \param opcode reduce operation code, see #gpucomm_reduce_ops
+ * \param comm gpu communicator
  *
- * \param src [const GpuArray*] array to be reduced
- * \param dest [GpuArray*] array to collect reduce operation scattered result
- * \param opcode [int] reduce operation code, see \ref enum _gpucomm_reduce_ops
- * \param comm [gpucomm*] gpu communicator
  * \note Must be called separately for each rank in `comm`.
- * \return int error code, \ref GA_NO_ERROR if success
+ *
+ * \return error code or #GA_NO_ERROR if success
  */
 GPUARRAY_PUBLIC int GpuArray_reduce_scatter(const GpuArray* src, GpuArray* dest,
                                             int opcode, gpucomm* comm);
 
 /**
- * \brief Broadcast collective operation for ranks in a communicator world.
+ * Broadcast collective operation for ranks in a communicator world.
  *
  * Copies `array` to all ranks in `comm`.
  *
- * \param array [GpuArray*] array to be broadcasted, if root rank, else to
- * receive
- * \param root [int] rank in `comm` which broadcasts its array
- * \param comm [gpucomm*] gpu communicator
+ * \param array array to be broadcasted, if root rank, else to receive
+ * \param root rank in `comm` which broadcasts its array
+ * \param comm gpu communicator
+ *
  * \note Must be called separately for each rank in `comm`.
- * \return int error code, \ref GA_NO_ERROR if success
+ *
+ * \return error code or #GA_NO_ERROR if success
  */
 GPUARRAY_PUBLIC int GpuArray_broadcast(GpuArray* array, int root,
                                        gpucomm* comm);
 
 /**
- * \brief AllGather collective operation for ranks in a communicator world.
+ * AllGather collective operation for ranks in a communicator world.
  *
- * Each rank receives all `src` arrays from every rank in the user-defined rank
- * order
- * in `comm`.
+ * Each rank receives all `src` arrays from every rank in the
+ * user-defined rank order in `comm`.
  *
- * \param src [const GpuArray*] array to be gathered
- * \param dest [GpuArray*] array to receive all gathered arrays from ranks in
+ * \param src array to be gathered
+ * \param dest array to receive all gathered arrays from ranks in
  * `comm`
- * \param comm [gpucomm*] gpu communicator
+ * \param comm gpu communicator
+ *
  * \note Must be called separately for each rank in `comm`.
- * \return int error code, \ref GA_NO_ERROR if success
+ *
+ * \return error code or #GA_NO_ERROR if success
  */
 GPUARRAY_PUBLIC int GpuArray_all_gather(const GpuArray* src, GpuArray* dest,
                                         gpucomm* comm);
diff --git a/src/gpuarray/config.h b/src/gpuarray/config.h
index 5e43074d79..a30639155a 100644
--- a/src/gpuarray/config.h
+++ b/src/gpuarray/config.h
@@ -1,6 +1,10 @@
 #ifndef GPUARRAY_CONFIG
 #define GPUARRAY_CONFIG
 
+/* The following included file should have been generated by CMake. */
+#include <gpuarray/abi_version.h>
+#define GPUARRAY_API_VERSION 2
+
 #ifdef GPUARRAY_SHARED
  #ifdef _WIN32
   #ifdef GPUARRAY_BUILDING_DLL
@@ -8,25 +12,26 @@
   #else
    #define GPUARRAY_PUBLIC __declspec(dllimport)
   #endif
-  #define GPUARRAY_LOCAL
  #else
   #if __GNUC__ >= 4
    #define GPUARRAY_PUBLIC __attribute__((visibility ("default")))
-   #define GPUARRAY_LOCAL  __attribute__((visibility ("hidden")))
   #else
-   #define GPUARRAY_PUBLIC
-   #define GPUARRAY_LOCAL
+   #error "Don't know how to export symbols on this platform"
   #endif
  #endif
 #else
  #define GPUARRAY_PUBLIC
- #define GPUARRAY_LOCAL
 #endif
 
 #ifdef _MSC_VER
 #include <stddef.h>
+#ifndef inline
+#define inline __inline
+#endif
 #if _MSC_VER < 1600
 #include <gpuarray/wincompat/stdint.h>
+#else
+#include <stdint.h>    
 #endif
 #define ssize_t intptr_t
 #define SSIZE_MAX INTPTR_MAX
diff --git a/src/gpuarray/elemwise.h b/src/gpuarray/elemwise.h
index bef99c2589..173ec0422c 100644
--- a/src/gpuarray/elemwise.h
+++ b/src/gpuarray/elemwise.h
@@ -156,6 +156,11 @@ GPUARRAY_PUBLIC int GpuElemwise_call(GpuElemwise *ge, void **args, int flags);
  */
 #define GE_NOCOLLAPSE  0x0200
 
+/**
+ * Allow implicit left-padding of shape with dimensions of size 1.
+ */
+#define GE_PADSHAPE   0x0400
+
 /**
  * @}
  */
diff --git a/src/gpuarray/error.h b/src/gpuarray/error.h
index 1572145c0d..52aba986b0 100644
--- a/src/gpuarray/error.h
+++ b/src/gpuarray/error.h
@@ -1,6 +1,6 @@
 #ifndef GPUARRAY_ERROR_H
 #define GPUARRAY_ERROR_H
-/** \file error.h
+/** \file gpuarray/error.h
  *  \brief Error functions.
  */
 
@@ -34,6 +34,8 @@ enum ga_error {
   GA_NODEV_ERROR,
   GA_MISC_ERROR,
   GA_COMM_ERROR,
+  GA_XLARGE_ERROR,
+  GA_LOAD_ERROR,
   /* Add more error types if needed, but at the end */
   /* Don't forget to sync with Gpu_error() */
 };
diff --git a/src/gpuarray/ext_cuda.h b/src/gpuarray/ext_cuda.h
index 2d6a9814cd..4b6377fa2b 100644
--- a/src/gpuarray/ext_cuda.h
+++ b/src/gpuarray/ext_cuda.h
@@ -11,15 +11,19 @@
 extern "C" {
 #endif
 
+/** @cond NEVER */
 static void (*cuda_enter)(gpucontext *);
 static void (*cuda_exit)(gpucontext *);
 static gpucontext *(*cuda_make_ctx)(CUcontext, int);
 static CUstream (*cuda_get_stream)(void *);
 static gpudata *(*cuda_make_buf)(void *, CUdeviceptr, size_t);
-static CUdeviceptr (*cuda_get_ptr)(gpudata *);
 static size_t (*cuda_get_sz)(gpudata *);
 static int (*cuda_wait)(gpudata *, int);
 static int (*cuda_record)(gpudata *, int);
+static CUipcMemHandle (*cuda_get_ipc_handle)(gpudata *d);
+static gpudata *(*cuda_open_ipc_handle)(gpucontext *c, CUipcMemHandle h,
+                                        size_t sz);
+/** @endcond */
 
 static void setup_ext_cuda(void) {
   // The casts are necessary to reassure C++ compilers
@@ -28,10 +32,11 @@ static void setup_ext_cuda(void) {
   cuda_make_ctx = (gpucontext *(*)(CUcontext, int))gpuarray_get_extension("cuda_make_ctx");
   cuda_get_stream = (CUstream (*)(void *))gpuarray_get_extension("cuda_get_stream");
   cuda_make_buf = (gpudata *(*)(void *, CUdeviceptr, size_t))gpuarray_get_extension("cuda_make_buf");
-  cuda_get_ptr = (CUdeviceptr (*)(gpudata *))gpuarray_get_extension("cuda_get_ptr");
   cuda_get_sz = (size_t (*)(gpudata *))gpuarray_get_extension("cuda_get_sz");
   cuda_wait = (int (*)(gpudata *, int))gpuarray_get_extension("cuda_wait");
   cuda_record = (int (*)(gpudata *, int))gpuarray_get_extension("cuda_record");
+  cuda_get_ipc_handle = (CUipcMemHandle (*)(gpudata *))gpuarray_get_extension("cuda_get_ipc_handle");
+  cuda_open_ipc_handle = (gpudata *(*)(gpucontext *c, CUipcMemHandle h, size_t sz))gpuarray_get_extension("cuda_open_ipc_handle");
 }
 
 #ifdef __cplusplus
diff --git a/src/gpuarray/extension.h b/src/gpuarray/extension.h
index b26b5231e5..6302cb3e33 100644
--- a/src/gpuarray/extension.h
+++ b/src/gpuarray/extension.h
@@ -19,6 +19,10 @@ extern "C" {
 #define GPUARRAY_CUDA_WAIT_READ  0x10000 /* CUDA_WAIT_READ */
 #define GPUARRAY_CUDA_WAIT_WRITE 0x20000 /* CUDA_WAIT_WRITE */
 
+typedef struct _GpuArrayIpcMemHandle {
+  char priv[64];  /* opaque payload; presumably sized for a CUDA IPC handle -- TODO confirm */
+} GpuArrayIpcMemHandle;
+
 /**
  * Obtain a function pointer for an extension.
  *
diff --git a/src/gpuarray/kernel.h b/src/gpuarray/kernel.h
index 82d4f74edf..6ed8a476bd 100644
--- a/src/gpuarray/kernel.h
+++ b/src/gpuarray/kernel.h
@@ -41,6 +41,8 @@ typedef struct _GpuKernel {
  * \param strs C array of source code strings
  * \param lens C array with the size of each string or NULL
  * \param name name of the kernel function
+ * \param argcount number of kernel arguments
+ * \param types typecode for each argument
  * \param flags kernel use flags (see \ref ga_usefl)
  * \param err_str (if not NULL) location to write GPU-backend provided debug info 
  * 
@@ -87,29 +89,26 @@ GPUARRAY_PUBLIC int GpuKernel_setarg(GpuKernel *k, unsigned int i, void *val);
  *
  * \param k the kernel to schedule for
  * \param n number of elements to handle
- * \param ls local size (in/out)
  * \param gs grid size (in/out)
+ * \param ls local size (in/out)
  */
 GPUARRAY_PUBLIC int GpuKernel_sched(GpuKernel *k, size_t n,
-                                    size_t *ls, size_t *gs);
+                                    size_t *gs, size_t *ls);
 
 /**
  * Launch the execution of a kernel.
  *
  * \param k the kernel to launch
  * \param n dimensionality of the grid/blocks
- * \param ls sizes of launch blocks
  * \param gs sizes of launch grid
- * \param amount of dynamic shared memory to allocate
+ * \param ls sizes of launch blocks
+ * \param shared amount of dynamic shared memory to allocate
  * \param args table of pointers to arguments
  */
 GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, unsigned int n,
-                                   const size_t *ls, const size_t *gs,
+                                   const size_t *gs, const size_t *ls,
                                    size_t shared, void **args);
 
-GPUARRAY_PUBLIC int GpuKernel_binary(const GpuKernel *k, size_t *sz,
-                                    void **obj);
-
 GPUARRAY_PUBLIC const char *GpuKernel_error(const GpuKernel *k, int err);
 
 #ifdef __cplusplus
diff --git a/src/gpuarray/types.h b/src/gpuarray/types.h
index afd0df16e4..2fac29bb37 100644
--- a/src/gpuarray/types.h
+++ b/src/gpuarray/types.h
@@ -43,7 +43,6 @@ typedef struct _gpuarray_type {
  * List of all built-in types.
  */
 enum GPUARRAY_TYPES {
-  GA_POINTER = -2,
   GA_BUFFER = -1,
   GA_BOOL = 0,
   GA_BYTE = 1,
diff --git a/src/gpuarray/util.h b/src/gpuarray/util.h
index e92919b538..a8a58ca4d3 100644
--- a/src/gpuarray/util.h
+++ b/src/gpuarray/util.h
@@ -15,9 +15,6 @@ extern "C" {
 #include <gpuarray/elemwise.h>
 #include <gpuarray/types.h>
 
-extern GPUARRAY_PUBLIC const int gpuarray_api_major;
-extern GPUARRAY_PUBLIC const int gpuarray_api_minor;
-
 /**
  * Registers a type with the kernel machinery.
  *
@@ -101,6 +98,90 @@ GPUARRAY_PUBLIC void gpuarray_elemwise_collapse(unsigned int n,
                                                 unsigned int *nd,
                                                 size_t *dim, ssize_t **strs);
 
+
+typedef struct _ga_half_t { uint16_t h; } ga_half_t;
+
+/* Convert a float to a ga_half_t; rounding ties go away from zero.  Inspired by
+   https://github.com/numpy/numpy/blob/master/numpy/core/src/npymath/halffloat.c#L246 */
+
+static inline ga_half_t ga_float2half(float f) {
+  union {
+    float f;
+    uint32_t bits;
+  } bf;
+  union {
+    ga_half_t h;
+    uint16_t bits;
+  } bh;
+
+  uint32_t f_exp, f_sig;
+  uint16_t h_sgn, h_exp, h_sig;
+
+  bf.f = f;  /* read the float back as its raw 32-bit pattern via bf.bits */
+
+  h_sgn = (bf.bits&0x80000000u) >> 16;  /* sign bit, moved down to bit 15 */
+  f_exp = (bf.bits&0x7f800000u);  /* exponent field, left in place (not shifted) */
+
+  /* Exponent overflow/NaN converts to signed inf/NaN */
+  if (f_exp >= 0x47800000u) {
+    if (f_exp == 0x7f800000u) {
+      /* Inf or NaN */
+      f_sig = (bf.bits&0x007fffffu);
+      if (f_sig != 0) {
+	/* NaN - propagate the flag in the significand... */
+	bh.bits = (uint16_t) (0x7c00u + (f_sig >> 13));
+	/* ...but make sure it stays a NaN */
+	if (bh.bits == 0x7c00u) {
+	  bh.bits++;
+	}
+	bh.bits += h_sgn;
+	return bh.h;
+      } else {
+	/* signed inf */
+	bh.bits = h_sgn + 0x7c00u;
+	return bh.h;
+      }
+    } else {  /* finite but exponent too large for half: overflow to signed inf */
+      bh.bits = h_sgn + 0x7c00u;
+      return bh.h;
+    }
+  }
+  /* Exponent underflow converts to a subnormal half or signed zero */
+  if (f_exp <= 0x38000000u) {
+    /*
+     * Signed zeros, subnormal floats, and floats with small
+     * exponents all convert to signed zero halfs.
+     */
+    if (f_exp < 0x33000000u) {
+      bh.bits = h_sgn;
+      return bh.h;
+    }
+    /* Make the subnormal significand */
+    f_exp >>= 23;  /* biased exponent as a plain integer, 102..112 on this path */
+    f_sig = (0x00800000u + (bf.bits&0x007fffffu));  /* restore the implicit leading 1 */
+    f_sig >>= (113 - f_exp);  /* shift amount is 1..11 given the bounds checked above */
+    /* Handle rounding by adding 1 to the bit beyond half precision */
+    f_sig += 0x00001000u;
+    h_sig = (uint16_t) (f_sig >> 13);
+    /*
+     * If the rounding causes a bit to spill into h_exp, it will
+     * increment h_exp from zero to one and h_sig will be zero.
+     * This is the correct result.
+     */
+    bh.bits = h_sgn + h_sig;
+    return bh.h;
+  }
+
+  /* Regular case with no overflow or underflow: rebias exponent by 112 (127 - 15) */
+  h_exp = (uint16_t) ((f_exp - 0x38000000u) >> 13);
+  /* Handle rounding by adding 1 to the bit beyond half precision */
+  f_sig = (bf.bits&0x007fffffu);
+  f_sig += 0x00001000u;
+  h_sig = (uint16_t) (f_sig >> 13);
+  bh.bits = h_sgn + h_exp + h_sig;  /* a carry out of h_sig correctly bumps the exponent */
+  return bh.h;
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c
index 73a6f6f4f6..ef97bbd476 100644
--- a/src/gpuarray_array.c
+++ b/src/gpuarray_array.c
@@ -5,19 +5,18 @@
 #include <assert.h>
 #include <stdarg.h>
 #include <stddef.h>
-#if _MSC_VER < 1600
-#include <stdint.h>
-#endif
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
 
 #include "private.h"
+#include "gpuarray/config.h"
 #include "gpuarray/array.h"
 #include "gpuarray/error.h"
 #include "gpuarray/kernel.h"
 #include "gpuarray/util.h"
 
+#include "util/error.h"
 #include "util/strb.h"
 #include "util/xxhash.h"
 
@@ -42,12 +41,12 @@ static uint32_t extcopy_hash(cache_key_t k) {
 
 static int ga_extcopy(GpuArray *dst, const GpuArray *src) {
   struct extcopy_args a, *aa;
-  gpucontext *ctx = gpudata_context(dst->data);
+  gpucontext *ctx = GpuArray_context(dst);
   GpuElemwise *k = NULL;
   void *args[2];
 
-  if (ctx != gpudata_context(src->data))
-    return GA_INVALID_ERROR;
+  if (ctx != GpuArray_context(src))
+    return error_set(ctx->err, GA_INVALID_ERROR, "src and dst context differ");
 
   a.itype = src->typecode;
   a.otype = dst->typecode;
@@ -62,22 +61,24 @@ static int ga_extcopy(GpuArray *dst, const GpuArray *src) {
     gargs[1].name = "dst";
     gargs[1].typecode = dst->typecode;
     gargs[1].flags = GE_WRITE;
-    k = GpuElemwise_new(ctx, "", "dst = src", 2, gargs, 0, 0);
+    k = GpuElemwise_new(ctx, "", "dst = src", 2, gargs, 0, GE_CONVERT_F16);
     if (k == NULL)
-      return GA_MISC_ERROR;
+      return ctx->err->code;
     aa = memdup(&a, sizeof(a));
     if (aa == NULL) {
       GpuElemwise_free(k);
-      return GA_MEMORY_ERROR;
+      return error_sys(ctx->err, "memdup");
     }
     if (ctx->extcopy_cache == NULL)
       ctx->extcopy_cache = cache_twoq(4, 8, 8, 2, extcopy_eq, extcopy_hash,
                                       extcopy_free,
-                                      (cache_freev_fn)GpuElemwise_free);
+                                      (cache_freev_fn)GpuElemwise_free,
+                                      ctx->err);
     if (ctx->extcopy_cache == NULL)
-      return GA_MISC_ERROR;
+      return ctx->err->code;
     if (cache_add(ctx->extcopy_cache, aa, k) != 0)
-      return GA_MISC_ERROR;
+      return error_set(ctx->err, GA_MISC_ERROR,
+                       "Could not store GpuElemwise copy kernel in context cache");
   }
   args[0] = (void *)src;
   args[1] = (void *)dst;
@@ -85,34 +86,55 @@ static int ga_extcopy(GpuArray *dst, const GpuArray *src) {
 }
 
 /* Value below which a size_t multiplication will never overflow. */
-#define MUL_NO_OVERFLOW (1UL << (sizeof(size_t) * 4))
+#define MUL_NO_OVERFLOW (1ULL << (sizeof(size_t) * 4))
 
-int GpuArray_empty(GpuArray *a, gpucontext *ctx,
-		   int typecode, unsigned int nd, const size_t *dims,
-                   ga_order ord) {
+void GpuArray_fix_flags(GpuArray *a) {
+  /* Mask off everything except GA_WRITEABLE so stale derived flags are dropped */
+  a->flags &= GA_WRITEABLE;
+  /* Re-derive the contiguity and alignment flags from the array's current state */
+  if (GpuArray_is_c_contiguous(a)) a->flags |= GA_C_CONTIGUOUS;
+  if (GpuArray_is_f_contiguous(a)) a->flags |= GA_F_CONTIGUOUS;
+  if (GpuArray_is_aligned(a)) a->flags |= GA_ALIGNED;
+}
+
+int GpuArray_empty(GpuArray *a, gpucontext *ctx, int typecode,
+                   unsigned int nd, const size_t *dims, ga_order ord) {
   size_t size = gpuarray_get_elsize(typecode);
   unsigned int i;
   int res = GA_NO_ERROR;
 
+  if (typecode == GA_SIZE || typecode == GA_SSIZE)
+    return error_set(ctx->err, GA_VALUE_ERROR, "Cannot create array with size type");
+
   if (ord == GA_ANY_ORDER)
     ord = GA_C_ORDER;
 
   if (ord != GA_C_ORDER && ord != GA_F_ORDER)
-    return GA_VALUE_ERROR;
+    return error_set(ctx->err, GA_VALUE_ERROR, "Invalid order");
 
   for (i = 0; i < nd; i++) {
     size_t d = dims[i];
     /* Check for overflow */
     if ((d >= MUL_NO_OVERFLOW || size >= MUL_NO_OVERFLOW) &&
-	d > 0 && SIZE_MAX / d < size)
-      return GA_VALUE_ERROR;
+        d > 0 && SIZE_MAX / d < size)
+      return error_set(ctx->err, GA_XLARGE_ERROR, "Total array size greater than addressable space");
     size *= d;
   }
 
+  /* We add an offset of 64 to all arrays in DEBUG to help catch errors. */
+#ifdef DEBUG
+  assert(SIZE_MAX - size > 64);
+  size += 64;
+#endif
+
   a->data = gpudata_alloc(ctx, size, NULL, 0, &res);
-  if (a->data == NULL) return res;
+  if (a->data == NULL) return ctx->err->code;
   a->nd = nd;
+#ifdef DEBUG
+  a->offset = 64;
+#else
   a->offset = 0;
+#endif
   a->typecode = typecode;
   a->dimensions = calloc(nd, sizeof(size_t));
   a->strides = calloc(nd, sizeof(ssize_t));
@@ -120,7 +142,7 @@ int GpuArray_empty(GpuArray *a, gpucontext *ctx,
   a->flags = GA_BEHAVED;
   if (a->dimensions == NULL || a->strides == NULL) {
     GpuArray_clear(a);
-    return GA_MEMORY_ERROR;
+    return error_sys(ctx->err, "calloc");
   }
   /* Mult will not overflow since calloc succeded */
   memcpy(a->dimensions, dims, sizeof(size_t)*nd);
@@ -169,8 +191,11 @@ int GpuArray_zeros(GpuArray *a, gpucontext *ctx,
 int GpuArray_fromdata(GpuArray *a, gpudata *data, size_t offset, int typecode,
                       unsigned int nd, const size_t *dims,
                       const ssize_t *strides, int writeable) {
-  if (gpuarray_get_type(typecode)->typecode != typecode)
-    return GA_VALUE_ERROR;
+  gpucontext *ctx = gpudata_context(data);
+
+  if (typecode == GA_SIZE || typecode == GA_SSIZE)
+    return error_set(ctx->err, GA_VALUE_ERROR, "Cannot create array with size type");
+
   assert(data != NULL);
   a->data = data;
   gpudata_retain(a->data);
@@ -182,52 +207,18 @@ int GpuArray_fromdata(GpuArray *a, gpudata *data, size_t offset, int typecode,
   a->flags = (writeable ? GA_WRITEABLE : 0);
   if (a->dimensions == NULL || a->strides == NULL) {
     GpuArray_clear(a);
-    return GA_MEMORY_ERROR;
+    return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory");
   }
   memcpy(a->dimensions, dims, nd*sizeof(size_t));
   memcpy(a->strides, strides, nd*sizeof(ssize_t));
 
-  if (GpuArray_is_c_contiguous(a)) a->flags |= GA_C_CONTIGUOUS;
-  if (GpuArray_is_f_contiguous(a)) a->flags |= GA_F_CONTIGUOUS;
-  if (GpuArray_is_aligned(a)) a->flags |= GA_ALIGNED;
+  GpuArray_fix_flags(a);
 
   return GA_NO_ERROR;
 }
 
-int GpuArray_copy_from_host(GpuArray *a, gpucontext *ctx, void *buf,
-                            int typecode, unsigned int nd, const size_t *dims,
-                            const ssize_t *strides) {
-  char *base = (char *)buf;
-  size_t offset = 0;
-  size_t size = gpuarray_get_elsize(typecode);
-  gpudata *b;
-  int err;
-  unsigned int i;
-
-  for (i = 0; i < nd; i++) {
-    if (dims[i] == 0) {
-      size = 0;
-      base = (char *)buf;
-      break;
-    }
-
-    if (strides[i] < 0)
-      base += (dims[i]-1) * strides[i];
-    else
-      size += (dims[i]-1) * strides[i];
-  }
-  offset = (char *)buf - base;
-  size += offset;
-
-  b = gpudata_alloc(ctx, size, base, GA_BUFFER_INIT, &err);
-  if (b == NULL) return err;
-
-  err = GpuArray_fromdata(a, b, offset, typecode, nd, dims, strides, 1);
-  gpudata_release(b);
-  return err;
-}
-
 int GpuArray_view(GpuArray *v, const GpuArray *a) {
+  gpucontext *ctx = GpuArray_context(a);
   v->data = a->data;
   gpudata_retain(a->data);
   v->nd = a->nd;
@@ -238,7 +229,7 @@ int GpuArray_view(GpuArray *v, const GpuArray *a) {
   v->strides = calloc(v->nd, sizeof(ssize_t));
   if (v->dimensions == NULL || v->strides == NULL) {
     GpuArray_clear(v);
-    return GA_MEMORY_ERROR;
+    return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory");
   }
   memcpy(v->dimensions, a->dimensions, v->nd*sizeof(size_t));
   memcpy(v->strides, a->strides, v->nd*sizeof(ssize_t));
@@ -251,6 +242,7 @@ int GpuArray_sync(GpuArray *a) {
 
 int GpuArray_index_inplace(GpuArray *a, const ssize_t *starts,
                            const ssize_t *stops, const ssize_t *steps) {
+  gpucontext *ctx = GpuArray_context(a);
   unsigned int i, new_i;
   unsigned int new_nd = a->nd;
   size_t *newdims;
@@ -258,7 +250,7 @@ int GpuArray_index_inplace(GpuArray *a, const ssize_t *starts,
   size_t new_offset = a->offset;
 
   if ((starts == NULL) || (stops == NULL) || (steps == NULL))
-    return GA_VALUE_ERROR;
+    return error_set(ctx->err, GA_VALUE_ERROR, "Invalid slice (contains NULL)");
 
   for (i = 0; i < a->nd; i++) {
     if (steps[i] == 0) new_nd -= 1;
@@ -268,31 +260,40 @@ int GpuArray_index_inplace(GpuArray *a, const ssize_t *starts,
   if (newdims == NULL || newstrs == NULL) {
     free(newdims);
     free(newstrs);
-    return GA_MEMORY_ERROR;
+    return error_sys(ctx->err, "calloc");
   }
 
   new_i = 0;
   for (i = 0; i < a->nd; i++) {
     if (starts[i] < -1 || (starts[i] > 0 &&
-			   (size_t)starts[i] > a->dimensions[i])) {
+                           (size_t)starts[i] > a->dimensions[i])) {
       free(newdims);
       free(newstrs);
-      return GA_VALUE_ERROR;
+      return error_fmt(ctx->err, GA_VALUE_ERROR, "Invalid slice value: "
+                       "slice(%lld, %lld, %lld) when indexing array on dimension %u "
+                       "of length %llu", (long long)starts[i], (long long)stops[i],
+                       (long long)steps[i], i, (unsigned long long)a->dimensions[i]);
     }
     if (steps[i] == 0 &&
-	(starts[i] == -1 || starts[i] >= a->dimensions[i])) {
+        (starts[i] == -1 || (size_t)starts[i] >= a->dimensions[i])) {
       free(newdims);
       free(newstrs);
-      return GA_VALUE_ERROR;
+      return error_fmt(ctx->err, GA_VALUE_ERROR, "Invalid slice value: "
+                       "slice(%lld, %lld, %lld) when indexing array on dimension %u "
+                       "of length %llu", (long long)starts[i], (long long)stops[i],
+                       (long long)steps[i], i, (unsigned long long)a->dimensions[i]);
     }
     new_offset += starts[i] * a->strides[i];
     if (steps[i] != 0) {
       if ((stops[i] < -1 || (stops[i] > 0 &&
-			      (size_t)stops[i] > a->dimensions[i])) ||
-	  (stops[i]-starts[i])/steps[i] < 0) {
+                             (size_t)stops[i] > a->dimensions[i])) ||
+          (stops[i]-starts[i])/steps[i] < 0) {
         free(newdims);
         free(newstrs);
-	return GA_VALUE_ERROR;
+        return error_fmt(ctx->err, GA_VALUE_ERROR, "Invalid slice value: "
+                         "slice(%lld, %lld, %lld) when indexing array on dimension %u "
+                         "of length %llu", (long long)starts[i], (long long)stops[i],
+                         (long long)steps[i], i, (unsigned long long)a->dimensions[i]);
       }
       newstrs[new_i] = steps[i] * a->strides[i];
       newdims[new_i] = (stops[i]-starts[i]+steps[i]-
@@ -306,18 +307,7 @@ int GpuArray_index_inplace(GpuArray *a, const ssize_t *starts,
   a->dimensions = newdims;
   free(a->strides);
   a->strides = newstrs;
-  if (GpuArray_is_c_contiguous(a))
-    a->flags |= GA_C_CONTIGUOUS;
-  else
-    a->flags &= ~GA_C_CONTIGUOUS;
-  if (GpuArray_is_f_contiguous(a))
-    a->flags |= GA_F_CONTIGUOUS;
-  else
-    a->flags &= ~GA_F_CONTIGUOUS;
-  if (GpuArray_is_aligned(a))
-    a->flags |= GA_ALIGNED;
-  else
-    a->flags &= ~GA_ALIGNED;
+  GpuArray_fix_flags(a);
 
   return GA_NO_ERROR;
 }
@@ -337,17 +327,17 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str,
                             const GpuArray *ind, int addr32) {
   strb sb = STRB_STATIC_INIT;
   int *atypes;
-  size_t nargs, apos;
   char *sz, *ssz;
   unsigned int i, i2;
-  int flags = GA_USE_CLUDA;
+  unsigned int nargs, apos;
+  int flags = 0;
   int res;
 
-  nargs = 7 + 2 * v->nd;
+  nargs = 9 + 2 * v->nd;
 
   atypes = calloc(nargs, sizeof(int));
   if (atypes == NULL)
-    return GA_MEMORY_ERROR;
+    return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory");
 
   if (addr32) {
     sz = "ga_uint";
@@ -358,11 +348,13 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str,
   }
 
   apos = 0;
-  strb_appendf(&sb, "KERNEL void take1(GLOBAL_MEM %s *r, "
-               "GLOBAL_MEM const %s *v, ga_size off,",
+  strb_appendf(&sb, "#include \"cluda.h\"\n"
+               "KERNEL void take1(GLOBAL_MEM %s *r, ga_size r_off, "
+               "GLOBAL_MEM const %s *v, ga_size v_off,",
                gpuarray_get_type(a->typecode)->cluda_name,
                gpuarray_get_type(v->typecode)->cluda_name);
   atypes[apos++] = GA_BUFFER;
+  atypes[apos++] = GA_SIZE;
   atypes[apos++] = GA_BUFFER;
   atypes[apos++] = GA_SIZE;
   for (i = 0; i < v->nd; i++) {
@@ -370,11 +362,13 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str,
     atypes[apos++] = GA_SSIZE;
     atypes[apos++] = GA_SIZE;
   }
-  strb_appends(&sb, " GLOBAL_MEM const ga_ssize *ind, ga_size n0, ga_size n1,"
-               " GLOBAL_MEM int* err) {\n");
+  strb_appendf(&sb, " GLOBAL_MEM const %s *ind, ga_size i_off, "
+               "ga_size n0, ga_size n1, GLOBAL_MEM int* err) {\n",
+               gpuarray_get_type(ind->typecode)->cluda_name);
   atypes[apos++] = GA_BUFFER;
   atypes[apos++] = GA_SIZE;
   atypes[apos++] = GA_SIZE;
+  atypes[apos++] = GA_SIZE;
   atypes[apos++] = GA_BUFFER;
   assert(apos == nargs);
   strb_appendf(&sb, "  const %s idx0 = LDIM_0 * GID_0 + LID_0;\n"
@@ -382,17 +376,22 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str,
                "  const %s idx1 = LDIM_1 * GID_1 + LID_1;\n"
                "  const %s numThreads1 = LDIM_1 * GDIM_1;\n"
                "  %s i0, i1;\n", sz, sz, sz, sz, sz);
+  strb_appends(&sb, "  if (idx0 >= n0 || idx1 >= n1) return;\n");
+  strb_appendf(&sb, "  r = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)r) + r_off);\n"
+               "  ind = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)ind) + i_off);\n",
+               gpuarray_get_type(a->typecode)->cluda_name,
+               gpuarray_get_type(ind->typecode)->cluda_name);
   strb_appendf(&sb, "  for (i0 = idx0; i0 < n0; i0 += numThreads0) {\n"
                "    %s ii0 = ind[i0];\n"
-               "    %s pos0 = off;\n"
+               "    %s pos0 = v_off;\n"
                "    if (ii0 < 0) ii0 += d0;\n"
-               "    if ((ii0 < 0) || (ii0 >= d0)) {\n"
+               "    if ((ii0 < 0) || (ii0 >= (%s)d0)) {\n"
                "      *err = -1;\n"
                "      continue;\n"
                "    }\n"
                "    pos0 += ii0 * (%s)s0;\n"
                "    for (i1 = idx1; i1 < n1; i1 += numThreads1) {\n"
-               "      %s p = pos0;\n", ssz, sz, sz, sz);
+               "      %s p = pos0;\n", ssz, sz, ssz, sz, sz);
   if (v->nd > 1) {
     strb_appendf(&sb, "      %s pos, ii = i1;\n", sz);
     for (i2 = v->nd; i2 > 1; i2--) {
@@ -411,7 +410,7 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str,
                "  }\n"
                "}\n");
   if (strb_error(&sb)) {
-    res = GA_MEMORY_ERROR;
+    res = error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory");
     goto bail;
   }
   flags |= gpuarray_type_flags(a->typecode, v->typecode, GA_BYTE, -1);
@@ -425,40 +424,52 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str,
 
 int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
                    int check_error) {
+  gpucontext *ctx = GpuArray_context(a);
   size_t n[2], ls[2] = {0, 0}, gs[2] = {0, 0};
   size_t pl;
   gpudata *errbuf;
 #if DEBUG
   char *errstr = NULL;
 #endif
-  size_t argp;
   GpuKernel k;
   unsigned int j;
+  unsigned int argp;
   int err, kerr = 0;
   int addr32 = 0;
 
   if (!GpuArray_ISWRITEABLE(a))
-    return GA_INVALID_ERROR;
+    return error_set(ctx->err, GA_VALUE_ERROR, "Destination array not writeable");
 
   if (!GpuArray_ISALIGNED(a) || !GpuArray_ISALIGNED(v) ||
       !GpuArray_ISALIGNED(i))
-    return GA_UNALIGNED_ERROR;
+    return error_fmt(ctx->err, GA_UNALIGNED_ERROR,
+                     "Not all arrays are aligned: a (%d), v (%d), i (%d)",
+                     GpuArray_ISALIGNED(a), GpuArray_ISALIGNED(v), GpuArray_ISALIGNED(i));
 
   /* a and i have to be C contiguous */
-  if (!GpuArray_IS_C_CONTIGUOUS(a) || !GpuArray_IS_C_CONTIGUOUS(i))
-    return GA_INVALID_ERROR;
+  if (!GpuArray_IS_C_CONTIGUOUS(a))
+    return error_set(ctx->err, GA_INVALID_ERROR, "Destination array (a) not C-contiguous");
+  if (!GpuArray_IS_C_CONTIGUOUS(i))
+    return error_set(ctx->err, GA_INVALID_ERROR, "Index array (i) not C-contiguous");
 
   /* Check that the dimensions match namely a[0] == i[0] and a[>0] == v[>0] */
-  if (v->nd == 0 || a->nd == 0 || i->nd != 1 || a->nd != v->nd ||
-      a->dimensions[0] != i->dimensions[0])
-    return GA_INVALID_ERROR;
+  if (v->nd == 0 || a->nd == 0 || i->nd != 1 || a->nd != v->nd)
+    return error_fmt(ctx->err, GA_INVALID_ERROR, "Dimension mismatch. "
+                     "v->nd = %u, a->nd = %u, i->nd = %u",
+                     v->nd, a->nd, i->nd);
+  if (a->dimensions[0] != i->dimensions[0])
+    return error_fmt(ctx->err, GA_INVALID_ERROR, "Dimension mismatch. "
+                     "a->dimensions[0] = %llu, i->dimensions[0] = %llu",
+                     (unsigned long long)a->dimensions[0], (unsigned long long)i->dimensions[0]);
 
   n[0] = i->dimensions[0];
   n[1] = 1;
 
   for (j = 1; j < v->nd; j++) {
     if (a->dimensions[j] != v->dimensions[j])
-      return GA_INVALID_ERROR;
+      return error_fmt(ctx->err, GA_INVALID_ERROR, "Dimension mismatch. "
+                       "a->dimensions[%u] = %llu, v->dimensions[%u] = %llu",
+                       j, (unsigned long long)a->dimensions[j], j, (unsigned long long)v->dimensions[j]);
     n[1] *= v->dimensions[j];
   }
 
@@ -470,7 +481,7 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
   if (err != GA_NO_ERROR)
     return err;
 
-  err = gen_take1_kernel(&k, GpuArray_context(a),
+  err = gen_take1_kernel(&k, ctx,
 #if DEBUG
                          &errstr,
 #else
@@ -486,7 +497,7 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
   if (err != GA_NO_ERROR)
     return err;
 
-  err = GpuKernel_sched(&k, n[0]*n[1], &ls[1], &gs[1]);
+  err = GpuKernel_sched(&k, n[0]*n[1], &gs[1], &ls[1]);
   if (err != GA_NO_ERROR)
     goto out;
 
@@ -498,27 +509,33 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
     pl = ls[0];
     ls[0] = ls[1];
     ls[1] = pl;
+    gs[0] = 1;
+  } else {
+    gs[0] = gs[1];
+    gs[1] = 1;
   }
-  gs[0] = 1;
 
   argp = 0;
   GpuKernel_setarg(&k, argp++, a->data);
+  GpuKernel_setarg(&k, argp++, (void *)&a->offset);
   GpuKernel_setarg(&k, argp++, v->data);
+  /* The cast is to avoid a warning about const */
   GpuKernel_setarg(&k, argp++, (void *)&v->offset);
   for (j = 0; j < v->nd; j++) {
     GpuKernel_setarg(&k, argp++, &v->strides[j]);
     GpuKernel_setarg(&k, argp++, &v->dimensions[j]);
   }
   GpuKernel_setarg(&k, argp++, i->data);
+  GpuKernel_setarg(&k, argp++, (void *)&i->offset);
   GpuKernel_setarg(&k, argp++, &n[0]);
   GpuKernel_setarg(&k, argp++, &n[1]);
   GpuKernel_setarg(&k, argp++, errbuf);
 
-  err = GpuKernel_call(&k, 2, ls, gs, 0, NULL);
+  err = GpuKernel_call(&k, 2, gs, ls, 0, NULL);
   if (check_error && err == GA_NO_ERROR) {
     err = gpudata_read(&kerr, errbuf, 0, sizeof(int));
     if (err == GA_NO_ERROR && kerr != 0) {
-      err = GA_VALUE_ERROR;
+      err = error_set(ctx->err, GA_VALUE_ERROR, "Index out of bounds");
       kerr = 0;
       /* We suppose this will not fail */
       gpudata_write(errbuf, 0, &kerr, sizeof(int));
@@ -531,6 +548,7 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
 }
 
 int GpuArray_setarray(GpuArray *a, const GpuArray *v) {
+  gpucontext *ctx = GpuArray_context(a);
   GpuArray tv;
   size_t sz;
   ssize_t *strs;
@@ -539,21 +557,24 @@ int GpuArray_setarray(GpuArray *a, const GpuArray *v) {
   int simple_move = 1;
 
   if (a->nd < v->nd)
-    return GA_VALUE_ERROR;
+    return error_fmt(ctx->err, GA_VALUE_ERROR, "Dimension error. "
+                     "a->nd = %u, v->nd = %u", a->nd, v->nd);
 
   if (!GpuArray_ISWRITEABLE(a))
-    return GA_VALUE_ERROR;
+    return error_set(ctx->err, GA_VALUE_ERROR, "Destination array not writable");
   if (!GpuArray_ISALIGNED(v) || !GpuArray_ISALIGNED(a))
-    return GA_UNALIGNED_ERROR;
+    return error_set(ctx->err, GA_UNALIGNED_ERROR, "One of the inputs is unaligned");
 
   off = a->nd - v->nd;
 
   for (i = 0; i < v->nd; i++) {
     if (v->dimensions[i] != a->dimensions[i+off]) {
       if (v->dimensions[i] != 1)
-	return GA_VALUE_ERROR;
+        return error_fmt(ctx->err, GA_VALUE_ERROR, "Shape error. "
+                         "v->dimensions[%u] = %llu, a->dimensions[%u + %u] = %llu",
+                         i, (unsigned long long)v->dimensions[i], i, off, (unsigned long long)a->dimensions[i + off]);
       else
-	simple_move = 0;
+        simple_move = 0;
     }
   }
 
@@ -568,7 +589,7 @@ int GpuArray_setarray(GpuArray *a, const GpuArray *v) {
 
   strs = calloc(a->nd, sizeof(ssize_t));
   if (strs == NULL)
-    return GA_MEMORY_ERROR;
+    return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory");
 
   for (i = off; i < a->nd; i++) {
     if (v->dimensions[i-off] == a->dimensions[i]) {
@@ -580,9 +601,8 @@ int GpuArray_setarray(GpuArray *a, const GpuArray *v) {
   tv.nd = a->nd;
   tv.dimensions = a->dimensions;
   tv.strides = strs;
-  /* This could be optiomized by setting the right flags */
   if (tv.nd != 0)
-    tv.flags &= ~(GA_C_CONTIGUOUS|GA_F_CONTIGUOUS);
+    GpuArray_fix_flags(&tv);
   err = ga_extcopy(a, &tv);
   free(strs);
   return err;
@@ -596,7 +616,8 @@ int GpuArray_reshape(GpuArray *res, const GpuArray *a, unsigned int nd,
   err = GpuArray_reshape_inplace(res, nd, newdims, ord);
   if (err == GA_COPY_ERROR && !nocopy) {
     GpuArray_clear(res);
-    GpuArray_copy(res, a, ord);
+    err = GpuArray_copy(res, a, ord);
+    if (err != GA_NO_ERROR) return err;
     err = GpuArray_reshape_inplace(res, nd, newdims, ord);
   }
   if (err != GA_NO_ERROR) GpuArray_clear(res);
@@ -605,6 +626,7 @@ int GpuArray_reshape(GpuArray *res, const GpuArray *a, unsigned int nd,
 
 int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd,
                              const size_t *newdims, ga_order ord) {
+  gpucontext *ctx = GpuArray_context(a);
   ssize_t *newstrides;
   size_t *tmpdims;
   size_t np;
@@ -630,12 +652,12 @@ int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd,
     size_t d = newdims[i];
     /* Check for overflow */
     if ((d >= MUL_NO_OVERFLOW || newsize >= MUL_NO_OVERFLOW) &&
-	d > 0 && SIZE_MAX / d < newsize)
-      return GA_INVALID_ERROR;
+        d > 0 && SIZE_MAX / d < newsize)
+      return error_set(ctx->err, GA_XLARGE_ERROR, "Output array size greater than addressable space");
     newsize *= d;
   }
 
-  if (newsize != oldsize) return GA_INVALID_ERROR;
+  if (newsize != oldsize) return error_set(ctx->err, GA_INVALID_ERROR, "New shape differs in total size");
 
   /* If the source and desired layouts are the same, then just copy
      strides and dimensions */
@@ -646,43 +668,45 @@ int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd,
 
   newstrides = calloc(nd, sizeof(ssize_t));
   if (newstrides == NULL)
-    return GA_MEMORY_ERROR;
-
-  while (ni < nd && oi < a->nd) {
-    np = newdims[ni];
-    op = a->dimensions[oi];
+    return error_sys(ctx->err, "calloc");
+
+  if (newsize != 0) {
+    while (ni < nd && oi < a->nd) {
+      np = newdims[ni];
+      op = a->dimensions[oi];
+
+      while (np != op) {
+        if (np < op) {
+          np *= newdims[nj++];
+        } else {
+          op *= a->dimensions[oj++];
+        }
+      }
 
-    while (np != op) {
-      if (np < op) {
-        np *= newdims[nj++];
-      } else {
-        op *= a->dimensions[oj++];
+      for (ok = oi; ok < oj - 1; ok++) {
+        if (ord == GA_F_ORDER) {
+          if (a->strides[ok+1] != (ssize_t)a->dimensions[ok]*a->strides[ok])
+            goto need_copy;
+        } else {
+          if (a->strides[ok] != (ssize_t)a->dimensions[ok+1]*a->strides[ok+1])
+            goto need_copy;
+        }
       }
-    }
 
-    for (ok = oi; ok < oj - 1; ok++) {
       if (ord == GA_F_ORDER) {
-        if (a->strides[ok+1] != a->dimensions[ok]*a->strides[ok])
-          goto need_copy;
+        newstrides[ni] = a->strides[oi];
+        for (nk = ni + 1; nk < nj; nk++) {
+          newstrides[nk] = newstrides[nk - 1]*newdims[nk - 1];
+        }
       } else {
-        if (a->strides[ok] != a->dimensions[ok+1]*a->strides[ok+1])
-          goto need_copy;
-      }
-    }
-
-    if (ord == GA_F_ORDER) {
-      newstrides[ni] = a->strides[oi];
-      for (nk = ni + 1; nk < nj; nk++) {
-        newstrides[nk] = newstrides[nk - 1]*newdims[nk - 1];
-      }
-    } else {
-      newstrides[nj-1] = a->strides[oj-1];
-      for (nk = nj-1; nk > ni; nk--) {
-        newstrides[nk-1] = newstrides[nk]*newdims[nk];
+        newstrides[nj-1] = a->strides[oj-1];
+        for (nk = nj-1; nk > ni; nk--) {
+          newstrides[nk-1] = newstrides[nk]*newdims[nk];
+        }
       }
+      ni = nj++;
+      oi = oj++;
     }
-    ni = nj++;
-    oi = oj++;
   }
 
   /* Fixup trailing ones */
@@ -700,7 +724,7 @@ int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd,
      Can't do the same with newdims (which is a parameter). */
   tmpdims = calloc(nd, sizeof(size_t));
   if (tmpdims == NULL) {
-    return GA_MEMORY_ERROR;
+    return error_sys(ctx->err, "calloc");
   }
   memcpy(tmpdims, newdims, nd*sizeof(size_t));
   a->nd = nd;
@@ -712,7 +736,7 @@ int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd,
   goto fix_flags;
  need_copy:
   free(newstrides);
-  return GA_COPY_ERROR;
+  return error_set(ctx->err, GA_COPY_ERROR, "Copy is needed but disallowed by parameters");
 
  do_final_copy:
   tmpdims = calloc(nd, sizeof(size_t));
@@ -720,7 +744,7 @@ int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd,
   if (tmpdims == NULL || newstrides == NULL) {
     free(tmpdims);
     free(newstrides);
-    return GA_MEMORY_ERROR;
+    return error_sys(ctx->err, "calloc");
   }
   memcpy(tmpdims, newdims, nd*sizeof(size_t));
   if (nd > 0) {
@@ -743,18 +767,7 @@ int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd,
   a->strides = newstrides;
 
  fix_flags:
-  if (GpuArray_is_c_contiguous(a))
-    a->flags |= GA_C_CONTIGUOUS;
-  else
-    a->flags &= ~GA_C_CONTIGUOUS;
-  if (GpuArray_is_f_contiguous(a))
-    a->flags |= GA_F_CONTIGUOUS;
-  else
-    a->flags &= ~GA_F_CONTIGUOUS;
-  if (GpuArray_is_aligned(a))
-    a->flags |= GA_ALIGNED;
-  else
-    a->flags &= ~GA_ALIGNED;
+  GpuArray_fix_flags(a);
   return GA_NO_ERROR;
 }
 
@@ -770,6 +783,7 @@ int GpuArray_transpose(GpuArray *res, const GpuArray *a,
 }
 
 int GpuArray_transpose_inplace(GpuArray *a, const unsigned int *new_axes) {
+  gpucontext *ctx = GpuArray_context(a);
   size_t *newdims;
   ssize_t *newstrs;
   unsigned int i;
@@ -781,7 +795,7 @@ int GpuArray_transpose_inplace(GpuArray *a, const unsigned int *new_axes) {
   if (newdims == NULL || newstrs == NULL) {
     free(newdims);
     free(newstrs);
-    return GA_MEMORY_ERROR;
+    return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory");
   }
 
   for (i = 0; i < a->nd; i++) {
@@ -794,7 +808,9 @@ int GpuArray_transpose_inplace(GpuArray *a, const unsigned int *new_axes) {
         if (j == new_axes[k]) {
           free(newdims);
           free(newstrs);
-          return GA_VALUE_ERROR;
+          return error_fmt(ctx->err, GA_VALUE_ERROR,
+                           "Repeated axes in transpose: new_axes[%u] == new_axes[%u] == %u",
+                           i, k, j);
         }
     }
     newdims[i] = a->dimensions[j];
@@ -806,11 +822,7 @@ int GpuArray_transpose_inplace(GpuArray *a, const unsigned int *new_axes) {
   a->dimensions = newdims;
   a->strides = newstrs;
 
-  a->flags &= ~(GA_C_CONTIGUOUS|GA_F_CONTIGUOUS);
-  if (GpuArray_is_c_contiguous(a))
-    a->flags |= GA_C_CONTIGUOUS;
-  if (GpuArray_is_f_contiguous(a))
-    a->flags |= GA_F_CONTIGUOUS;
+  GpuArray_fix_flags(a);
 
   return GA_NO_ERROR;
 }
@@ -835,17 +847,24 @@ gpucontext *GpuArray_context(const GpuArray *a) {
 }
 
 int GpuArray_move(GpuArray *dst, const GpuArray *src) {
+  gpucontext *ctx = GpuArray_context(dst);
   size_t sz;
   unsigned int i;
   if (!GpuArray_ISWRITEABLE(dst))
-    return GA_VALUE_ERROR;
-  if (!GpuArray_ISALIGNED(src) || !GpuArray_ISALIGNED(dst))
-    return GA_UNALIGNED_ERROR;
+    return error_set(ctx->err, GA_VALUE_ERROR, "Destination array (dst) not writeable");
+  if (!GpuArray_ISALIGNED(src))
+    return error_set(ctx->err, GA_UNALIGNED_ERROR, "Source array (src) not aligned");
+  if (!GpuArray_ISALIGNED(dst))
+    return error_set(ctx->err, GA_UNALIGNED_ERROR, "Destination array (dst) not aligned");
   if (src->nd != dst->nd)
-    return GA_VALUE_ERROR;
+    return error_fmt(ctx->err, GA_VALUE_ERROR,
+                     "Dimension mismatch. src->nd = %u, dst->nd = %u",
+                     src->nd, dst->nd);
   for (i = 0; i < src->nd; i++) {
     if (src->dimensions[i] != dst->dimensions[i])
-      return GA_VALUE_ERROR;
+      return error_fmt(ctx->err, GA_VALUE_ERROR,
+                       "Dimension mismatch. src->dimensions[%u] = %llu, dst->dimensions[%u] = %llu",
+                       i, (unsigned long long)src->dimensions[i], i, (unsigned long long)dst->dimensions[i]);
   }
   if (!GpuArray_ISONESEGMENT(dst) || !GpuArray_ISONESEGMENT(src) ||
       GpuArray_ISFORTRAN(dst) != GpuArray_ISFORTRAN(src) ||
@@ -858,22 +877,25 @@ int GpuArray_move(GpuArray *dst, const GpuArray *src) {
 }
 
 int GpuArray_write(GpuArray *dst, const void *src, size_t src_sz) {
+  gpucontext *ctx = GpuArray_context(dst);
   if (!GpuArray_ISWRITEABLE(dst))
-    return GA_VALUE_ERROR;
+    return error_set(ctx->err, GA_VALUE_ERROR, "Destination array (dst) not writeable");
   if (!GpuArray_ISONESEGMENT(dst))
-    return GA_UNSUPPORTED_ERROR;
+    return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Destination array (dst) not one segment");
   return gpudata_write(dst->data, dst->offset, src, src_sz);
 }
 
 int GpuArray_read(void *dst, size_t dst_sz, const GpuArray *src) {
+  gpucontext *ctx = GpuArray_context(src);
   if (!GpuArray_ISONESEGMENT(src))
-    return GA_UNSUPPORTED_ERROR;
+    return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Array (src) not one segment");
   return gpudata_read(dst, src->data, src->offset, dst_sz);
 }
 
 int GpuArray_memset(GpuArray *a, int data) {
+  gpucontext *ctx = GpuArray_context(a);
   if (!GpuArray_ISONESEGMENT(a))
-    return GA_UNSUPPORTED_ERROR;
+    return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Array (a) not one segment");
   return gpudata_memset(a->data, a->offset, data);
 }
 
@@ -889,16 +911,17 @@ int GpuArray_copy(GpuArray *res, const GpuArray *a, ga_order order) {
 }
 
 int GpuArray_transfer(GpuArray *res, const GpuArray *a) {
+  gpucontext *ctx = GpuArray_context(res);
   size_t sz;
   unsigned int i;
 
   if (!GpuArray_ISONESEGMENT(res))
-    return GA_UNSUPPORTED_ERROR;
+    return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Array (res) not one segment");
   if (!GpuArray_ISONESEGMENT(a))
-    return GA_UNSUPPORTED_ERROR;
+    return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Array (a) not one segment");
 
   if (res->typecode != a->typecode)
-    return GA_UNSUPPORTED_ERROR;
+    return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "typecode mismatch");
 
   sz = gpuarray_get_elsize(a->typecode);
   for (i = 0; i < a->nd; i++) sz *= a->dimensions[i];
@@ -908,6 +931,7 @@ int GpuArray_transfer(GpuArray *res, const GpuArray *a) {
 
 int GpuArray_split(GpuArray **rs, const GpuArray *a, size_t n, size_t *p,
                    unsigned int axis) {
+  gpucontext *ctx = GpuArray_context(a);
   size_t i;
   ssize_t *starts, *stops, *steps;
   int err;
@@ -920,7 +944,7 @@ int GpuArray_split(GpuArray **rs, const GpuArray *a, size_t n, size_t *p,
     free(starts);
     free(stops);
     free(steps);
-    return GA_MEMORY_ERROR;
+    return error_sys(ctx->err, "calloc");
   }
 
   for (i = 0; i < a->nd; i++) {
@@ -957,6 +981,7 @@ int GpuArray_split(GpuArray **rs, const GpuArray *a, size_t n, size_t *p,
 
 int GpuArray_concatenate(GpuArray *r, const GpuArray **as, size_t n,
                          unsigned int axis, int restype) {
+  gpucontext *ctx = GpuArray_context(as[0]);
   size_t *dims, *res_dims;
   size_t i, res_off;
   unsigned int p;
@@ -964,33 +989,38 @@ int GpuArray_concatenate(GpuArray *r, const GpuArray **as, size_t n,
   int err = GA_NO_ERROR;
 
   if (axis >= as[0]->nd)
-    return GA_VALUE_ERROR;
+    return error_fmt(ctx->err, GA_VALUE_ERROR, "Invalid axis. "
+                     "axis = %u, as[0]->nd = %u", axis, as[0]->nd);
 
   dims = calloc(as[0]->nd, sizeof(size_t));
   if (dims == NULL)
-    return GA_MEMORY_ERROR;
+    return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory");
 
   for (p = 0; p < as[0]->nd; p++) {
     dims[p] = as[0]->dimensions[p];
   }
 
   if (!GpuArray_ISALIGNED(as[0])) {
-    err = GA_UNALIGNED_ERROR;
+    err = error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned array (as[0]).");
     goto afterloop;
   }
 
   for (i = 1; i < n; i++) {
     if (!GpuArray_ISALIGNED(as[i])) {
-      err = GA_UNALIGNED_ERROR;
+      err = error_fmt(ctx->err, GA_UNALIGNED_ERROR, "Unaligned array (as[%llu]).", (unsigned long long)i);
       goto afterloop;
     }
     if (as[i]->nd != as[0]->nd) {
-      err = GA_VALUE_ERROR;
+      err = error_fmt(ctx->err, GA_VALUE_ERROR, "Shape mismatch. "
+                      "as[%llu]->nd = %u, as[0]->nd = %u",
+                      (unsigned long long)i, as[i]->nd, as[0]->nd);
       goto afterloop;
     }
     for (p = 0; p < as[0]->nd; p++) {
       if (p != axis && dims[p] != as[i]->dimensions[p]) {
-        err = GA_VALUE_ERROR;
+        err = error_fmt(ctx->err, GA_VALUE_ERROR, "Dimension mismatch. "
+                        "as[%llu]->dimensions[%u] = %llu, as[0]->dimensions[%u] = %llu",
+                        (unsigned long long)i, p, (unsigned long long)as[i]->dimensions[p], p, (unsigned long long)dims[p]);
         goto afterloop;
       } else if (p == axis) {
         dims[p] += as[i]->dimensions[p];
@@ -1014,10 +1044,9 @@ int GpuArray_concatenate(GpuArray *r, const GpuArray **as, size_t n,
   res_off = r->offset;
   res_dims = r->dimensions;
   res_flags = r->flags;
-  /* This could be optimized by setting the right flags */
-  r->flags &= ~(GA_C_CONTIGUOUS|GA_F_CONTIGUOUS);
   for (i = 0; i < n; i++) {
     r->dimensions = as[i]->dimensions;
+    GpuArray_fix_flags(r);
     err = ga_extcopy(r, as[i]);
     if (err != GA_NO_ERROR) {
       r->dimensions = res_dims;
@@ -1076,6 +1105,7 @@ void GpuArray_fprintf(FILE *fd, const GpuArray *a) {
 }
 
 int GpuArray_fdump(FILE *fd, const GpuArray *a) {
+  gpucontext *ctx = GpuArray_context(a);
   char *buf, *p;
   size_t s = GpuArray_ITEMSIZE(a);
   size_t k;
@@ -1087,7 +1117,7 @@ int GpuArray_fdump(FILE *fd, const GpuArray *a) {
 
   buf = malloc(s);
   if (buf == NULL)
-    return GA_MEMORY_ERROR;
+    return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory");
 
   err = GpuArray_read(buf, s, a);
   if (err != GA_NO_ERROR) {
@@ -1103,12 +1133,19 @@ int GpuArray_fdump(FILE *fd, const GpuArray *a) {
     case GA_UINT:
       fprintf(fd, "%u", *(unsigned int *)p);
       break;
+    case GA_LONG:
+      fprintf(fd, "%lld", (long long)*(int64_t *)p);
+      break;
+    case GA_FLOAT:
+      fprintf(fd, "%f", *(float *)p);
+      break;
     case GA_SSIZE:
       fprintf(fd, "%" SPREFIX "d", *(ssize_t *)p);
       break;
     default:
       free(buf);
-      return GA_UNSUPPORTED_ERROR;
+      fprintf(fd, "<unsupported data type %d>\n", a->typecode);
+      return error_fmt(ctx->err, GA_UNSUPPORTED_ERROR, "Unsupported data type for dump: %d", a->typecode);
     }
     s -= gpuarray_get_elsize(a->typecode);
     p += gpuarray_get_elsize(a->typecode);
@@ -1124,7 +1161,7 @@ int GpuArray_is_c_contiguous(const GpuArray *a) {
   int i;
 
   for (i = a->nd - 1; i >= 0; i--) {
-    if (a->strides[i] != size) return 0;
+    if (a->strides[i] != (ssize_t)size && a->dimensions[i] != 1) return 0;
     // We suppose that overflow will not happen since data has to fit in memory
     size *= a->dimensions[i];
   }
@@ -1136,7 +1173,7 @@ int GpuArray_is_f_contiguous(const GpuArray *a) {
   unsigned int i;
 
   for (i = 0; i < a->nd; i++) {
-    if (a->strides[i] != size) return 0;
+    if (a->strides[i] != (ssize_t)size && a->dimensions[i] != 1) return 0;
     // We suppose that overflow will not happen since data has to fit in memory
     size *= a->dimensions[i];
   }
diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c
index b8f0909be5..36b5e66d2b 100644
--- a/src/gpuarray_array_blas.c
+++ b/src/gpuarray_array_blas.c
@@ -3,7 +3,98 @@
 #include "gpuarray/buffer_blas.h"
 #include "gpuarray/types.h"
 #include "gpuarray/util.h"
-#include "gpuarray/error.h"
+
+#include "private.h"
+#include "util/error.h"
+
+/*
+ * Dot product of the 1-d arrays X and Y; the result is written to the 0-d
+ * array Z.  All three must be aligned and share one dtype (half, float or
+ * double), and X and Y must have the same length.
+ *
+ * An input with a negative stride is copied first; when `nocopy` is nonzero
+ * that case fails with GA_COPY_ERROR instead.  Returns GA_NO_ERROR on
+ * success, otherwise an error code (details are recorded in ctx->err).
+ */
+int GpuArray_rdot(GpuArray *X, GpuArray *Y, GpuArray *Z, int nocopy) {
+  GpuArray *Xp = X, *Yp = Y, *Zp = Z;
+  GpuArray copyX, copyY;
+  gpucontext *ctx = gpudata_context(Xp->data);
+  size_t n, elsize;
+  int err;
+
+  if (X->typecode != GA_HALF && X->typecode != GA_FLOAT &&
+      X->typecode != GA_DOUBLE)
+    return error_set(ctx->err, GA_INVALID_ERROR, "Data type not supported");
+
+  if (X->nd != 1 || Y->nd != 1 || Z->nd != 0)
+    return error_fmt(ctx->err, GA_VALUE_ERROR,
+                     "Wrong number of dimensions: X->nd = %u (expected 1), Y->nd = %u (expected 1), Z->nd = %u (expected 0)",
+                     X->nd, Y->nd, Z->nd);
+  /* Bail out on mixed dtypes: one element size is used for all operands. */
+  if (X->typecode != Y->typecode || X->typecode != Z->typecode)
+    return error_set(ctx->err, GA_VALUE_ERROR, "Inconsistent dtypes");
+  n = X->dimensions[0];
+  if (!(X->flags & GA_ALIGNED) || !(Y->flags & GA_ALIGNED) ||
+      !(Z->flags & GA_ALIGNED))
+    return error_set(ctx->err, GA_UNALIGNED_ERROR, "One of the inputs is unaligned");
+  if (X->dimensions[0] != Y->dimensions[0])
+    return error_fmt(ctx->err, GA_VALUE_ERROR,
+                     "Shape mismatch: X->dimensions[0] = %llu != Y->dimensions[0] = %llu",
+                     (unsigned long long)X->dimensions[0],
+                     (unsigned long long)Y->dimensions[0]);
+
+  elsize = gpuarray_get_elsize(X->typecode);
+  if (X->strides[0] < 0) {
+    if (nocopy)
+      return error_set(ctx->err, GA_COPY_ERROR, "Copy required for X");
+    else {
+      err = GpuArray_copy(&copyX, X, GA_ANY_ORDER);
+      if (err != GA_NO_ERROR)
+        goto cleanup;
+      Xp = &copyX;
+    }
+  }
+  if (Y->strides[0] < 0) {
+    if (nocopy)
+      return error_set(ctx->err, GA_COPY_ERROR, "Copy required for Y");
+    else {
+      err = GpuArray_copy(&copyY, Y, GA_ANY_ORDER);
+      if (err != GA_NO_ERROR)
+        goto cleanup;
+      Yp = &copyY;
+    }
+  }
+
+  err = gpublas_setup(ctx);
+  if (err != GA_NO_ERROR)
+    goto cleanup;
+
+  /* Offsets and strides are divided by elsize before being passed on. */
+  switch (Xp->typecode) {
+  case GA_HALF:
+    err = gpublas_hdot(n, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize,
+                       Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize,
+                       Zp->data, Zp->offset / elsize);
+    break;
+  case GA_FLOAT:
+    err = gpublas_sdot(n, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize,
+                       Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize,
+                       Zp->data, Zp->offset / elsize);
+    break;
+  case GA_DOUBLE:
+    err = gpublas_ddot(n, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize,
+                       Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize,
+                       Zp->data, Zp->offset / elsize);
+    break;
+  }
+  cleanup:
+  if (Xp == &copyX)
+    GpuArray_clear(&copyX);
+  if (Yp == &copyY)
+    GpuArray_clear(&copyY);
+  return err;
+}
 
 int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A,
                    GpuArray *X, double beta, GpuArray *Y, int nocopy) {
@@ -12,7 +103,7 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A,
   GpuArray *Xp = X;
   GpuArray copyX;
   GpuArray *Yp = Y;
-  void *ctx;
+  gpucontext *ctx = gpudata_context(Ap->data);
   size_t elsize;
   size_t m, n, lda;
   cb_order o;
@@ -21,16 +112,18 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A,
   if (A->typecode != GA_HALF &&
       A->typecode != GA_FLOAT &&
       A->typecode != GA_DOUBLE)
-    return GA_INVALID_ERROR;
+    return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported dtype");
 
-  if (A->nd != 2 || X->nd != 1 || Y->nd != 1 ||
-      A->typecode != A->typecode || X->typecode != A->typecode ||
-      Y->typecode != A->typecode)
-    return GA_VALUE_ERROR;
+  if (A->nd != 2 || X->nd != 1 || Y->nd != 1)
+    return error_fmt(ctx->err, GA_VALUE_ERROR,
+                     "Wrong number of dimensions: A->nd = %u (expected 2), X->nd = %u (expected 1), Y->nd = %u (expected 1)",
+                     A->nd, X->nd, Y->nd);
+  if (X->typecode != A->typecode || Y->typecode != A->typecode)
+    return error_set(ctx->err, GA_VALUE_ERROR, "Inconsistent dtypes");
 
   if (!(A->flags & GA_ALIGNED) || !(X->flags & GA_ALIGNED) ||
       !(Y->flags & GA_ALIGNED))
-    return GA_UNALIGNED_ERROR;
+    return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned inputs");
 
   if (transA == cb_no_trans) {
     m = A->dimensions[0];
@@ -41,7 +134,7 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A,
   }
 
   if (Y->dimensions[0] != m || X->dimensions[0] != n)
-    return GA_VALUE_ERROR;
+    return error_set(ctx->err, GA_VALUE_ERROR, "Inconsistent shapes");
 
   m = A->dimensions[0];
   n = A->dimensions[1];
@@ -50,26 +143,26 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A,
 
   if (!GpuArray_ISONESEGMENT(A)) {
     if (nocopy)
-      return GA_COPY_ERROR;
+      return error_set(ctx->err, GA_COPY_ERROR, "Copy required for A");
     else {
       err = GpuArray_copy(&copyA, A, GA_F_ORDER);
       if (err != GA_NO_ERROR)
-	goto cleanup;
+        goto cleanup;
       Ap = &copyA;
     }
   }
   if (X->strides[0] < 0) {
     if (nocopy)
-      return GA_COPY_ERROR;
+      return error_set(ctx->err, GA_COPY_ERROR, "Copy required for X");
     else {
       err = GpuArray_copy(&copyX, X, GA_ANY_ORDER);
       if (err != GA_NO_ERROR)
-	goto cleanup;
+        goto cleanup;
       Xp = &copyX;
     }
   }
   if (Y->strides[0] < 0) {
-    err = GA_VALUE_ERROR;
+    err = error_set(ctx->err, GA_VALUE_ERROR, "Negative strides for Y");
     goto cleanup;
   }
 
@@ -81,11 +174,10 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A,
     lda = Ap->dimensions[1];
   } else {
     /* Might be worth looking at making degenerate matrices (1xn) work here. */
-    err = GA_VALUE_ERROR;
+    err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous A");
     goto cleanup;
   }
 
-  ctx = gpudata_context(Ap->data);
   err = gpublas_setup(ctx);
   if (err != GA_NO_ERROR)
     goto cleanup;
@@ -117,7 +209,7 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha,
   GpuArray *Bp = B;
   GpuArray copyB;
   GpuArray *Cp = C;
-  void *ctx;
+  gpucontext *ctx = gpudata_context(Ap->data);
   size_t elsize;
   size_t m, n, k, lda, ldb, ldc;
   cb_order o;
@@ -125,16 +217,18 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha,
 
   if (A->typecode != GA_HALF && A->typecode != GA_FLOAT &&
       A->typecode != GA_DOUBLE)
-    return GA_INVALID_ERROR;
+    return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported dtype");
 
-  if (A->nd != 2 || B->nd != 2 || C->nd != 2 ||
-      A->typecode != A->typecode || B->typecode != A->typecode ||
-      C->typecode != A->typecode)
-    return GA_VALUE_ERROR;
+  if (A->nd != 2 || B->nd != 2 || C->nd != 2)
+    return error_fmt(ctx->err, GA_VALUE_ERROR,
+                     "Wrong number of dimensions: A->nd = %u (expected 2), B->nd = %u (expected 2), C->nd = %u (expected 2)",
+                     A->nd, B->nd, C->nd);
+  if (B->typecode != A->typecode || C->typecode != A->typecode)
+    return error_set(ctx->err, GA_VALUE_ERROR, "Inconsistent dtypes");
 
   if (!(A->flags & GA_ALIGNED) || !(B->flags & GA_ALIGNED) ||
       !(C->flags & GA_ALIGNED))
-    return GA_UNALIGNED_ERROR;
+    return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned inputs");
 
   if (transA == cb_no_trans) {
     m = A->dimensions[0];
@@ -147,40 +241,40 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha,
   if (transB == cb_no_trans) {
     n = B->dimensions[1];
     if (B->dimensions[0] != k)
-      return GA_VALUE_ERROR;
+      return error_set(ctx->err, GA_VALUE_ERROR, "mismatched shapes");
   } else {
     n = B->dimensions[0];
     if (B->dimensions[1] != k)
-      return GA_VALUE_ERROR;
+      return error_set(ctx->err, GA_VALUE_ERROR, "mismatched shapes");
   }
 
   if (C->dimensions[0] != m || C->dimensions[1] != n)
-    return GA_VALUE_ERROR;
+    return error_set(ctx->err, GA_VALUE_ERROR, "mismatched shapes");
 
   elsize = gpuarray_get_elsize(A->typecode);
 
   if (!GpuArray_ISONESEGMENT(A)) {
     if (nocopy)
-      return GA_COPY_ERROR;
+      return error_set(ctx->err, GA_COPY_ERROR, "Need copy for A");
     else {
       err = GpuArray_copy(&copyA, A, GA_F_ORDER);
       if (err != GA_NO_ERROR)
-	goto cleanup;
+        goto cleanup;
       Ap = &copyA;
     }
   }
   if (!GpuArray_ISONESEGMENT(B)) {
     if (nocopy)
-      return GA_COPY_ERROR;
+      return error_set(ctx->err, GA_COPY_ERROR, "Need copy for B");
     else {
       err = GpuArray_copy(&copyB, B, GA_F_ORDER);
       if (err != GA_NO_ERROR)
-	goto cleanup;
+        goto cleanup;
       Bp = &copyB;
     }
   }
   if (!GpuArray_ISONESEGMENT(C)) {
-    err = GA_VALUE_ERROR;
+    err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous C");
     goto cleanup;
   }
 
@@ -191,7 +285,7 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha,
     o = cb_c;
     ldc = Cp->dimensions[1];
   } else {
-    err = GA_VALUE_ERROR;
+    err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous C");
     goto cleanup;
   }
   if (Ap->flags & GA_F_CONTIGUOUS) {
@@ -211,7 +305,7 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha,
         transA = cb_no_trans;
     }
   } else {
-    err = GA_VALUE_ERROR;
+    err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous A");
     goto cleanup;
   }
   if (Bp->flags & GA_F_CONTIGUOUS) {
@@ -231,7 +325,7 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha,
         transB = cb_no_trans;
     }
   } else {
-    err = GA_VALUE_ERROR;
+    err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous B");
     goto cleanup;
   }
 
@@ -267,7 +361,7 @@ int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A,
   GpuArray *Yp = Y;
   GpuArray copyY;
   GpuArray *Ap = A;
-  void *ctx;
+  gpucontext *ctx = gpudata_context(Xp->data);
   size_t elsize;
   size_t m, n, lda;
   cb_order o;
@@ -275,46 +369,48 @@ int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A,
 
   if (X->typecode != GA_HALF && X->typecode != GA_FLOAT &&
       X->typecode != GA_DOUBLE)
-    return GA_INVALID_ERROR;
+    return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported dtype");
 
-  if (X->nd != 1 || Y->nd != 1 || A->nd != 2 ||
-      X->typecode != X->typecode || Y->typecode != X->typecode ||
-      A->typecode != X->typecode)
-    return GA_VALUE_ERROR;
+  if (X->nd != 1 || Y->nd != 1 || A->nd != 2)
+    return error_fmt(ctx->err, GA_VALUE_ERROR,
+                     "Wrong number of dimensions: X->nd = %u (expected 1), Y->nd = %u (expected 1), A->nd = %u (expected 2)",
+                     X->nd, Y->nd, A->nd);
+  if (Y->typecode != X->typecode || A->typecode != X->typecode)
+    return error_set(ctx->err, GA_VALUE_ERROR, "Inconsistent dtypes");
 
   if (!(X->flags & GA_ALIGNED) || !(Y->flags & GA_ALIGNED) ||
       !(A->flags & GA_ALIGNED))
-    return GA_UNALIGNED_ERROR;
+    return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned inputs");
 
   m = X->dimensions[0];
   n = Y->dimensions[0];
   if (A->dimensions[0] != m || A->dimensions[1] != n)
-    return GA_VALUE_ERROR;
+    return error_set(ctx->err, GA_VALUE_ERROR, "Incompatible shapes");
 
   elsize = gpuarray_get_elsize(X->typecode);
 
   if (X->strides[0] < 0) {
     if (nocopy)
-      return GA_COPY_ERROR;
+      return error_set(ctx->err, GA_COPY_ERROR, "Need copy for X");
     else {
       err = GpuArray_copy(&copyX, X, GA_ANY_ORDER);
       if (err != GA_NO_ERROR)
-	goto cleanup;
+        goto cleanup;
       Xp = &copyX;
     }
   }
   if (Y->strides[0] < 0) {
     if (nocopy)
-      return GA_COPY_ERROR;
+      return error_set(ctx->err, GA_COPY_ERROR, "Need copy for Y");
     else {
       err = GpuArray_copy(&copyY, Y, GA_ANY_ORDER);
       if (err != GA_NO_ERROR)
-	goto cleanup;
+        goto cleanup;
       Yp = &copyY;
     }
   }
   if (!GpuArray_ISONESEGMENT(A)) {
-    err = GA_VALUE_ERROR;
+    err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous A");
     goto cleanup;
   }
 
@@ -326,7 +422,7 @@ int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A,
     lda = Ap->dimensions[1];
   } else {
     /* Might be worth looking at making degenerate matrices (1xn) work here. */
-    err = GA_VALUE_ERROR;
+    err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous A");
     goto cleanup;
   }
 
@@ -355,6 +451,23 @@ int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A,
   return err;
 }
 
+/* Layout of the last two axes of `a`: 1 = C order, 2 = F order, 0 = neither. */
+static inline int is_last_2d_contiguous(const GpuArray *a) {
+  const ssize_t itemsize = GpuArray_ITEMSIZE(a);
+  ssize_t outer, inner;
+
+  /* Arrays reported C-contiguous as a whole return 1 before any stride is read. */
+  if (GpuArray_IS_C_CONTIGUOUS(a)) return 1;
+
+  outer = a->strides[a->nd - 2];
+  inner = a->strides[a->nd - 1];
+  /* Zero or negative strides on either of the two axes are rejected. */
+  if (outer <= 0 || inner <= 0) return 0;
+
+  /* The second-to-last axis is tested first, so F order wins a tie. */
+  return (outer == itemsize) ? 2 : (inner == itemsize) ? 1 : 0;
+}
+
 int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alpha,
                            GpuArray *A, GpuArray *B, double beta, GpuArray *C,
                            int nocopy) {
@@ -363,30 +476,30 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph
   GpuArray *Bp = B;
   GpuArray copyB;
   GpuArray *Cp = C;
-  void *ctx;
+  gpucontext *ctx = gpudata_context(A->data);
   size_t elsize;
   size_t batchCount, m, n, k, lda, ldb, ldc;
   cb_order o;
+  int cA, cB, cC;
   int err;
-  gpudata **A_datas = NULL, **B_datas = NULL, **C_datas = NULL;
-  size_t *A_offsets = NULL, *B_offsets = NULL, *C_offsets = NULL;
-  int i;
 
-  if (A->typecode != GA_FLOAT && A->typecode != GA_DOUBLE)
-    return GA_INVALID_ERROR;
+  if (A->typecode != GA_FLOAT && A->typecode != GA_DOUBLE && A->typecode != GA_HALF)
+    return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported dtype");
 
-  if (A->nd != 3 || B->nd != 3 || C->nd != 3 ||
-      A->typecode != A->typecode || B->typecode != A->typecode ||
-      C->typecode != A->typecode)
-    return GA_VALUE_ERROR;
+  if (A->nd != 3 || B->nd != 3 || C->nd != 3)
+    return error_fmt(ctx->err, GA_VALUE_ERROR,
+                     "Wrong number of dimensions: A->nd = %u (expected 3), B->nd = %u (expected 3), C->nd = %u (expected 3)",
+                     A->nd, B->nd, C->nd);
+  if (B->typecode != A->typecode || C->typecode != A->typecode)
+    return error_set(ctx->err, GA_VALUE_ERROR, "Inconsistent dtypes");
 
   if (!(A->flags & GA_ALIGNED) || !(B->flags & GA_ALIGNED) ||
       !(C->flags & GA_ALIGNED))
-    return GA_UNALIGNED_ERROR;
+    return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned input");
 
   batchCount = A->dimensions[0];
   if (B->dimensions[0] != batchCount || C->dimensions[0] != batchCount)
-    return GA_VALUE_ERROR;
+    return error_set(ctx->err, GA_VALUE_ERROR, "Mismatched first dimension");
 
   if (transA == cb_no_trans) {
     m = A->dimensions[1];
@@ -399,64 +512,76 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph
   if (transB == cb_no_trans) {
     n = B->dimensions[2];
     if (B->dimensions[1] != k)
-      return GA_VALUE_ERROR;
+      return error_set(ctx->err, GA_VALUE_ERROR, "Mismatched shape");
   } else {
     n = B->dimensions[1];
     if (B->dimensions[2] != k)
-      return GA_VALUE_ERROR;
+      return error_set(ctx->err, GA_VALUE_ERROR, "Mismatched shape");
   }
 
   if (C->dimensions[1] != m || C->dimensions[2] != n)
-    return GA_VALUE_ERROR;
+    return error_set(ctx->err, GA_VALUE_ERROR, "Mismatched shape");
 
   elsize = gpuarray_get_elsize(A->typecode);
 
-  // FIXME: these conditions are overly restrictive; the first axis need not be contiguous
-  if (!GpuArray_ISONESEGMENT(A)) {
+  cA = is_last_2d_contiguous(A);
+  if (!cA) {
     if (nocopy)
-      return GA_COPY_ERROR;
+      return error_set(ctx->err, GA_COPY_ERROR, "Need copy for A");
     else {
-      err = GpuArray_copy(&copyA, A, GA_F_ORDER);
+      err = GpuArray_copy(&copyA, A, GA_C_ORDER);
+      cA = 1;
       if (err != GA_NO_ERROR)
-	goto cleanup;
+        goto cleanup;
       Ap = &copyA;
     }
   }
-  if (!GpuArray_ISONESEGMENT(B)) {
+  cB = is_last_2d_contiguous(B);
+  if (!cB) {
     if (nocopy)
-      return GA_COPY_ERROR;
+      return error_set(ctx->err, GA_COPY_ERROR, "Need copy for B");
     else {
-      err = GpuArray_copy(&copyB, B, GA_F_ORDER);
+      err = GpuArray_copy(&copyB, B, GA_C_ORDER);
+      cB = 1;
       if (err != GA_NO_ERROR)
-	goto cleanup;
+        goto cleanup;
       Bp = &copyB;
     }
   }
-  if (!GpuArray_ISONESEGMENT(C)) {
-    err = GA_VALUE_ERROR;
+  cC = is_last_2d_contiguous(C);
+  if (!cC) {
+    err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous last 2d C");
     goto cleanup;
   }
 
-  if (Cp->flags & GA_F_CONTIGUOUS) {
+  if (cC == 2) {
     o = cb_fortran;
-    ldc = Cp->dimensions[1];
-  } else if (Cp->flags & GA_C_CONTIGUOUS) {
+    ldc = Cp->dimensions[2] > 1
+          ? Cp->strides[2] / elsize
+          : Cp->dimensions[1];
+  } else if (cC == 1) {
     o = cb_c;
-    ldc = Cp->dimensions[2];
+    ldc = Cp->dimensions[1] > 1
+          ? Cp->strides[1] / elsize
+          : Cp->dimensions[2];
   } else {
-    err = GA_VALUE_ERROR;
+    err = error_set(ctx->err, GA_MISC_ERROR, "Invalid internal result for C");
     goto cleanup;
   }
-  if (Ap->flags & GA_F_CONTIGUOUS) {
-    lda = Ap->dimensions[1];
+  if (cA == 2) {
+    lda = Ap->dimensions[2] > 1
+          ? Ap->strides[2] / elsize
+          : Ap->dimensions[1];
     if (o == cb_c) {
       if (transA == cb_no_trans)
         transA = cb_trans;
       else
         transA = cb_no_trans;
     }
-  } else if (Ap->flags & GA_C_CONTIGUOUS) {
-    lda = Ap->dimensions[2];
+  } else if (cA == 1) {
+    lda = Ap->dimensions[1] > 1
+          ? Ap->strides[1] / elsize
+          : Ap->dimensions[2];
     if (o == cb_fortran) {
       if (transA == cb_no_trans)
         transA = cb_trans;
@@ -464,19 +589,23 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph
         transA = cb_no_trans;
     }
   } else {
-    err = GA_VALUE_ERROR;
+    err = error_set(ctx->err, GA_MISC_ERROR, "Invalid internal result for A");
     goto cleanup;
   }
-  if (Bp->flags & GA_F_CONTIGUOUS) {
-    ldb = Bp->dimensions[1];
+  if (cB == 2) {
+    ldb = Bp->dimensions[2] > 1
+          ? Bp->strides[2] / elsize
+          : Bp->dimensions[1];
     if (o == cb_c) {
       if (transB == cb_no_trans)
         transB = cb_trans;
       else
         transB = cb_no_trans;
     }
-  } else if (Bp->flags & GA_C_CONTIGUOUS) {
-    ldb = Bp->dimensions[2];
+  } else if (cB == 1) {
+    ldb = Bp->dimensions[1] > 1
+          ? Bp->strides[1] / elsize
+          : Bp->dimensions[2];
     if (o == cb_fortran) {
       if (transB == cb_no_trans)
         transB = cb_trans;
@@ -484,7 +613,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph
         transB = cb_no_trans;
     }
   } else {
-    err = GA_VALUE_ERROR;
+    err = error_set(ctx->err, GA_MISC_ERROR, "Invalid internal result for B");
     goto cleanup;
   }
 
@@ -493,50 +622,90 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph
   if (err != GA_NO_ERROR)
     goto cleanup;
 
-  A_datas = (gpudata**)malloc(batchCount * sizeof(gpudata*));
-  B_datas = (gpudata**)malloc(batchCount * sizeof(gpudata*));
-  C_datas = (gpudata**)malloc(batchCount * sizeof(gpudata*));
-
-  A_offsets = (size_t*)malloc(batchCount * sizeof(size_t));
-  B_offsets = (size_t*)malloc(batchCount * sizeof(size_t));
-  C_offsets = (size_t*)malloc(batchCount * sizeof(size_t));
-
-  for (i = 0; i < batchCount; i++) {
-    A_datas[i] = Ap->data;
-    B_datas[i] = Bp->data;
-    C_datas[i] = Cp->data;
-    A_offsets[i] = (Ap->offset + i * Ap->strides[0]) / elsize;
-    B_offsets[i] = (Bp->offset + i * Bp->strides[0]) / elsize;
-    C_offsets[i] = (Cp->offset + i * Cp->strides[0]) / elsize;
-  }
-
   switch (C->typecode) {
   case GA_HALF:
-    err = gpublas_hgemmBatch(o, transA, transB, m, n, k, (float)alpha,
-                             A_datas, A_offsets, lda,
-                             B_datas, B_offsets, ldb,
-                             (float)beta,
-                             C_datas, C_offsets, ldc, batchCount, 0);
+    err = gpublas_hgemm3D(o, transA, transB, m, n, k, (float)alpha,
+                          Ap->data, Ap->offset/elsize, lda, Ap->strides[0]/elsize,
+                          Bp->data, Bp->offset/elsize, ldb, Bp->strides[0]/elsize,
+                          (float)beta,
+                          Cp->data, Cp->offset/elsize, ldc, Cp->strides[0]/elsize,
+                          batchCount, 0);
     break;
   case GA_FLOAT:
-    err = gpublas_sgemmBatch(o, transA, transB, m, n, k, (float)alpha,
-                             A_datas, A_offsets, lda,
-                             B_datas, B_offsets, ldb,
-                             (float)beta,
-                             C_datas, C_offsets, ldc, batchCount, 0);
+    err = gpublas_sgemm3D(o, transA, transB, m, n, k, (float)alpha,
+                          Ap->data, Ap->offset/elsize, lda, Ap->strides[0]/elsize,
+                          Bp->data, Bp->offset/elsize, ldb, Bp->strides[0]/elsize,
+                          (float)beta,
+                          Cp->data, Cp->offset/elsize, ldc, Cp->strides[0]/elsize,
+                          batchCount, 0);
     break;
   case GA_DOUBLE:
-    err = gpublas_dgemmBatch(o, transA, transB, m, n, k, (double)alpha,
-                             A_datas, A_offsets, lda,
-                             B_datas, B_offsets, ldb,
-                             (double)beta,
-                             C_datas, C_offsets, ldc, batchCount, 0);
+    err = gpublas_dgemm3D(o, transA, transB, m, n, k, (double)alpha,
+                          Ap->data, Ap->offset/elsize, lda, Ap->strides[0]/elsize,
+                          Bp->data, Bp->offset/elsize, ldb, Bp->strides[0]/elsize,
+                          (double)beta,
+                          Cp->data, Cp->offset/elsize, ldc, Cp->strides[0]/elsize,
+                          batchCount, 0);
     break;
   }
 
+  if (err == GA_DEVSUP_ERROR) {
+    gpudata **A_datas = NULL, **B_datas = NULL, **C_datas = NULL;
+    size_t *A_offsets = NULL, *B_offsets = NULL, *C_offsets = NULL;
+    size_t i;
+
+    A_datas = (gpudata**)malloc(batchCount * sizeof(gpudata*));
+    B_datas = (gpudata**)malloc(batchCount * sizeof(gpudata*));
+    C_datas = (gpudata**)malloc(batchCount * sizeof(gpudata*));
+
+    A_offsets = (size_t*)malloc(batchCount * sizeof(size_t));
+    B_offsets = (size_t*)malloc(batchCount * sizeof(size_t));
+    C_offsets = (size_t*)malloc(batchCount * sizeof(size_t));
+
+    if (A_datas == NULL || B_datas == NULL || C_datas == NULL ||
+        A_offsets == NULL || B_offsets == NULL || C_offsets == NULL) {
+      err = error_sys(ctx->err, "malloc");
+      goto old_cleanup;
+    }
+
+    for (i = 0; i < batchCount; i++) {
+      A_datas[i] = Ap->data;
+      B_datas[i] = Bp->data;
+      C_datas[i] = Cp->data;
+      A_offsets[i] = (Ap->offset + i * Ap->strides[0]) / elsize;
+      B_offsets[i] = (Bp->offset + i * Bp->strides[0]) / elsize;
+      C_offsets[i] = (Cp->offset + i * Cp->strides[0]) / elsize;
+    }
+
+    switch (C->typecode) {
+      case GA_HALF:
+        err = gpublas_hgemmBatch(o, transA, transB, m, n, k, (float)alpha,
+                                 A_datas, A_offsets, lda,
+                                 B_datas, B_offsets, ldb,
+                                 (float)beta,
+                                 C_datas, C_offsets, ldc, batchCount, 0);
+        break;
+      case GA_FLOAT:
+        err = gpublas_sgemmBatch(o, transA, transB, m, n, k, (float)alpha,
+                                 A_datas, A_offsets, lda,
+                                 B_datas, B_offsets, ldb,
+                                 (float)beta,
+                                 C_datas, C_offsets, ldc, batchCount, 0);
+        break;
+      case GA_DOUBLE:
+        err = gpublas_dgemmBatch(o, transA, transB, m, n, k, (double)alpha,
+                                 A_datas, A_offsets, lda,
+                                 B_datas, B_offsets, ldb,
+                                 (double)beta,
+                                 C_datas, C_offsets, ldc, batchCount, 0);
+        break;
+    }
+  old_cleanup:
+    free(A_datas); free(B_datas); free(C_datas);
+    free(A_offsets); free(B_offsets); free(C_offsets);
+  }
+
   cleanup:
-  free(A_datas); free(B_datas); free(C_datas);
-  free(A_offsets); free(B_offsets); free(C_offsets);
   if (Ap == &copyA)
     GpuArray_clear(&copyA);
   if (Bp == &copyB)
diff --git a/src/gpuarray_array_collectives.c b/src/gpuarray_array_collectives.c
index 4e177ef6c6..52d243da5a 100644
--- a/src/gpuarray_array_collectives.c
+++ b/src/gpuarray_array_collectives.c
@@ -29,15 +29,18 @@ static inline size_t find_total_elems(const GpuArray* array) {
 static inline int check_gpuarrays(int times_src, const GpuArray* src,
                                   int times_dest, const GpuArray* dest,
                                   size_t* count) {
+  gpucontext *ctx = gpudata_context(src->data);
   size_t count_src, count_dest;
   count_src = find_total_elems(src);
   count_dest = find_total_elems(dest);
   if (times_src * count_src != times_dest * count_dest)
-    return GA_VALUE_ERROR;
+    return error_set(ctx->err, GA_VALUE_ERROR, "Size mismatch for transfer");
   if (src->typecode != dest->typecode)
-    return GA_VALUE_ERROR;
-  if (!GpuArray_ISALIGNED(src) || !GpuArray_CHKFLAGS(dest, GA_BEHAVED))
-    return GA_UNALIGNED_ERROR;
+    return error_set(ctx->err, GA_VALUE_ERROR, "Type mismatch");
+  if (!GpuArray_ISALIGNED(src) || !GpuArray_ISALIGNED(dest))
+    return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned arrays");
+  if (!GpuArray_ISWRITEABLE(dest))
+    return error_set(ctx->err, GA_INVALID_ERROR, "Unwritable destination");
 
   if (times_src >= times_dest)
     *count = count_src;
@@ -48,9 +51,10 @@ static inline int check_gpuarrays(int times_src, const GpuArray* src,
 
 int GpuArray_reduce_from(const GpuArray* src, int opcode, int root,
                          gpucomm* comm) {
+  gpucontext *ctx = gpudata_context(src->data);
   size_t total_elems;
   if (!GpuArray_ISALIGNED(src))
-    return GA_UNALIGNED_ERROR;
+    return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned input");
   total_elems = find_total_elems(src);
   return gpucomm_reduce(src->data, src->offset, NULL, 0, total_elems,
                         src->typecode, opcode, root, comm);
@@ -89,16 +93,18 @@ int GpuArray_reduce_scatter(const GpuArray* src, GpuArray* dest, int opcode,
                                 comm);
 }
 
-int GpuArray_broadcast(GpuArray* array, int root, gpucomm* comm) {
-  int rank = 0;
+int GpuArray_broadcast(GpuArray *array, int root, gpucomm *comm) {
+  gpucontext *ctx = gpudata_context(array->data);
   size_t total_elems;
+  int rank = 0;
+
   GA_CHECK(gpucomm_get_rank(comm, &rank));
   if (rank == root) {
     if (!GpuArray_CHKFLAGS(array, GA_BEHAVED))
-      return GA_UNALIGNED_ERROR;
+      return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned input");
   } else {
     if (!GpuArray_ISALIGNED(array))
-      return GA_UNALIGNED_ERROR;
+      return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned input");
   }
 
   total_elems = find_total_elems(array);
diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c
index d6a5d55f6e..935f106d2d 100644
--- a/src/gpuarray_blas_cuda_cublas.c
+++ b/src/gpuarray_blas_cuda_cublas.c
@@ -5,7 +5,9 @@
 #include "gpuarray/kernel.h"
 #include "gpuarray/error.h"
 
-#include "cublas_v2.h"
+#include <limits.h>
+
+#include "loaders/libcublas.h"
 
 extern const gpuarray_buffer_ops cuda_ops;
 
@@ -22,6 +24,47 @@ static inline cublasOperation_t convT(cb_transpose trans) {
   }
 }
 
+/*
+ * Translate a cuBLAS status code into a static description string.
+ * Each message is prefixed with "(cublas)"; codes without a dedicated
+ * entry yield the generic "Unknown error." text.  The result is a string
+ * literal and must not be freed or modified by the caller.
+ */
+static const char *estr(cublasStatus_t err) {
+  const char *msg;
+
+  /* One line per status keeps the code/message pairing easy to audit. */
+  switch (err) {
+  case CUBLAS_STATUS_SUCCESS: msg = "(cublas) Operation completed successfully."; break;
+  case CUBLAS_STATUS_NOT_INITIALIZED: msg = "(cublas) Library not initialized."; break;
+  case CUBLAS_STATUS_ALLOC_FAILED: msg = "(cublas) GPU resource allocation failed."; break;
+  case CUBLAS_STATUS_INVALID_VALUE: msg = "(cublas) Invalid value."; break;
+  case CUBLAS_STATUS_ARCH_MISMATCH: msg = "(cublas) Operation not supported by device."; break;
+  case CUBLAS_STATUS_MAPPING_ERROR: msg = "(cublas) Mapping error."; break;
+  case CUBLAS_STATUS_EXECUTION_FAILED: msg = "(cublas) Execution failed."; break;
+  case CUBLAS_STATUS_INTERNAL_ERROR: msg = "(cublas) Internal error."; break;
+  case CUBLAS_STATUS_NOT_SUPPORTED: msg = "(cublas) Unsupported functionality."; break;
+  case CUBLAS_STATUS_LICENSE_ERROR: msg = "(cublas) License error."; break;
+  default: msg = "(cublas) Unknown error."; break;
+  }
+
+  return msg;
+}
+
+static inline int error_cublas(error *e, const char *msg, cublasStatus_t err) {
+  /* ARCH_MISMATCH -> GA_DEVSUP_ERROR, anything else -> GA_BLAS_ERROR; NOT_INITIALIZED also gets a driver hint. */
+  const char *hint = (err != CUBLAS_STATUS_NOT_INITIALIZED) ? "" : " (Possibly because the driver version is too old for the cuda version)";
+  return error_fmt(e, (err != CUBLAS_STATUS_ARCH_MISMATCH) ? GA_BLAS_ERROR : GA_DEVSUP_ERROR, "%s: %s%s", msg, estr(err), hint);
+}
+/* Run cmd; on a non-success status call cuda_exit(ctx) and return the error_cublas() result from the enclosing function. */
+#define CUBLAS_EXIT_ON_ERROR(ctx, cmd) do {       \
+    cublasStatus_t err = (cmd);                   \
+    if (err != CUBLAS_STATUS_SUCCESS) {           \
+      cuda_exit(ctx);                             \
+      return error_cublas((ctx)->err, #cmd, err); \
+    }                                             \
+  } while(0)
+
 typedef struct _blas_handle {
   cublasHandle_t h;
   GpuKernel sgemvBH_N_a1_b1_small;
@@ -30,14 +73,17 @@ typedef struct _blas_handle {
   GpuKernel dgemvBH_T_a1_b1_small;
   GpuKernel sgerBH_gen_small;
   GpuKernel dgerBH_gen_small;
-  cublasStatus_t err;
+  uint8_t tensorCore;
 } blas_handle;
 
+#define LARGE_VAL(v) ((v) >= INT_MAX)  /* true when v would overflow the int parameters of the cuBLAS interface; argument parenthesized so any expression is safe */
+
 static const char *code_sgemvBH_N_a1_b1_small =                         \
-  "extern \"C\"__global__ void sgemv(const float *A[], size_t lda, "    \
-  "                                  const float *x[], size_t incx, "   \
-  "                                  float *y[], size_t incy, "         \
-  "                                  size_t b, size_t m, size_t n) {"   \
+  "#include \"cluda.h\"\n"                                              \
+  "KERNEL void sgemv(const float *A[], size_t lda, "                    \
+  "                  const float *x[], size_t incx, "                   \
+  "                  float *y[], size_t incy, "                         \
+  "                  size_t b, size_t m, size_t n) {"                   \
   "  for (size_t p = blockIdx.y * blockDim.y + threadIdx.y; p < b;"     \
   "       p += gridDim.y * blockDim.y) {"                               \
   "    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < m;"   \
@@ -51,49 +97,37 @@ static const char *code_sgemvBH_N_a1_b1_small =                         \
   "        Ap += lda;"                                                  \
   "        xp += incx;"                                                 \
   "      }"                                                             \
-  "     atomicAdd(&y[p][i*incy], yi);"                                  \
+  "     atom_add_fg(&y[p][i*incy], yi);"                                \
   "    }"                                                               \
   "  }"                                                                 \
   "}\n";
 
-static const char *code_sgemvBH_T_a1_b1_small =                         \
-  "extern \"C\" __global__ void sgemv(const float *A[], size_t lda, "   \
-  "                                   const float *x[], size_t incx, "  \
-  "                                   float *y[], size_t incy, "        \
-  "                                   size_t b, size_t m, size_t n) {"  \
-  "  size_t i = blockIdx.x * blockDim.x + threadIdx.x;"                 \
-  "  size_t p = blockIdx.y * blockDim.y + threadIdx.y;"                 \
-  "  if (i >= m || p >= b) return;"                                     \
-  "  float yi = 0.0f;"                                                  \
-  "  const float *Ap = A[p] + i * lda;"                                 \
-  "  const float *xp = x[p];\n"                                         \
-  "  # pragma unroll 32\n"                                              \
-  "  for (size_t j = 0; j < n; j++) {"                                  \
-  "    yi += Ap[j] * xp[0];"                                            \
-  "    xp += incx;"                                                     \
-  "  }"                                                                 \
-  "  atomicAdd(&y[p][i*incy], yi);"                                     \
-  "}\n";
-
-static const char *atomicadd_double =                                   \
-  "__device__ double atomicAdd(double* address, double val) {"          \
-  "  unsigned long long int* address_as_ull ="                          \
-  "  (unsigned long long int*)address;"                                 \
-  "  unsigned long long int old = *address_as_ull, assumed;"            \
-  "  do {"                                                              \
-  "    assumed = old;"                                                  \
-  "    old = atomicCAS(address_as_ull, assumed,"                        \
-  "                    __double_as_longlong(val +"                      \
-  "                    __longlong_as_double(assumed)));"                \
-  "  } while (assumed != old);"                                         \
-  "  return __longlong_as_double(old);"                                 \
+static const char *code_sgemvBH_T_a1_b1_small = /* Kernel source: each thread (i, batch p) sums Ap[j] * xp[j*incx] over j < n and atomically adds the sum to y[p][i*incy]. */ \
+  "#include \"cluda.h\"\n"                              \
+  "KERNEL void sgemv(const float *A[], size_t lda, "    \
+  "                  const float *x[], size_t incx, "   \
+  "                  float *y[], size_t incy, "         \
+  "                  size_t b, size_t m, size_t n) {"   \
+  "  size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \
+  "  size_t p = blockIdx.y * blockDim.y + threadIdx.y;" \
+  "  if (i >= m || p >= b) return;"                     \
+  "  float yi = 0.0f;"                                  \
+  "  const float *Ap = A[p] + i * lda;"                 \
+  "  const float *xp = x[p];\n"                         \
+  "  # pragma unroll 32\n"                              \
+  "  for (size_t j = 0; j < n; j++) {"                  \
+  "    yi += Ap[j] * xp[0];"                            \
+  "    xp += incx;"                                     \
+  "  }"                                                 \
+  "  atom_add_fg(&y[p][i*incy], yi);"                   \
+  "}\n";
 
 static const char *code_dgemvBH_N_a1_b1_small =                         \
-  "extern \"C\" __global__ void dgemv(const double *A[], size_t lda, "  \
-  "                                   const double *x[], size_t incx, " \
-  "                                   double *y[], size_t incy, "       \
-  "                                   size_t b, size_t m, size_t n) {"  \
+  "#include \"cluda.h\"\n"                                              \
+  "KERNEL void dgemv(const double *A[], size_t lda, "                   \
+  "                  const double *x[], size_t incx, "                  \
+  "                  double *y[], size_t incy, "                        \
+  "                  size_t b, size_t m, size_t n) {"                   \
   "  for (size_t p = blockIdx.y * blockDim.y + threadIdx.y; p < b;"     \
   "       p += gridDim.y * blockDim.y) {"                               \
   "    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < m;"   \
@@ -107,32 +141,34 @@ static const char *code_dgemvBH_N_a1_b1_small =                         \
   "        Ap += lda;"                                                  \
   "        xp += incx;"                                                 \
   "      }"                                                             \
-  "     atomicAdd(&y[p][i*incy], yi);"                                  \
+  "     atom_add_dg(&y[p][i*incy], yi);"                                \
   "    }"                                                               \
   "  }"                                                                 \
   "}\n";
 
-static const char *code_dgemvBH_T_a1_b1_small =                         \
-  "extern \"C\" __global__ void dgemv(const double *A[], size_t lda, "  \
-  "                                   const double *x[], size_t incx, " \
-  "                                   double *y[], size_t incy, "       \
-  "                                   size_t b, size_t m, size_t n) {"  \
-  "  size_t i = blockIdx.x * blockDim.x + threadIdx.x;"                 \
-  "  size_t p = blockIdx.y * blockDim.y + threadIdx.y;"                 \
-  "  if (i >= m || p >= b) return;"                                     \
-  "  double yi = 0.0;"                                                  \
-  "  const double *Ap = A[p] + i * lda;"                                \
-  "  const double *xp = x[p];\n"                                        \
-  "  # pragma unroll 32\n"                                              \
-  "  for (size_t j = 0; j < n; j++) {"                                  \
-  "    yi += Ap[j] * xp[0];"                                            \
-  "    xp += incx;"                                                     \
-  "  }"                                                                 \
-  "  atomicAdd(&y[p][i*incy], yi);"                                     \
+static const char *code_dgemvBH_T_a1_b1_small = /* Kernel source: double-precision variant; each thread (i, batch p) sums Ap[j] * xp[j*incx] over j < n and atomically adds the sum to y[p][i*incy]. */ \
+  "#include \"cluda.h\"\n"                              \
+  "KERNEL void dgemv(const double *A[], size_t lda, "   \
+  "                  const double *x[], size_t incx, "  \
+  "                  double *y[], size_t incy, "        \
+  "                  size_t b, size_t m, size_t n) {"   \
+  "  size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \
+  "  size_t p = blockIdx.y * blockDim.y + threadIdx.y;" \
+  "  if (i >= m || p >= b) return;"                     \
+  "  double yi = 0.0;"                                  \
+  "  const double *Ap = A[p] + i * lda;"                \
+  "  const double *xp = x[p];\n"                        \
+  "  # pragma unroll 32\n"                              \
+  "  for (size_t j = 0; j < n; j++) {"                  \
+  "    yi += Ap[j] * xp[0];"                            \
+  "    xp += incx;"                                     \
+  "  }"                                                 \
+  "  atom_add_dg(&y[p][i*incy], yi);"                   \
+  "}\n";
 
 static const char *code_sgerBH_gen_small =                              \
-  "extern \"C\" __global__ void _sgerBH_gen_small("                     \
+  "#include \"cluda.h\"\n"                                              \
+  "KERNEL void _sgerBH_gen_small("                                      \
   "    const float *x[], size_t incx,"                                  \
   "    const float *y[], size_t incy,"                                  \
   "    float alpha, float *A[], size_t lda,"                            \
@@ -141,13 +177,14 @@ static const char *code_sgerBH_gen_small =                              \
   "  size_t j = blockIdx.y * blockDim.y + threadIdx.y;"                 \
   "  if (i >= m || j >= n) return;"                                     \
   "  for (size_t p = blockIdx.z; p < b; p += gridDim.z) {"              \
-  "    atomicAdd(&A[p][j * lda + i],"                                   \
-  "              alpha * x[p][i * incx] * y[p][j * incy]);"             \
+  "    atom_add_fg(&A[p][j * lda + i],"                                 \
+  "                alpha * x[p][i * incx] * y[p][j * incy]);"           \
   "  }"                                                                 \
   "}\n";
 
 static const char *code_dgerBH_gen_small =                              \
-  "extern \"C\" __global__ void _dgerBH_gen_small("                     \
+  "#include \"cluda.h\"\n"                                              \
+  "KERNEL void _dgerBH_gen_small("                                      \
   "      const double *x[], size_t incx, "                              \
   "      const double *y[], size_t incy,"                               \
   "      double alpha, double *A[], size_t lda,"                        \
@@ -156,44 +193,63 @@ static const char *code_dgerBH_gen_small =                              \
   "  size_t j = blockIdx.y * blockDim.y + threadIdx.y;"                 \
   "  if (i >= m || j >= n) return;"                                     \
   "  for (size_t p = blockIdx.z; p < b; p += gridDim.z) {"              \
-  "    atomicAdd(&A[p][j * lda + i],"                                   \
-  "              alpha * x[p][i * incx] * y[p][j * incy]);"             \
+  "    atom_add_dg(&A[p][j * lda + i],"                                 \
+  "                alpha * x[p][i * incx] * y[p][j * incy]);"           \
   "  }"                                                                 \
   "}\n";
 
 static int setup(gpucontext *c) {
   cuda_context *ctx = (cuda_context *)c;
   blas_handle *handle;
-  const char *tmp[2];
+  CUdevice dev;
   cublasStatus_t err;
-  int e;
   int types[10];
+  int major, minor;
+  int e;
 
   if (ctx->blas_handle != NULL)
     return GA_NO_ERROR;
 
   handle = calloc(1, sizeof(*handle));
   if (handle == NULL)
-    return GA_MEMORY_ERROR;
-
-  handle->err = CUBLAS_STATUS_SUCCESS;
+    return error_sys(ctx->err, "calloc");
 
   cuda_enter(ctx);
+  {
+    CUresult cuerr = cuCtxGetDevice(&dev);
+    e = (cuerr != CUDA_SUCCESS) ? error_cuda(ctx->err, "cuCtxGetDevice", cuerr) : get_cc(dev, &major, &minor, ctx->err);
+  }
+  if (e != GA_NO_ERROR) {  /* either the device lookup or the compute-capability query failed */
+    cuda_exit(ctx);
+    free(handle);  /* handle is not stored in ctx yet, so it must be released here to avoid a leak */
+    return e;
+  }
+
+  /* Only try to use tensor core on cuda 9 and up */
+  if (ctx->major >= 9 && major >= 7 && minor >= 0) {
+    handle->tensorCore = 1;
+  } else {
+    handle->tensorCore = 0;
+  }
+
   err = cublasCreate(&handle->h);
   if (err != CUBLAS_STATUS_SUCCESS) {
     cuda_exit(ctx);
     free(handle);
-    return GA_BLAS_ERROR;
+    return error_cublas(ctx->err, "cublasCreate", err);
   }
 
   err = cublasSetStream(handle->h, ctx->s);
   if (err != CUBLAS_STATUS_SUCCESS) {
-    e = GA_BLAS_ERROR;
+    e = error_cublas(ctx->err, "cublasSetStream", err);
     goto e1;
   }
 
-  cublasSetPointerMode(handle->h, CUBLAS_POINTER_MODE_HOST);
-  cublasSetAtomicsMode(handle->h, CUBLAS_ATOMICS_ALLOWED);
+  err = cublasSetPointerMode(handle->h, CUBLAS_POINTER_MODE_HOST);
+  if (err != CUBLAS_STATUS_SUCCESS) {
+    e = error_cublas(ctx->err, "cublasSetPointerMode", err);
+    goto e1;
+  }
 
   types[0] = GA_BUFFER;
   types[1] = GA_SIZE;
@@ -208,13 +264,9 @@ static int setup(gpucontext *c) {
   if (e != GA_NO_ERROR) goto e1;
   e = GpuKernel_init(&handle->sgemvBH_T_a1_b1_small, c, 1, &code_sgemvBH_T_a1_b1_small, NULL, "sgemv", 9, types, 0, NULL);
   if (e != GA_NO_ERROR) goto e2;
-  tmp[0] = atomicadd_double;
-  tmp[1] = code_dgemvBH_N_a1_b1_small;
-  e = GpuKernel_init(&handle->dgemvBH_N_a1_b1_small, c, 2, tmp, NULL, "dgemv", 9, types, GA_USE_DOUBLE, NULL);
+  e = GpuKernel_init(&handle->dgemvBH_N_a1_b1_small, c, 1, &code_dgemvBH_N_a1_b1_small, NULL, "dgemv", 9, types, GA_USE_DOUBLE, NULL);
   if (e != GA_NO_ERROR) goto e3;
-  tmp[0] = atomicadd_double;
-  tmp[1] = code_dgemvBH_T_a1_b1_small;
-  e = GpuKernel_init(&handle->dgemvBH_T_a1_b1_small, c, 2, tmp, NULL, "dgemv", 9, types, GA_USE_DOUBLE, NULL);
+  e = GpuKernel_init(&handle->dgemvBH_T_a1_b1_small, c, 1, &code_dgemvBH_T_a1_b1_small, NULL, "dgemv", 9, types, GA_USE_DOUBLE, NULL);
   if (e != GA_NO_ERROR) goto e4;
 
   types[0] = GA_BUFFER;
@@ -230,9 +282,7 @@ static int setup(gpucontext *c) {
   e = GpuKernel_init(&handle->sgerBH_gen_small, c, 1, &code_sgerBH_gen_small, NULL, "_sgerBH_gen_small", 10, types, 0, NULL);
   if (e != GA_NO_ERROR) goto e5;
   types[4] = GA_DOUBLE;
-  tmp[0] = atomicadd_double;
-  tmp[1] = code_dgerBH_gen_small;
-  e = GpuKernel_init(&handle->dgerBH_gen_small, c, 2, tmp, NULL, "_dgerBH_gen_small", 10, types, GA_USE_DOUBLE, NULL);
+  e = GpuKernel_init(&handle->dgerBH_gen_small, c, 1, &code_dgerBH_gen_small, NULL, "_dgerBH_gen_small", 10, types, GA_USE_DOUBLE, NULL);
   if (e != GA_NO_ERROR) goto e6;
 
   ctx->blas_handle = handle;
@@ -278,39 +328,6 @@ static void teardown(gpucontext *c) {
   ctx->blas_handle = NULL;
 }
 
-static const char *error(gpucontext *c) {
-  cuda_context *ctx = (cuda_context *)c;
-  blas_handle *handle = (blas_handle *)ctx->blas_handle;
-
-  if (handle != NULL) {
-    switch (handle->err) {
-    case CUBLAS_STATUS_SUCCESS:
-      return "(cublas) Operation completed successfully.";
-    case CUBLAS_STATUS_NOT_INITIALIZED:
-      return "(cublas) Library not initialized.";
-    case CUBLAS_STATUS_ALLOC_FAILED:
-      return "(cublas) GPU ressource allocation failed.";
-    case CUBLAS_STATUS_INVALID_VALUE:
-      return "(cublas) Invalid value.";
-    case CUBLAS_STATUS_ARCH_MISMATCH:
-      return "(cublas) Operation not supported by device.";
-    case CUBLAS_STATUS_MAPPING_ERROR:
-      return "(cublas) Mapping error.";
-    case CUBLAS_STATUS_EXECUTION_FAILED:
-      return "(cublas) Execution failed.";
-    case CUBLAS_STATUS_INTERNAL_ERROR:
-      return "(cublas) Internal error.";
-    case CUBLAS_STATUS_NOT_SUPPORTED:
-      return "(cublas) Unsupported functionality.";
-    case CUBLAS_STATUS_LICENSE_ERROR:
-      return "(cublas) License error.";
-    default:
-      return "(cublas) Unknown error.";
-    }
-  }
-  return "Blas handle not initialized, API error.";
-}
-
 static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                  size_t M, size_t N, size_t K, float alpha,
                  gpudata *A, size_t offA, size_t lda,
@@ -326,6 +343,11 @@ static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB,
   ASSERT_BUF(B);
   ASSERT_BUF(C);
 
+  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) ||
+      LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) ||
+      LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N))
+    return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface");
+
   if (order == cb_c) {
     /* swap A and B */
     t = N;
@@ -351,17 +373,11 @@ static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB,
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ));
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL));
 
-  h->err = cublasSgemm(h->h,
-                       convT(transA), convT(transB), M, N, K,
-                       &alpha, ((float *)A->ptr) + offA, lda,
-                       ((float *)B->ptr) + offB, ldb, &beta,
-                       ((float *)C->ptr) + offC, ldc);
-  if (h->err != CUBLAS_STATUS_SUCCESS) {
-    cuda_exit(ctx);
-    if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
-      return GA_DEVSUP_ERROR;
-    return GA_BLAS_ERROR;
-  }
+  CUBLAS_EXIT_ON_ERROR(ctx, cublasSgemm(h->h,
+                                        convT(transA), convT(transB), M, N, K,
+                                        &alpha, ((float *)A->ptr) + offA, lda,
+                                        ((float *)B->ptr) + offB, ldb, &beta,
+                                        ((float *)C->ptr) + offC, ldc));
 
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ));
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ));
@@ -386,6 +402,11 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
   ASSERT_BUF(B);
   ASSERT_BUF(C);
 
+  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) ||
+      LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) ||
+      LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N))
+    return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface");
+
   if (order == cb_c) {
     /* swap A and B */
     t = N;
@@ -411,17 +432,11 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ));
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL));
 
-  h->err = cublasDgemm(h->h,
-                       convT(transA), convT(transB), M, N, K,
-                       &alpha, ((double *)A->ptr) + offA, lda,
-                       ((double *)B->ptr) + offB, ldb, &beta,
-                       ((double *)C->ptr) + offC, ldc);
-  if (h->err != CUBLAS_STATUS_SUCCESS) {
-    cuda_exit(ctx);
-    if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
-      return GA_DEVSUP_ERROR;
-    return GA_BLAS_ERROR;
-  }
+  CUBLAS_EXIT_ON_ERROR(ctx, cublasDgemm(h->h,
+                                        convT(transA), convT(transB), M, N, K,
+                                        &alpha, ((double *)A->ptr) + offA, lda,
+                                        ((double *)B->ptr) + offB, ldb, &beta,
+                                        ((double *)C->ptr) + offC, ldc));
 
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ));
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ));
@@ -436,7 +451,6 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                  gpudata *A, size_t offA, size_t lda,
                  gpudata *B, size_t offB, size_t ldb,
                  float beta, gpudata *C, size_t offC, size_t ldc) {
-#ifdef HAVE_CUBLAS_SGEMMEX
   /* This will use float32 for computation as it's the best we can
    * have right now. In the future when native float16 support will be
    * there we will switch to that. */
@@ -450,6 +464,14 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB,
   ASSERT_BUF(B);
   ASSERT_BUF(C);
 
+  if (cublasSgemmEx == NULL && (cublasGemmEx == NULL || h->tensorCore == 0))
+    return error_set(ctx->err, GA_DEVSUP_ERROR, "cublasSgemmEx|cublasGemmEx unavailable");
+
+  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) ||
+      LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) ||
+      LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N))
+    return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface");
+
   if (order == cb_c) {
     /* swap A and B */
     t = N;
@@ -475,32 +497,109 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB,
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ));
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL));
 
-  h->err = cublasSgemmEx(h->h,
-                         convT(transA), convT(transB), M, N, K,
-                         &alpha, ((uint16_t *)A->ptr) + offA,
-#if CUDA_VERSION >= 8000
-                         CUDA_R_16F,
-#else
-                         CUBLAS_DATA_HALF,
-#endif
-                         lda, ((uint16_t *)B->ptr) + offB,
-#if CUDA_VERSION >= 8000
-                         CUDA_R_16F,
-#else
-                         CUBLAS_DATA_HALF,
-#endif
-                         ldb, &beta, ((uint16_t *)C->ptr) + offC,
-#if CUDA_VERSION >= 8000
-                         CUDA_R_16F,
-#else
-                         CUBLAS_DATA_HALF,
-#endif
-                         ldc);
-  if (h->err != CUBLAS_STATUS_SUCCESS) {
+  if (cublasGemmEx != NULL && h->tensorCore) {
+    CUBLAS_EXIT_ON_ERROR(ctx, cublasGemmEx(h->h, convT(transA), convT(transB),
+					   M, N, K,
+					   &alpha, ((uint16_t *)A->ptr) + offA,
+					   CUDA_R_16F,
+					   lda, ((uint16_t *)B->ptr) + offB,
+					   CUDA_R_16F,
+					   ldb, &beta, ((uint16_t *)C->ptr) + offC,
+					   CUDA_R_16F,
+					   ldc,
+					   CUDA_R_32F,
+					   CUBLAS_GEMM_DFALT_TENSOR_OP));
+  } else {
+    CUBLAS_EXIT_ON_ERROR(ctx, cublasSgemmEx(h->h, convT(transA), convT(transB),
+					    M, N, K,
+					    &alpha, ((uint16_t *)A->ptr) + offA,
+					    CUDA_R_16F,
+					    lda, ((uint16_t *)B->ptr) + offB,
+					    CUDA_R_16F,
+					    ldb, &beta, ((uint16_t *)C->ptr) + offC,
+					    CUDA_R_16F,
+					    ldc));
+  }
+
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ));
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ));
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C, CUDA_WAIT_ALL));
+
+  cuda_exit(ctx);
+  return GA_NO_ERROR;
+}
+
+static int hgemm3D(cb_order order, cb_transpose transA, cb_transpose transB,
+                   size_t M, size_t N, size_t K, float alpha,
+                   gpudata *A, size_t offA, size_t lda, ssize_t strideA,
+                   gpudata *B, size_t offB, size_t ldb, ssize_t strideB,
+                   float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC,
+                   size_t batchCount) {
+  cuda_context *ctx;
+  blas_handle *h;
+  size_t  t;
+  ssize_t st;
+  gpudata *T;
+  cb_transpose transT;
+  cublasStatus_t err;
+  ga_half_t halpha, hbeta;
+
+  ASSERT_BUF(A);
+  ASSERT_BUF(B);
+  ASSERT_BUF(C);
+
+  ctx = A->ctx;
+
+  if (cublasHgemmStridedBatched == NULL)
+    return error_set(ctx->err, GA_DEVSUP_ERROR, "cublasHgemmStridedBatched not available in your version of cuBLAS");
+
+  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) ||
+      LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) ||
+      LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N))
+    return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface");
+
+  h = (blas_handle *)ctx->blas_handle;
+  cuda_enter(ctx);
+
+  if (order == cb_c) {
+    /* swap A and B */
+    t = N;
+    N = M;
+    M = t;
+    T = A;
+    A = B;
+    B = T;
+    t = lda;
+    lda = ldb;
+    ldb = t;
+    t = offA;
+    offA = offB;
+    offB = t;
+    transT = transA;
+    transA = transB;
+    transB = transT;
+    st = strideA;
+    strideA = strideB;
+    strideB = st;
+  }
+
+  halpha = ga_float2half(alpha);
+  hbeta = ga_float2half(beta);
+
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ));
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ));
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL));
+  err = cublasHgemmStridedBatched(h->h,
+				  convT(transA), convT(transB),
+				  M, N, K, (__half *)&halpha,
+				  ((__half *)A->ptr) + offA, lda, strideA,
+				  ((__half *)B->ptr) + offB, ldb, strideB,
+				  (__half *)&hbeta,
+				  ((__half *)C->ptr) + offC, ldc, strideC,
+				  batchCount);
+  if (err != CUBLAS_STATUS_SUCCESS) {
     cuda_exit(ctx);
-    if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
-      return GA_DEVSUP_ERROR;
-    return GA_BLAS_ERROR;
+    return error_cublas(ctx->err, "cublasHgemmStridedBatched", err);
   }
 
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ));
@@ -509,18 +608,162 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB,
 
   cuda_exit(ctx);
   return GA_NO_ERROR;
-#else
-  return GA_DEVSUP_ERROR;
-#endif
 }
 
-static int hgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
-                      size_t M, size_t N, size_t K, float alpha,
-                      gpudata **A, size_t *offA, size_t lda,
-                      gpudata **B, size_t *offB, size_t ldb,
-                      float beta, gpudata **C, size_t *offC, size_t ldc,
-                      size_t batchCount) {
-  return GA_DEVSUP_ERROR;
+static int sgemm3D(cb_order order, cb_transpose transA, cb_transpose transB,
+                   size_t M, size_t N, size_t K, float alpha,
+                   gpudata *A, size_t offA, size_t lda, ssize_t strideA,
+                   gpudata *B, size_t offB, size_t ldb, ssize_t strideB,
+                   float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC,
+                   size_t batchCount) {
+  cuda_context *ctx;
+  blas_handle *h;
+  size_t  t;
+  ssize_t st;
+  gpudata *T;
+  cb_transpose transT;
+  cublasStatus_t err;
+  /* Single-precision strided-batched GEMM via cublasSgemmStridedBatched; returns GA_NO_ERROR or an error code. */
+  ASSERT_BUF(A);
+  ASSERT_BUF(B);
+  ASSERT_BUF(C);
+
+  ctx = A->ctx;
+
+  if (cublasSgemmStridedBatched == NULL)
+    return error_set(ctx->err, GA_DEVSUP_ERROR, "cublasSgemmStridedBatched not available in your version of cuBLAS");
+  /* cuBLAS takes the sizes and the batch count as int: refuse anything that would not fit. */
+  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) ||
+      LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) ||
+      LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N) || LARGE_VAL(batchCount))
+    return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface");
+
+  h = (blas_handle *)ctx->blas_handle;
+  cuda_enter(ctx);
+
+  if (order == cb_c) {
+    /* cb_c order: exchange A and B together with their sizes, leading dims, offsets, transposes and strides */
+    t = N;
+    N = M;
+    M = t;
+    T = A;
+    A = B;
+    B = T;
+    t = lda;
+    lda = ldb;
+    ldb = t;
+    t = offA;
+    offA = offB;
+    offB = t;
+    transT = transA;
+    transA = transB;
+    transB = transT;
+    st = strideA;
+    strideA = strideB;
+    strideB = st;
+  }
+
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ));
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ));
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL));
+
+  err = cublasSgemmStridedBatched(h->h,
+				  convT(transA), convT(transB),
+				  M, N, K, &alpha,
+				  ((float *)A->ptr) + offA, (int)lda, strideA,
+				  ((float *)B->ptr) + offB, (int)ldb, strideB,
+				  &beta,
+				  ((float *)C->ptr) + offC, (int)ldc, strideC,
+				  batchCount);
+  if (err != CUBLAS_STATUS_SUCCESS) {
+    cuda_exit(ctx);
+    return error_cublas(ctx->err, "cublasSgemmStridedBatched", err);  /* label names the call actually made */
+  }
+
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ));
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ));
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C, CUDA_WAIT_ALL));
+
+  cuda_exit(ctx);
+  return GA_NO_ERROR;
+}
+
+static int dgemm3D(cb_order order, cb_transpose transA, cb_transpose transB,
+                   size_t M, size_t N, size_t K, double alpha,
+                   gpudata *A, size_t offA, size_t lda, ssize_t strideA,
+                   gpudata *B, size_t offB, size_t ldb, ssize_t strideB,
+                   double beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC,
+                   size_t batchCount) {
+  cuda_context *ctx;
+  blas_handle *h;
+  size_t  t;
+  ssize_t st;
+  gpudata *T;
+  cb_transpose transT;
+  cublasStatus_t err;
+  /* Double-precision strided-batched GEMM via cublasDgemmStridedBatched; returns GA_NO_ERROR or an error code. */
+  ASSERT_BUF(A);
+  ASSERT_BUF(B);
+  ASSERT_BUF(C);
+
+  ctx = A->ctx;
+
+  if (cublasDgemmStridedBatched == NULL)
+    return error_set(ctx->err, GA_DEVSUP_ERROR, "cublasDgemmStridedBatched not available in your version of cuBLAS");
+  /* cuBLAS takes the sizes and the batch count as int: refuse anything that would not fit. */
+  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) ||
+      LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) ||
+      LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N) || LARGE_VAL(batchCount))
+    return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface");
+
+  h = (blas_handle *)ctx->blas_handle;
+  cuda_enter(ctx);
+
+  if (order == cb_c) {
+    /* cb_c order: exchange A and B together with their sizes, leading dims, offsets, transposes and strides */
+    t = N;
+    N = M;
+    M = t;
+    T = A;
+    A = B;
+    B = T;
+    t = lda;
+    lda = ldb;
+    ldb = t;
+    t = offA;
+    offA = offB;
+    offB = t;
+    transT = transA;
+    transA = transB;
+    transB = transT;
+    st = strideA;
+    strideA = strideB;
+    strideB = st;
+  }
+
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ));
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ));
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL));
+
+  err = cublasDgemmStridedBatched(h->h,
+				  convT(transA), convT(transB),
+				  M, N, K, &alpha,
+				  ((double *)A->ptr) + offA, (int)lda, strideA,
+				  ((double *)B->ptr) + offB, (int)ldb, strideB,
+				  &beta,
+				  ((double *)C->ptr) + offC, (int)ldc, strideC,
+				  batchCount);
+  if (err != CUBLAS_STATUS_SUCCESS) {
+    cuda_exit(ctx);
+    return error_cublas(ctx->err, "cublasDgemmStridedBatched", err);
+  }
+
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ));
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ));
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C, CUDA_WAIT_ALL));
+
+  cuda_exit(ctx);
+  return GA_NO_ERROR;
+}
 
 static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
@@ -537,10 +780,14 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
   const size_t threshold = 650;
   cb_transpose transT;
 
-  if (batchCount == 0) return GA_NO_ERROR;
-
   ASSERT_BUF(A[0]);
   ctx = A[0]->ctx;
+
+  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) ||
+      LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) ||
+      LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N))
+    return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface");
+
   h = (blas_handle *)ctx->blas_handle;
   cuda_enter(ctx);
 
@@ -574,19 +821,13 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
       GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B[i], CUDA_WAIT_READ));
       GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C[i], CUDA_WAIT_ALL));
 
-      h->err = cublasSgemm(h->h,
-                           convT(transA), convT(transB),
-                           M, N, K, &alpha,
-                           (float*)A[i]->ptr + offA[i], lda,
-                           (float*)B[i]->ptr + offB[i], ldb,
-                           &beta,
-                           (float*)C[i]->ptr + offC[i], ldc);
-      if (h->err != CUBLAS_STATUS_SUCCESS) {
-        cuda_exit(ctx);
-        if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
-          return GA_DEVSUP_ERROR;
-        return GA_BLAS_ERROR;
-      }
+      CUBLAS_EXIT_ON_ERROR(ctx, cublasSgemm(h->h,
+                                            convT(transA), convT(transB),
+                                            M, N, K, &alpha,
+                                            ((float*)A[i]->ptr) + offA[i], lda,
+                                            ((float*)B[i]->ptr) + offB[i], ldb,
+                                            &beta,
+                                            ((float*)C[i]->ptr) + offC[i], ldc));
 
       GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ));
       GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B[i], CUDA_WAIT_READ));
@@ -597,7 +838,9 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
     const float **A_l = (const float **)T_l;
     const float **B_l = (const float **)T_l + batchCount;
     float **C_l = T_l + (batchCount * 2);
-    CUdeviceptr Ta, Aa, Ba, Ca;
+    gpudata *Ta;
+    CUdeviceptr Aa, Ba, Ca;
+    cublasStatus_t err;
 
     for (i = 0; i < batchCount; i++) {
       ASSERT_BUF(A[i]);
@@ -611,25 +854,43 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
       C_l[i] = ((float *)C[i]->ptr) + offC[i];
     }
 
-    cuMemAlloc(&Ta, sizeof(float *) * batchCount * 3);
-    Aa = Ta;
-    Ba = Ta + (batchCount * sizeof(float *));
-    Ca = Ta + (batchCount * sizeof(float *) * 2);
-
-    cuMemcpyHtoD(Ta, T_l, sizeof(float *) * batchCount * 3);
-
-    h->err = cublasSgemmBatched(h->h,
-                                convT(transA), convT(transB),
-                                M, N, K, &alpha,
-                                (const float **)Aa, lda,
-                                (const float **)Ba, ldb, &beta,
-                                (float **)Ca, ldc, batchCount);
-    cuMemFree(Ta);
-    if (h->err != CUBLAS_STATUS_SUCCESS) {
+    Ta = gpudata_alloc((gpucontext *)ctx, sizeof(float *) * batchCount * 3,
+                       NULL, 0, NULL);
+    if (Ta == NULL) {
       cuda_exit(ctx);
-      if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
-        return GA_DEVSUP_ERROR;
-      return GA_BLAS_ERROR;
+      return ctx->err->code;
+    }
+    Aa = *(CUdeviceptr *)Ta;
+    Ba = Aa + (batchCount * sizeof(float *));
+    Ca = Aa + (batchCount * sizeof(float *) * 2);
+
+    if (gpudata_write(Ta, 0, T_l, sizeof(float *) * batchCount * 3) != GA_NO_ERROR) {
+      gpudata_release(Ta);
+      cuda_exit(ctx);
+      return ctx->err->code;
+    }
+
+    if (cuda_wait(Ta, CUDA_WAIT_READ) != GA_NO_ERROR) {
+      gpudata_release(Ta);
+      cuda_exit(ctx);
+      return ctx->err->code;
+    }
+
+    err = cublasSgemmBatched(h->h,
+                             convT(transA), convT(transB),
+                             M, N, K, &alpha,
+                             (const float **)Aa, lda,
+                             (const float **)Ba, ldb, &beta,
+                             (float **)Ca, ldc, batchCount);
+    if (cuda_record(Ta, CUDA_WAIT_READ) != GA_NO_ERROR) {
+      gpudata_release(Ta);
+      cuda_exit(ctx);
+      return ctx->err->code;
+    }
+    gpudata_release(Ta);
+    if (err != CUBLAS_STATUS_SUCCESS) {
+      cuda_exit(ctx);
+      return error_cublas(ctx->err, "cublasSgemmBatched", err);
     }
 
     for (i = 0; i < batchCount; i++) {
@@ -657,10 +918,14 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
   const size_t threshold = 650;
   cb_transpose transT;
 
-  if (batchCount == 0) return GA_NO_ERROR;
-
   ASSERT_BUF(A[0]);
   ctx = A[0]->ctx;
+
+  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) ||
+      LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) ||
+      LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N))
+    return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface");
+
   h = (blas_handle *)ctx->blas_handle;
   cuda_enter(ctx);
 
@@ -694,19 +959,13 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
       GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B[i], CUDA_WAIT_READ));
       GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C[i], CUDA_WAIT_ALL));
 
-      h->err = cublasDgemm(h->h,
-                           convT(transA), convT(transB),
-                           M, N, K, &alpha,
-                           (double*)A[i]->ptr + offA[i], lda,
-                           (double*)B[i]->ptr + offB[i], ldb,
-                           &beta,
-                           (double*)C[i]->ptr + offC[i], ldc);
-      if (h->err != CUBLAS_STATUS_SUCCESS) {
-        cuda_exit(ctx);
-        if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
-          return GA_DEVSUP_ERROR;
-        return GA_BLAS_ERROR;
-      }
+      CUBLAS_EXIT_ON_ERROR(ctx, cublasDgemm(h->h,
+                                            convT(transA), convT(transB),
+                                            M, N, K, &alpha,
+                                            (double*)A[i]->ptr + offA[i], lda,
+                                            (double*)B[i]->ptr + offB[i], ldb,
+                                            &beta,
+                                            (double*)C[i]->ptr + offC[i], ldc));
 
       GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ));
       GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B[i], CUDA_WAIT_READ));
@@ -717,7 +976,9 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
     const double **A_l = (const double **)T_l;
     const double **B_l = (const double **)T_l + batchCount;
     double **C_l = T_l + (batchCount * 2);
-    CUdeviceptr Ta, Aa, Ba, Ca;
+    gpudata *Ta;
+    CUdeviceptr Aa, Ba, Ca;
+    cublasStatus_t err;
 
     for (i = 0; i < batchCount; i++) {
       ASSERT_BUF(A[i]);
@@ -731,25 +992,45 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
       C_l[i] = ((double *)C[i]->ptr) + offC[i];
     }
 
-    cuMemAlloc(&Ta, sizeof(double *) * batchCount * 3);
-    Aa = Ta;
-    Ba = Ta + (batchCount * sizeof(double *));
-    Ca = Ta + (batchCount * sizeof(double *) * 2);
-
-    cuMemcpyHtoD(Ta, T_l, sizeof(double *) * batchCount * 3);
-
-    h->err = cublasDgemmBatched(h->h,
-                                convT(transA), convT(transB),
-                                M, N, K, &alpha,
-                                (const double **)Aa, lda,
-                                (const double **)Ba, ldb, &beta,
-                                (double **)Ca, ldc, batchCount);
-    cuMemFree(Ta);
-    if (h->err != CUBLAS_STATUS_SUCCESS) {
+    Ta = gpudata_alloc((gpucontext *)ctx, sizeof(double *) * batchCount * 3,
+                       NULL, 0, NULL);
+    if (Ta == NULL) {
       cuda_exit(ctx);
-      if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
-        return GA_DEVSUP_ERROR;
-      return GA_BLAS_ERROR;
+      return ctx->err->code;
+    }
+    Aa = *(CUdeviceptr *)Ta;
+    Ba = Aa + (batchCount * sizeof(double *));
+    Ca = Aa + (batchCount * sizeof(double *) * 2);
+
+    if (gpudata_write(Ta, 0, T_l, sizeof(double *) * batchCount * 3) != GA_NO_ERROR) {
+      gpudata_release(Ta);
+      cuda_exit(ctx);
+      return ctx->err->code;
+    }
+
+    if (cuda_wait(Ta, CUDA_WAIT_READ) != GA_NO_ERROR) {
+      gpudata_release(Ta);
+      cuda_exit(ctx);
+      return ctx->err->code;
+    }
+
+    err = cublasDgemmBatched(h->h,
+                             convT(transA), convT(transB),
+                             M, N, K, &alpha,
+                             (const double **)Aa, lda,
+                             (const double **)Ba, ldb, &beta,
+                             (double **)Ca, ldc, batchCount);
+
+    if (cuda_record(Ta, CUDA_WAIT_READ) != GA_NO_ERROR) {
+      gpudata_release(Ta);
+      cuda_exit(ctx);
+      return ctx->err->code;
+    }
+    gpudata_release(Ta);
+
+    if (err != CUBLAS_STATUS_SUCCESS) {
+      cuda_exit(ctx);
+      return error_cublas(ctx->err, "cublasDgemmBatched", err);
     }
 
     for (i = 0; i < batchCount; i++) {
@@ -763,11 +1044,122 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
   return GA_NO_ERROR;
 }
 
-static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
-                 float alpha, gpudata *A, size_t offA, size_t lda,
-                 gpudata *X, size_t offX, int incX,
-                 float beta, gpudata *Y, size_t offY, int incY) {
-  return GA_DEVSUP_ERROR;
+/*
+ * Single-precision dot product of X and Y over N elements.  The scalar
+ * result is written by cuBLAS to Z + offZ, i.e. it stays on the device.
+ * offX/offY/offZ are added to float pointers, so they count elements.
+ *
+ * Fails early through error_set(GA_XLARGE_ERROR) when N is too large for
+ * the int-based cuBLAS interface; otherwise returns GA_NO_ERROR or the
+ * error produced by the failing wait/record/cuBLAS step.
+ */
+static int sdot(
+        size_t N,
+        gpudata *X, size_t offX, size_t incX,
+        gpudata *Y, size_t offY, size_t incY,
+        gpudata *Z, size_t offZ) {
+  cuda_context *ctx = X->ctx;
+  blas_handle *h = (blas_handle *)ctx->blas_handle;
+  cublasPointerMode_t pmode;
+  cublasStatus_t err;
+
+  ASSERT_BUF(X);
+  ASSERT_BUF(Y);
+  ASSERT_BUF(Z);
+
+  if (LARGE_VAL(N))
+    return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface");
+
+  cuda_enter(ctx);
+
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ));
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ));
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Z, CUDA_WAIT_WRITE));
+
+  /* The result must land in device memory, so switch the shared handle to
+   * device pointer mode for this call and restore the saved mode after. */
+  CUBLAS_EXIT_ON_ERROR(ctx, cublasGetPointerMode(h->h, &pmode));
+  CUBLAS_EXIT_ON_ERROR(ctx, cublasSetPointerMode(h->h, CUBLAS_POINTER_MODE_DEVICE));
+  err = cublasSdot(h->h, N,
+                   ((float*)X->ptr) + offX, incX,
+                   ((float*)Y->ptr) + offY, incY,
+                   ((float*)Z->ptr) + offZ);
+  if (err != CUBLAS_STATUS_SUCCESS) {
+    /* Restore the pointer mode before bailing out: the other wrappers pass
+     * host &alpha/&beta and would break if the handle stayed in device mode.
+     * Best effort only; the cublasSdot failure is what gets reported. */
+    cublasSetPointerMode(h->h, pmode);
+    cuda_exit(ctx);
+    return error_cublas(ctx->err, "cublasSdot", err);
+  }
+  CUBLAS_EXIT_ON_ERROR(ctx, cublasSetPointerMode(h->h, pmode));
+
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ));
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ));
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Z, CUDA_WAIT_WRITE));
+
+  cuda_exit(ctx);
+
+  return GA_NO_ERROR;
+}
+
+/*
+ * Double-precision dot product of X and Y over N elements.  The scalar
+ * result is written by cuBLAS to Z + offZ, i.e. it stays on the device.
+ * offX/offY/offZ are added to double pointers, so they count elements.
+ *
+ * Fails early through error_set(GA_XLARGE_ERROR) when N is too large for
+ * the int-based cuBLAS interface; otherwise returns GA_NO_ERROR or the
+ * error produced by the failing wait/record/cuBLAS step.
+ */
+static int ddot(
+        size_t N,
+        gpudata *X, size_t offX, size_t incX,
+        gpudata *Y, size_t offY, size_t incY,
+        gpudata *Z, size_t offZ) {
+  cuda_context *ctx = X->ctx;
+  blas_handle *h = (blas_handle *)ctx->blas_handle;
+  cublasPointerMode_t pmode;
+  cublasStatus_t err;
+
+  ASSERT_BUF(X);
+  ASSERT_BUF(Y);
+  ASSERT_BUF(Z);
+
+  if (LARGE_VAL(N))
+    return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface");
+
+  cuda_enter(ctx);
+
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ));
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ));
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Z, CUDA_WAIT_WRITE));
+
+  /* The result must land in device memory, so switch the shared handle to
+   * device pointer mode for this call and restore the saved mode after. */
+  CUBLAS_EXIT_ON_ERROR(ctx, cublasGetPointerMode(h->h, &pmode));
+  CUBLAS_EXIT_ON_ERROR(ctx, cublasSetPointerMode(h->h, CUBLAS_POINTER_MODE_DEVICE));
+  err = cublasDdot(h->h, N,
+                   ((double*)X->ptr) + offX, incX,
+                   ((double*)Y->ptr) + offY, incY,
+                   ((double*)Z->ptr) + offZ);
+  if (err != CUBLAS_STATUS_SUCCESS) {
+    /* Restore the pointer mode before bailing out: the other wrappers pass
+     * host &alpha/&beta and would break if the handle stayed in device mode.
+     * Best effort only; the cublasDdot failure is what gets reported. */
+    cublasSetPointerMode(h->h, pmode);
+    cuda_exit(ctx);
+    return error_cublas(ctx->err, "cublasDdot", err);
+  }
+  CUBLAS_EXIT_ON_ERROR(ctx, cublasSetPointerMode(h->h, pmode));
+
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ));
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ));
+  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Z, CUDA_WAIT_WRITE));
+
+  cuda_exit(ctx);
+
+  return GA_NO_ERROR;
 }
 
 static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
@@ -782,6 +1137,10 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
   ASSERT_BUF(X);
   ASSERT_BUF(Y);
 
+  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) ||
+      LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY))
+    return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface");
+
   if (order == cb_c) {
     t = N;
     N = M;
@@ -800,17 +1159,11 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ));
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_ALL));
 
-  h->err = cublasSgemv(h->h,
-                       convT(transA), M, N, &alpha,
-                       ((float *)A->ptr) + offA, lda,
-                       ((float *)X->ptr) + offX, incX,
-                       &beta, ((float *)Y->ptr) + offY, incY);
-  if (h->err != CUBLAS_STATUS_SUCCESS) {
-    cuda_exit(ctx);
-    if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
-      return GA_DEVSUP_ERROR;
-    return GA_BLAS_ERROR;
-  }
+  CUBLAS_EXIT_ON_ERROR(ctx, cublasSgemv(h->h,
+                                        convT(transA), M, N, &alpha,
+                                        ((float *)A->ptr) + offA, lda,
+                                        ((float *)X->ptr) + offX, incX,
+                                        &beta, ((float *)Y->ptr) + offY, incY));
 
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ));
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ));
@@ -833,6 +1186,10 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
   ASSERT_BUF(X);
   ASSERT_BUF(Y);
 
+  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) ||
+      LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY))
+    return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface");
+
   if (order == cb_c) {
     t = N;
     N = M;
@@ -851,17 +1208,11 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ));
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_ALL));
 
-  h->err = cublasDgemv(h->h,
-                       convT(transA), M, N, &alpha,
-                       ((double *)A->ptr) + offA, lda,
-                       ((double *)X->ptr) + offX, incX,
-                       &beta, ((double *)Y->ptr) + offY, incY);
-  if (h->err != CUBLAS_STATUS_SUCCESS) {
-    cuda_exit(ctx);
-    if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
-      return GA_DEVSUP_ERROR;
-    return GA_BLAS_ERROR;
-  }
+  CUBLAS_EXIT_ON_ERROR(ctx, cublasDgemv(h->h,
+                                        convT(transA), M, N, &alpha,
+                                        ((double *)A->ptr) + offA, lda,
+                                        ((double *)X->ptr) + offX, incX,
+                                        &beta, ((double *)Y->ptr) + offY, incY));
 
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ));
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ));
@@ -872,15 +1223,6 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
   return GA_NO_ERROR;
 }
 
-static int hgemvBatch(cb_order order, cb_transpose transA,
-                      size_t M, size_t N, float alpha,
-                      gpudata **A, size_t *offA, size_t lda,
-                      gpudata **x, size_t *offX, size_t incX,
-                      float beta, gpudata **y, size_t *offY, size_t incY,
-                      size_t batchCount, int flags) {
-  return GA_DEVSUP_ERROR;
-}
-
 static int sgemvBatch(cb_order order, cb_transpose transA,
                       size_t M, size_t N, float alpha,
                       gpudata **A, size_t *offA, size_t lda,
@@ -896,10 +1238,13 @@ static int sgemvBatch(cb_order order, cb_transpose transA,
   gpudata *Aa, *xa, *ya;
   int err;
 
-  if (flags != 0) return GA_INVALID_ERROR;
-  if (batchCount == 0) return GA_NO_ERROR;
+  ASSERT_BUF(A[0]);
 
-  if (alpha != 1.0 || beta != 1.0) return GA_UNSUPPORTED_ERROR;
+  ctx = A[0]->ctx;
+
+  if (flags != 0) return error_set(ctx->err, GA_INVALID_ERROR, "flags not set to 0");
+
+  if (alpha != 1.0 || beta != 1.0) return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Only alpha = 1 and beta = 1 are supported for now");
 
   if (M < 512) {
     ls[0] = 32;
@@ -928,10 +1273,6 @@ static int sgemvBatch(cb_order order, cb_transpose transA,
     }
   }
 
-  ASSERT_BUF(A[0]);
-
-  ctx = A[0]->ctx;
-
   cuda_enter(ctx);
 
   {
@@ -953,21 +1294,27 @@ static int sgemvBatch(cb_order order, cb_transpose transA,
     }
 
     Aa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(float *) * batchCount, A_l,
-                               GA_BUFFER_INIT, &err);
-    if (Aa == NULL)
-      return err;
+                               GA_BUFFER_INIT);
+    /* cuda_enter(ctx) is already in effect here, so every failure path below
+     * must call cuda_exit(ctx) before returning. */
+    if (Aa == NULL) {
+      cuda_exit(ctx);
+      return ctx->err->code;
+    }
     xa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(float *) * batchCount, x_l,
-                               GA_BUFFER_INIT, &err);
+                               GA_BUFFER_INIT);
     if (xa == NULL) {
       cuda_ops.buffer_release(Aa);
-      return err;
+      cuda_exit(ctx);
+      return ctx->err->code;
     }
     ya = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(float *) * batchCount, y_l,
-                               GA_BUFFER_INIT, &err);
+                               GA_BUFFER_INIT);
     if (ya == NULL) {
       cuda_ops.buffer_release(Aa);
       cuda_ops.buffer_release(xa);
-      return err;
+      cuda_exit(ctx);
+      return ctx->err->code;
     }
   }
 
@@ -982,9 +1323,9 @@ static int sgemvBatch(cb_order order, cb_transpose transA,
   args[8] = &N;
 
   if (transA == cb_no_trans) {
-    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_N_a1_b1_small, 2, ls, gs, 0, args);
+    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_N_a1_b1_small, 2, gs, ls, 0, args);
   } else {
-    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_T_a1_b1_small, 2, ls, gs, 0, args);
+    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_T_a1_b1_small, 2, gs, ls, 0, args);
   }
 
   cuda_ops.buffer_release(Aa);
@@ -1020,10 +1361,13 @@ static int dgemvBatch(cb_order order, cb_transpose transA,
   gpudata *Aa, *xa, *ya;
   int err;
 
-  if (flags != 0) return GA_INVALID_ERROR;
-  if (batchCount == 0) return GA_NO_ERROR;
+  ASSERT_BUF(A[0]);
+
+  ctx = A[0]->ctx;
 
-  if (alpha != 1.0 || beta != 1.0) return GA_UNSUPPORTED_ERROR;
+  if (flags != 0) return error_set(ctx->err, GA_INVALID_ERROR, "flags not set to 0");
+
+  if (alpha != 1.0 || beta != 1.0) return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Only alpha = 1 and beta = 1 are supported for now");
 
   if (M < 512) {
     ls[0] = 32;
@@ -1052,10 +1396,6 @@ static int dgemvBatch(cb_order order, cb_transpose transA,
     }
   }
 
-  ASSERT_BUF(A[0]);
-
-  ctx = A[0]->ctx;
-
   cuda_enter(ctx);
 
   {
@@ -1077,21 +1417,27 @@ static int dgemvBatch(cb_order order, cb_transpose transA,
     }
 
     Aa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount, A_l,
-                               GA_BUFFER_INIT, &err);
-    if (Aa == NULL)
-      return err;
+                               GA_BUFFER_INIT);
+    /* cuda_enter(ctx) is already in effect here, so every failure path below
+     * must call cuda_exit(ctx) before returning. */
+    if (Aa == NULL) {
+      cuda_exit(ctx);
+      return ctx->err->code;
+    }
     xa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount, x_l,
-                               GA_BUFFER_INIT, &err);
+                               GA_BUFFER_INIT);
     if (xa == NULL) {
       cuda_ops.buffer_release(Aa);
-      return err;
+      cuda_exit(ctx);
+      return ctx->err->code;
     }
     ya = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount, y_l,
-                               GA_BUFFER_INIT, &err);
+                               GA_BUFFER_INIT);
     if (ya == NULL) {
       cuda_ops.buffer_release(Aa);
       cuda_ops.buffer_release(xa);
-      return err;
+      cuda_exit(ctx);
+      return ctx->err->code;
     }
   }
 
@@ -1106,9 +1446,9 @@ static int dgemvBatch(cb_order order, cb_transpose transA,
   args[8] = &N;
 
   if (transA == cb_no_trans) {
-    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small, 2, ls, gs, 0, args);
+    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small, 2, gs, ls, 0, args);
   } else {
-    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small, 2, ls, gs, 0, args);
+    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small, 2, gs, ls, 0, args);
   }
 
   cuda_ops.buffer_release(Aa);
@@ -1131,12 +1471,6 @@ static int dgemvBatch(cb_order order, cb_transpose transA,
 }
 
 
-static int hger(cb_order order, size_t M, size_t N, float alpha, gpudata *X,
-                size_t offX, int incX, gpudata *Y, size_t offY, int incY,
-                gpudata *A, size_t offA, size_t lda) {
-  return GA_DEVSUP_ERROR;
-}
-
 static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X,
                 size_t offX, int incX, gpudata *Y, size_t offY, int incY,
                 gpudata *A, size_t offA, size_t lda) {
@@ -1149,6 +1483,10 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X,
   ASSERT_BUF(Y);
   ASSERT_BUF(A);
 
+  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) ||
+      LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY))
+    return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface");
+
   if (order == cb_c) {
     t = M;
     M = N;
@@ -1170,16 +1508,10 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X,
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ));
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_ALL));
 
-  h->err = cublasSger(h->h, M, N, &alpha,
-                                ((float *)X->ptr) + offX, incX,
-                                ((float *)Y->ptr) + offY, incY,
-                                ((float *)A->ptr) + offA, lda);
-  if (h->err != CUBLAS_STATUS_SUCCESS) {
-    cuda_exit(ctx);
-    if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
-      return GA_DEVSUP_ERROR;
-    return GA_BLAS_ERROR;
-  }
+  CUBLAS_EXIT_ON_ERROR(ctx, cublasSger(h->h, M, N, &alpha,
+                                       ((float *)X->ptr) + offX, incX,
+                                       ((float *)Y->ptr) + offY, incY,
+                                       ((float *)A->ptr) + offA, lda));
 
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ));
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ));
@@ -1202,6 +1534,10 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X,
   ASSERT_BUF(Y);
   ASSERT_BUF(A);
 
+  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) ||
+      LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY))
+    return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface");
+
   if (order == cb_c) {
     t = M;
     M = N;
@@ -1223,16 +1559,10 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X,
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ));
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_ALL));
 
-  h->err = cublasDger(h->h, M, N, &alpha,
-                                ((double *)X->ptr) + offX, incX,
-                                ((double *)Y->ptr) + offY, incY,
-                                ((double *)A->ptr) + offA, lda);
-  if (h->err != CUBLAS_STATUS_SUCCESS) {
-    cuda_exit(ctx);
-    if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
-      return GA_DEVSUP_ERROR;
-    return GA_BLAS_ERROR;
-  }
+  CUBLAS_EXIT_ON_ERROR(ctx, cublasDger(h->h, M, N, &alpha,
+                                       ((double *)X->ptr) + offX, incX,
+                                       ((double *)Y->ptr) + offY, incY,
+                                       ((double *)A->ptr) + offA, lda));
 
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ));
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ));
@@ -1243,14 +1573,6 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X,
   return GA_NO_ERROR;
 }
 
-static int hgerBatch(cb_order order, size_t M, size_t N, float alpha,
-                     gpudata **x, size_t *offX, size_t incX,
-                     gpudata **y, size_t *offY, size_t incY,
-                     gpudata **A, size_t *offA, size_t lda,
-                     size_t batchCount, int flags) {
-  return GA_DEVSUP_ERROR;
-}
-
 static int sgerBatch(cb_order order, size_t M, size_t N, float alpha,
                      gpudata **x, size_t *offX, size_t incX,
                      gpudata **y, size_t *offY, size_t incY,
@@ -1264,8 +1586,11 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha,
   gpudata *Aa, *xa, *ya;
   int err;
 
-  if (flags != 0) return GA_INVALID_ERROR;
-  if (batchCount == 0) return GA_NO_ERROR;
+  ASSERT_BUF(x[0]);
+
+  ctx = x[0]->ctx;
+
+  if (flags != 0) return error_set(ctx->err, GA_INVALID_ERROR, "flags is not 0");
 
   if (incX == 1) {
     if (ls[0] > 32) {
@@ -1288,7 +1613,7 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha,
   }
   if (gs[0] * gs[1] * gs[2] > 65535) {
     if (gs[0] * gs[1] > 65535)
-      return GA_VALUE_ERROR;
+      return error_set(ctx->err, GA_VALUE_ERROR, "Input too large");
     gs[2] = (65535 / (gs[0] * gs[1]));
   }
 
@@ -1307,10 +1632,6 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha,
     y = T;
   }
 
-  ASSERT_BUF(x[0]);
-
-  ctx = x[0]->ctx;
-
   cuda_enter(ctx);
 
   {
@@ -1332,21 +1653,27 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha,
     }
 
     Aa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(float *) * batchCount, A_l,
-                               GA_BUFFER_INIT, &err);
-    if (Aa == NULL)
-      return err;
+                               GA_BUFFER_INIT);
+    /* cuda_enter(ctx) is already in effect here, so every failure path below
+     * must call cuda_exit(ctx) before returning. */
+    if (Aa == NULL) {
+      cuda_exit(ctx);
+      return ctx->err->code;
+    }
     xa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(float *) * batchCount, x_l,
-                               GA_BUFFER_INIT, &err);
+                               GA_BUFFER_INIT);
     if (xa == NULL) {
       cuda_ops.buffer_release(Aa);
-      return err;
+      cuda_exit(ctx);
+      return ctx->err->code;
     }
     ya = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(float *) * batchCount, y_l,
-                               GA_BUFFER_INIT, &err);
+                               GA_BUFFER_INIT);
     if (ya == NULL) {
       cuda_ops.buffer_release(Aa);
       cuda_ops.buffer_release(xa);
-      return err;
+      cuda_exit(ctx);
+      return ctx->err->code;
     }
   }
 
@@ -1361,7 +1682,7 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha,
   args[8] = &M;
   args[9] = &N;
 
-  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, ls, gs, 0, args);
+  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, gs, ls, 0, args);
 
   cuda_ops.buffer_release(Aa);
   cuda_ops.buffer_release(xa);
@@ -1374,9 +1695,9 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha,
 
 
   for (i = 0; i < batchCount; i++) {
-    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ));
+    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_ALL));
     GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(x[i], CUDA_WAIT_READ));
-    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(y[i], CUDA_WAIT_ALL));
+    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(y[i], CUDA_WAIT_READ));
   }
 
   cuda_exit(ctx);
@@ -1396,8 +1717,11 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha,
   gpudata *Aa, *xa, *ya;
   int err;
 
-  if (flags != 0) return GA_INVALID_ERROR;
-  if (batchCount == 0) return GA_NO_ERROR;
+  ASSERT_BUF(x[0]);
+
+  ctx = x[0]->ctx;
+
+  if (flags != 0) return error_set(ctx->err, GA_INVALID_ERROR, "flags is not 0");
 
   if (incX == 1) {
     if (ls[0] > 32) {
@@ -1420,7 +1744,7 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha,
   }
   if (gs[0] * gs[1] * gs[2] > 65535) {
     if (gs[0] * gs[1] > 65535)
-      return GA_VALUE_ERROR;
+      return error_set(ctx->err, GA_VALUE_ERROR, "Input too large");
     gs[2] = (65535 / (gs[0] * gs[1]));
   }
 
@@ -1439,10 +1763,6 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha,
     y = T;
   }
 
-  ASSERT_BUF(x[0]);
-
-  ctx = x[0]->ctx;
-
   cuda_enter(ctx);
 
   {
@@ -1464,21 +1784,27 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha,
     }
 
     Aa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount, A_l,
-                               GA_BUFFER_INIT, &err);
-    if (Aa == NULL)
-      return err;
+                               GA_BUFFER_INIT);
+    /* cuda_enter(ctx) is already in effect here, so every failure path below
+     * must call cuda_exit(ctx) before returning. */
+    if (Aa == NULL) {
+      cuda_exit(ctx);
+      return ctx->err->code;
+    }
     xa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount, x_l,
-                               GA_BUFFER_INIT, &err);
+                               GA_BUFFER_INIT);
     if (xa == NULL) {
       cuda_ops.buffer_release(Aa);
-      return err;
+      cuda_exit(ctx);
+      return ctx->err->code;
     }
     ya = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount, y_l,
-                               GA_BUFFER_INIT, &err);
+                               GA_BUFFER_INIT);
     if (ya == NULL) {
       cuda_ops.buffer_release(Aa);
       cuda_ops.buffer_release(xa);
-      return err;
+      cuda_exit(ctx);
+      return ctx->err->code;
     }
   }
 
@@ -1493,7 +1813,10 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha,
   args[8] = &M;
   args[9] = &N;
 
-  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, ls, gs, 0, args);
+  /* Double-precision data needs the double kernel, not sgerBH_gen_small.
+   * NOTE(review): assumes blas_handle has a dgerBH_gen_small member, built
+   * like the dgemvBH_* kernels -- confirm against setup(). */
+  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgerBH_gen_small, 3, gs, ls, 0, args);
 
   cuda_ops.buffer_release(Aa);
   cuda_ops.buffer_release(xa);
@@ -1504,37 +1824,41 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha,
     return err;
   }
 
-
   for (i = 0; i < batchCount; i++) {
-    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ));
+    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_ALL));
     GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(x[i], CUDA_WAIT_READ));
-    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(y[i], CUDA_WAIT_ALL));
+    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(y[i], CUDA_WAIT_READ));
   }
 
   cuda_exit(ctx);
   return GA_NO_ERROR;
 }
 
-GPUARRAY_LOCAL gpuarray_blas_ops cublas_ops = {
+gpuarray_blas_ops cublas_ops = {
   setup,
   teardown,
-  error,
-  hgemv, /* TODO */
+  NULL, /* hdot */
+  sdot,
+  ddot,
+  NULL, /* hgemv */
   sgemv,
   dgemv,
   hgemm,
   sgemm,
   dgemm,
-  hger, /* TODO */
+  NULL, /* hger */
   sger,
   dger,
-  hgemmBatch, /* TODO */
+  NULL, /* hgemmBatch */
   sgemmBatch,
   dgemmBatch,
-  hgemvBatch, /* TODO */
+  NULL, /* hgemvBatch */
   sgemvBatch,
   dgemvBatch,
-  hgerBatch, /* TODO */
+  NULL, /* hgerBatch */
   sgerBatch,
-  dgerBatch
+  dgerBatch,
+  hgemm3D,
+  sgemm3D,
+  dgemm3D
 };
diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c
index 0c13a70fef..d85526df1a 100644
--- a/src/gpuarray_blas_opencl_clblas.c
+++ b/src/gpuarray_blas_opencl_clblas.c
@@ -1,11 +1,13 @@
 #include "private.h"
 #include "private_opencl.h"
 
-#include <clBLAS.h>
+#include "loaders/libclblas.h"
 
 #include "gpuarray/buffer_blas.h"
 #include "gpuarray/error.h"
 
+extern const gpuarray_buffer_ops opencl_ops;
+
 static inline clblasOrder convO(cb_order order) {
   switch (order) {
   case cb_row:
@@ -32,13 +34,68 @@ static inline clblasTranspose convT(cb_transpose trans) {
 
 static unsigned int refcnt = 0;
 
-static int setup(gpucontext *ctx) {
-  clblasStatus err;
+static const char *estr(clblasStatus err) {
+  if (err > -900) {
+    if (err == CL_INVALID_DEVICE) {
+      return "Invalid device, or double precision requested on a device that does not support double precision";
+    }
+    return cl_error_string((cl_int)err);
+  }
+  switch (err) {
+  case clblasNotImplemented:
+    return "Unimplemented feature";
+  case clblasNotInitialized:
+    return "Library not initialized";
+  case clblasInvalidMatA:
+    return "matrix A is not a valid memory object";
+  case clblasInvalidMatB:
+    return "matrix B is not a valid memory object";
+  case clblasInvalidMatC:
+    return "matrix C is not a valid memory object";
+  case clblasInvalidVecX:
+    return "vector X is not a valid memory object";
+  case clblasInvalidVecY:
+    return "vector Y is not a valid memory object";
+  case clblasInvalidDim:
+    return "An input dimension (M, N, K) is invalid";
+  case clblasInvalidLeadDimA:
+    return "leading dimension for A must not be less than the size of the first dimension";
+  case clblasInvalidLeadDimB:
+    return "leading dimension for B must not be less than the size of the second dimension";
+  case clblasInvalidLeadDimC:
+    return "leading dimension for C must not be less than the size of the third dimension";
+  case clblasInvalidIncX:
+    return "increment for X must not be 0";
+  case clblasInvalidIncY:
+    return "increment for Y must not be 0";
+  case clblasInsufficientMemMatA:
+    return "memory object for matrix A is too small";
+  case clblasInsufficientMemMatB:
+    return "memory object for matrix B is too small";
+  case clblasInsufficientMemMatC:
+    return "memory object for matrix C is too small";
+  case clblasInsufficientMemVecX:
+    return "memory object for vector X is too small";
+  case clblasInsufficientMemVecY:
+    return "memory object for vector Y is too small";
+  default:
+    return "Unknown error";
+  }
+}
+
+static inline int error_clblas(error *e, const char *msg, clblasStatus err) {
+  return error_fmt(e, GA_BLAS_ERROR, "%s: %s", msg, estr(err));
+}
 
+#define CLB_CHECK(e, cmd) do {                  \
+    clblasStatus err = (cmd);                   \
+    if (err != clblasSuccess)                   \
+      return error_clblas(e, #cmd, err);        \
+  } while (0)
+
+static int setup(gpucontext *ctx) {
   if (refcnt == 0) {
-    err = clblasSetup();
-    if (err != clblasSuccess)
-      return GA_BLAS_ERROR;
+    CLB_CHECK(ctx->err, clblasSetup());
   }
 
   if (ctx->blas_handle == NULL)
@@ -56,10 +113,6 @@ static void teardown(gpucontext *ctx) {
     clblasTeardown();
 }
 
-static const char *error(gpucontext *ctx) {
-  return "(clblas) error in blas call, no details for now.";
-}
-
 #define ARRAY_INIT(A)                           \
   if (A->ev != NULL)                            \
     evl[num_ev++] = A->ev
@@ -70,15 +123,6 @@ static const char *error(gpucontext *ctx) {
   A->ev = ev;                                   \
   clRetainEvent(A->ev)
 
-static int hgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
-                      size_t M, size_t N, size_t K, float alpha,
-                      gpudata **A, size_t *offA, size_t lda,
-                      gpudata **B, size_t *offB, size_t ldb,
-                      float beta, gpudata **C, size_t *offC, size_t ldc,
-                      size_t batchCount) {
-  return GA_DEVSUP_ERROR;
-}
-
 static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                       size_t M, size_t N, size_t K, float alpha,
                       gpudata **A, size_t *offA, size_t lda,
@@ -90,18 +134,18 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
   cl_event ev;
   size_t i;
   cl_uint num_ev = 0;
-  clblasStatus err;
 
   for (i = 0; i < batchCount; i++) {
+    num_ev = 0;
     ARRAY_INIT(A[i]);
     ARRAY_INIT(B[i]);
     ARRAY_INIT(C[i]);
-    err = clblasSgemm(convO(order), convT(transA), convT(transB), M, N, K,
-                      alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb,
-                      beta, C[i]->buf, offB[i], ldc, 1, &ctx->q,
-                      num_ev, num_ev == 0 ? NULL : evl, &ev);
-    if (err != clblasSuccess)
-      return GA_BLAS_ERROR;
+    CLB_CHECK(ctx->err, clblasSgemm(convO(order), convT(transA), convT(transB),
+                                    M, N, K,
+                                    alpha, A[i]->buf, offA[i], lda,
+                                    B[i]->buf, offB[i], ldb,
+                                    beta, C[i]->buf, offC[i], ldc, 1, &ctx->q,
+                                    num_ev, num_ev == 0 ? NULL : evl, &ev));
     ARRAY_FINI(A[i]);
     ARRAY_FINI(B[i]);
     ARRAY_FINI(C[i]);
@@ -122,18 +166,18 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
   cl_event ev;
   size_t i;
   cl_uint num_ev = 0;
-  clblasStatus err;
 
   for (i = 0; i < batchCount; i++) {
+    num_ev = 0;
     ARRAY_INIT(A[i]);
     ARRAY_INIT(B[i]);
     ARRAY_INIT(C[i]);
-    err = clblasDgemm(convO(order), convT(transA), convT(transB), M, N, K,
-                      alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb,
-                      beta, C[i]->buf, offB[i], ldc, 1, &ctx->q,
-                      num_ev, num_ev == 0 ? NULL : evl, &ev);
-    if (err != clblasSuccess)
-      return GA_BLAS_ERROR;
+    CLB_CHECK(ctx->err, clblasDgemm(convO(order), convT(transA), convT(transB),
+                                    M, N, K,
+                                    alpha, A[i]->buf, offA[i], lda,
+                                    B[i]->buf, offB[i], ldb,
+                                    beta, C[i]->buf, offC[i], ldc, 1, &ctx->q,
+                                    num_ev, num_ev == 0 ? NULL : evl, &ev));
     ARRAY_FINI(A[i]);
     ARRAY_FINI(B[i]);
     ARRAY_FINI(C[i]);
@@ -143,62 +187,84 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
   return GA_NO_ERROR;
 }
 
-static int hgemvBatch(cb_order order, cb_transpose transA,
-                      size_t M, size_t N, float alpha,
-                      gpudata **A, size_t *offA, size_t lda,
-                      gpudata **x, size_t *offX, size_t incX,
-                      float beta, gpudata **y, size_t *offY, size_t incY,
-                      size_t batchCount, int flags) {
-  return GA_DEVSUP_ERROR;
-}
+static int sdot(
+        size_t N,
+        gpudata *X, size_t offX, size_t incX,
+        gpudata *Y, size_t offY, size_t incY,
+        gpudata *Z, size_t offZ) {  /* wrapper around clblasSdot */
+  cl_ctx *ctx = X->ctx;
+  clblasStatus err;
+  cl_uint num_ev = 0;
+  cl_event evl[3];
+  cl_event ev;
+  gpudata *wbuf;  /* temporary buffer of N floats handed to clblasSdot */
 
-static int sgemvBatch(cb_order order, cb_transpose transA,
-                      size_t M, size_t N, float alpha,
-                      gpudata **A, size_t *offA, size_t lda,
-                      gpudata **x, size_t *offX, size_t incX,
-                      float beta, gpudata **y, size_t *offY, size_t incY,
-                      size_t batchCount, int flags) {
-  return GA_DEVSUP_ERROR;
-}
+  wbuf = opencl_ops.buffer_alloc((gpucontext*)ctx,
+                                 N*sizeof(float), NULL, GA_BUFFER_READ_WRITE);
+  if (wbuf == NULL)
+      return ctx->err->code;  /* presumably set by buffer_alloc -- confirm */
 
-static int dgemvBatch(cb_order order, cb_transpose transA,
-                      size_t M, size_t N, double alpha,
-                      gpudata **A, size_t *offA, size_t lda,
-                      gpudata **x, size_t *offX, size_t incX,
-                      double beta, gpudata **y, size_t *offY, size_t incY,
-                      size_t batchCount, int flags) {
-  return GA_DEVSUP_ERROR;
-}
+  ARRAY_INIT(X);
+  ARRAY_INIT(Y);
+  ARRAY_INIT(Z);
+
+  // TODO: a thread-safe static buffer or allocator?
+  err = clblasSdot(
+          N, Z->buf, offZ,
+          X->buf, offX, incX,
+          Y->buf, offY, incY,
+          wbuf->buf, 1, &ctx->q,
+          num_ev, num_ev ? evl : NULL, &ev);
+  opencl_ops.buffer_release(wbuf);  /* before the err check: no leak on failure */
+  if (err != clblasSuccess)
+    return error_clblas(ctx->err, "clblasSdot", err);
 
-static int hgerBatch(cb_order order, size_t M, size_t N, float alpha,
-                     gpudata **x, size_t *offX, size_t incX,
-                     gpudata **y, size_t *offY, size_t incY,
-                     gpudata **A, size_t *offA, size_t lda,
-                     size_t batchCount, int flags) {
-  return GA_DEVSUP_ERROR;
-}
+  ARRAY_FINI(X);
+  ARRAY_FINI(Y);
+  ARRAY_FINI(Z);
+  clReleaseEvent(ev);  /* give up this function's own reference to ev */
 
-static int sgerBatch(cb_order order, size_t M, size_t N, float alpha,
-                     gpudata **x, size_t *offX, size_t incX,
-                     gpudata **y, size_t *offY, size_t incY,
-                     gpudata **A, size_t *offA, size_t lda,
-                     size_t batchCount, int flags) {
-  return GA_DEVSUP_ERROR;
+  return GA_NO_ERROR;
 }
 
-static int dgerBatch(cb_order order, size_t M, size_t N, double alpha,
-                     gpudata **x, size_t *offX, size_t incX,
-                     gpudata **y, size_t *offY, size_t incY,
-                     gpudata **A, size_t *offA, size_t lda,
-                     size_t batchCount, int flags) {
-  return GA_DEVSUP_ERROR;
-}
+static int ddot(
+        size_t N,
+        gpudata *X, size_t offX, size_t incX,
+        gpudata *Y, size_t offY, size_t incY,
+        gpudata *Z, size_t offZ) {  /* wrapper around clblasDdot */
+  cl_ctx *ctx = X->ctx;
+  clblasStatus err;
+  cl_uint num_ev = 0;
+  cl_event evl[3];
+  cl_event ev;
+  gpudata *wbuf;  /* temporary buffer of N doubles handed to clblasDdot */
 
-static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
-                 float alpha, gpudata *A, size_t offA, size_t lda,
-                 gpudata *X, size_t offX, int incX, float beta,
-                 gpudata *Y, size_t offY, int incY) {
-  return GA_DEVSUP_ERROR;
+  wbuf = opencl_ops.buffer_alloc((gpucontext*)ctx,
+                                 N*sizeof(double), NULL, GA_BUFFER_READ_WRITE);
+  if (wbuf == NULL)
+      return ctx->err->code;  /* presumably set by buffer_alloc -- confirm */
+
+  ARRAY_INIT(X);
+  ARRAY_INIT(Y);
+  ARRAY_INIT(Z);
+
+  err = clblasDdot(
+          N, Z->buf, offZ,
+          X->buf, offX, incX,
+          Y->buf, offY, incY,
+          wbuf->buf, 1, &ctx->q,
+          num_ev, num_ev ? evl : NULL, &ev);
+  opencl_ops.buffer_release(wbuf);  /* before the err check: no leak on failure */
+  if (err != clblasSuccess)
+    return error_clblas(ctx->err, "clblasDdot", err);
+
+  ARRAY_FINI(X);
+  ARRAY_FINI(Y);
+  ARRAY_FINI(Z);
+
+  clReleaseEvent(ev);  /* give up this function's own reference to ev */
+
+  return GA_NO_ERROR;
 }
 
 static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
@@ -206,7 +272,6 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
                  gpudata *X, size_t offX, int incX, float beta,
                  gpudata *Y, size_t offY, int incY) {
   cl_ctx *ctx = A->ctx;
-  clblasStatus err;
   cl_uint num_ev = 0;
   cl_event evl[3];
   cl_event ev;
@@ -215,12 +280,10 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
   ARRAY_INIT(X);
   ARRAY_INIT(Y);
 
-  err = clblasSgemv(convO(order), convT(transA), M, N, alpha,
-                    A->buf, offA, lda, X->buf, offX, incX,
-                    beta, Y->buf, offY, incY, 1, &ctx->q,
-                    num_ev, num_ev == 0 ? NULL : evl, &ev);
-  if (err != clblasSuccess)
-    return GA_BLAS_ERROR;
+  CLB_CHECK(ctx->err, clblasSgemv(convO(order), convT(transA), M, N, alpha,
+                                  A->buf, offA, lda, X->buf, offX, incX,
+                                  beta, Y->buf, offY, incY, 1, &ctx->q,
+                                  num_ev, num_ev == 0 ? NULL : evl, &ev));
 
   ARRAY_FINI(A);
   ARRAY_FINI(X);
@@ -236,7 +299,6 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
                  gpudata *X, size_t offX, int incX, double beta,
                  gpudata *Y, size_t offY, int incY) {
   cl_ctx *ctx = A->ctx;
-  clblasStatus err;
   cl_uint num_ev = 0;
   cl_event evl[3];
   cl_event ev;
@@ -245,12 +307,10 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
   ARRAY_INIT(X);
   ARRAY_INIT(Y);
 
-  err = clblasDgemv(convO(order), convT(transA), M, N, alpha,
-                    A->buf, offA, lda, X->buf, offX, incX,
-                    beta, Y->buf, offY, incY, 1, &ctx->q,
-                    num_ev, num_ev == 0 ? NULL : evl, &ev);
-  if (err != clblasSuccess)
-    return GA_BLAS_ERROR;
+  CLB_CHECK(ctx->err, clblasDgemv(convO(order), convT(transA), M, N, alpha,
+                                  A->buf, offA, lda, X->buf, offX, incX,
+                                  beta, Y->buf, offY, incY, 1, &ctx->q,
+                                  num_ev, num_ev == 0 ? NULL : evl, &ev));
 
   ARRAY_FINI(A);
   ARRAY_FINI(X);
@@ -261,21 +321,12 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
   return GA_NO_ERROR;
 }
 
-static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB,
-                 size_t M, size_t N, size_t K, float alpha,
-                 gpudata *A, size_t offA, size_t lda,
-                 gpudata *B, size_t offB, size_t ldb, float beta,
-                 gpudata *C, size_t offC, size_t ldc) {
-  return GA_DEVSUP_ERROR;
-}
-
 static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                  size_t M, size_t N, size_t K, float alpha,
                  gpudata *A, size_t offA, size_t lda,
                  gpudata *B, size_t offB, size_t ldb, float beta,
                  gpudata *C, size_t offC, size_t ldc) {
   cl_ctx *ctx = A->ctx;
-  clblasStatus err;
   cl_uint num_ev = 0;
   cl_event evl[3];
   cl_event ev;
@@ -284,12 +335,11 @@ static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB,
   ARRAY_INIT(B);
   ARRAY_INIT(C);
 
-  err = clblasSgemm(convO(order), convT(transA), convT(transB), M, N, K,
-                    alpha, A->buf, offA, lda, B->buf, offB, ldb,
-                    beta, C->buf, offC, ldc, 1, &ctx->q,
-                    num_ev, num_ev == 0 ? NULL : evl, &ev);
-  if (err != clblasSuccess)
-    return GA_BLAS_ERROR;
+  CLB_CHECK(ctx->err, clblasSgemm(convO(order), convT(transA), convT(transB),
+                                  M, N, K,
+                                  alpha, A->buf, offA, lda, B->buf, offB, ldb,
+                                  beta, C->buf, offC, ldc, 1, &ctx->q,
+                                  num_ev, num_ev == 0 ? NULL : evl, &ev));
 
   ARRAY_FINI(A);
   ARRAY_FINI(B);
@@ -306,7 +356,6 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                  gpudata *B, size_t offB, size_t ldb, double beta,
                  gpudata *C, size_t offC, size_t ldc) {
   cl_ctx *ctx = A->ctx;
-  clblasStatus err;
   cl_uint num_ev = 0;
   cl_event evl[3];
   cl_event ev;
@@ -315,12 +364,11 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
   ARRAY_INIT(B);
   ARRAY_INIT(C);
 
-  err = clblasDgemm(convO(order), convT(transA), convT(transB), M, N, K,
-                    alpha, A->buf, offA, lda, B->buf, offB, ldb,
-                    beta, C->buf, offC, ldc, 1, &ctx->q,
-                    num_ev, num_ev == 0 ? NULL : evl, &ev);
-  if (err != clblasSuccess)
-    return GA_BLAS_ERROR;
+  CLB_CHECK(ctx->err, clblasDgemm(convO(order), convT(transA), convT(transB),
+                                  M, N, K,
+                                  alpha, A->buf, offA, lda, B->buf, offB, ldb,
+                                  beta, C->buf, offC, ldc, 1, &ctx->q,
+                                  num_ev, num_ev == 0 ? NULL : evl, &ev));
 
   ARRAY_FINI(A);
   ARRAY_FINI(B);
@@ -331,13 +379,6 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
   return GA_NO_ERROR;
 }
 
-static int hger(cb_order order, size_t M, size_t N, float alpha,
-                gpudata *X, size_t offX, int incX,
-                gpudata *Y, size_t offY, int incY,
-                gpudata *A, size_t offA, size_t lda) {
-  return GA_DEVSUP_ERROR;
-}
-
 static int sger(cb_order order, size_t M, size_t N, float alpha,
                 gpudata *X, size_t offX, int incX,
                 gpudata *Y, size_t offY, int incY,
@@ -346,17 +387,14 @@ static int sger(cb_order order, size_t M, size_t N, float alpha,
   cl_event evl[3];
   cl_event ev;
   cl_uint num_ev = 0;
-  clblasStatus err;
 
   ARRAY_INIT(X);
   ARRAY_INIT(Y);
   ARRAY_INIT(A);
 
-  err = clblasSger(convO(order), M, N, alpha, X->buf, offX, incX,
-                   Y->buf, offY, incY, A->buf, offA, lda, 1, &ctx->q,
-                   num_ev, num_ev == 0 ? NULL : evl, &ev);
-  if (err != clblasSuccess)
-    return GA_BLAS_ERROR;
+  CLB_CHECK(ctx->err, clblasSger(convO(order), M, N, alpha, X->buf, offX, incX,
+                                 Y->buf, offY, incY, A->buf, offA, lda, 1, &ctx->q,
+                                 num_ev, num_ev == 0 ? NULL : evl, &ev));
 
   ARRAY_FINI(X);
   ARRAY_FINI(Y);
@@ -375,17 +413,14 @@ static int dger(cb_order order, size_t M, size_t N, double alpha,
   cl_event evl[3];
   cl_event ev;
   cl_uint num_ev = 0;
-  clblasStatus err;
 
   ARRAY_INIT(X);
   ARRAY_INIT(Y);
   ARRAY_INIT(A);
 
-  err = clblasDger(convO(order), M, N, alpha, X->buf, offX, incX,
-                   Y->buf, offY, incY, A->buf, offA, lda, 1, &ctx->q,
-                   num_ev, num_ev == 0 ? NULL : evl, &ev);
-  if (err != clblasSuccess)
-    return GA_BLAS_ERROR;
+  CLB_CHECK(ctx->err, clblasDger(convO(order), M, N, alpha, X->buf, offX, incX,
+                                 Y->buf, offY, incY, A->buf, offA, lda, 1, &ctx->q,
+                                 num_ev, num_ev == 0 ? NULL : evl, &ev));
 
   ARRAY_FINI(X);
   ARRAY_FINI(Y);
@@ -396,26 +431,31 @@ static int dger(cb_order order, size_t M, size_t N, double alpha,
   return GA_NO_ERROR;
 }
 
-GPUARRAY_LOCAL gpuarray_blas_ops clblas_ops = {
+gpuarray_blas_ops clblas_ops = {
   setup,
   teardown,
-  error,
-  hgemv, /* TODO */
+  NULL, /* hdot */
+  sdot,
+  ddot,
+  NULL, /* hgemv */
   sgemv,
   dgemv,
-  hgemm, /* TODO */
+  NULL, /* hgemm */
   sgemm,
   dgemm,
-  hger, /* TODO */
+  NULL, /* hger */
   sger,
   dger,
-  hgemmBatch, /* TODO */
+  NULL, /* hgemmBatch */
   sgemmBatch,
   dgemmBatch,
-  hgemvBatch, /* TODO */
-  sgemvBatch, /* TODO */
-  dgemvBatch, /* TODO */
-  hgerBatch, /* TODO */
-  sgerBatch, /* TODO */
-  dgerBatch, /* TODO */
+  NULL, /* hgemvBatch */
+  NULL, /* sgemvBatch */
+  NULL, /* dgemvBatch */
+  NULL, /* hgerBatch */
+  NULL, /* sgerBatch */
+  NULL, /* dgerBatch */
+  NULL, /* hgemm3D */
+  NULL, /* sgemm3D */
+  NULL, /* dgemm3D */
 };
diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c
new file mode 100644
index 0000000000..a320edb4c2
--- /dev/null
+++ b/src/gpuarray_blas_opencl_clblast.c
@@ -0,0 +1,530 @@
+#include "private.h"
+#include "private_opencl.h"
+
+#include "loaders/libclblast.h"
+
+#include "gpuarray/buffer_blas.h"
+#include "gpuarray/error.h"
+
+/* Translate a cb_order into the CLBlast Layout constant it stands for.
+ * Anything other than cb_row or cb_column produces -1. */
+static inline Layout convO(cb_order order) {
+  if (order == cb_row)
+    return kRowMajor;
+  if (order == cb_column)
+    return kColMajor;
+  /* no matching Layout */
+  return -1;
+}
+
+/* Translate a cb_transpose into the CLBlast Transpose constant it stands
+ * for.  Values outside the three handled below produce -1. */
+static inline Transpose convT(cb_transpose trans) {
+  if (trans == cb_no_trans)
+    return kNo;
+  if (trans == cb_trans)
+    return kYes;
+  if (trans == cb_conj_trans)
+    return kConjugate;
+  /* no matching Transpose */
+  return -1;
+}
+
+/* Message for err: above -1024 via cl_error_string(), else the table below. */
+static const char *estr(CLBlastStatusCode err) {
+  if (err > -1024) return cl_error_string((cl_int)err);
+  switch (err) {
+  case CLBlastNotImplemented:
+    return "Unimplemented feature";
+  case CLBlastInvalidMatrixA:
+    return "matrix A is not a valid memory object";
+  case CLBlastInvalidMatrixB:
+    return "matrix B is not a valid memory object";
+  case CLBlastInvalidMatrixC:
+    return "matrix C is not a valid memory object";
+  case CLBlastInvalidVectorX:
+    return "vector X is not a valid memory object";
+  case CLBlastInvalidVectorY:
+    return "vector Y is not a valid memory object";
+  case CLBlastInvalidDimension:
+    return "An input dimension (M, N, K) is invalid";
+  case CLBlastInvalidLeadDimA:
+    return "leading dimension for A must not be less than the size of the first dimension";
+  case CLBlastInvalidLeadDimB:
+    return "leading dimension for B must not be less than the size of the second dimension";
+  case CLBlastInvalidLeadDimC:
+    return "leading dimension for C must not be less than the size of the third dimension";
+  case CLBlastInvalidIncrementX:
+    return "increment for X must not be 0";
+  case CLBlastInvalidIncrementY:
+    return "increment for Y must not be 0";
+  case CLBlastInsufficientMemoryA:
+    return "memory object for matrix A is too small";
+  case CLBlastInsufficientMemoryB:
+    return "memory object for matrix B is too small";
+  case CLBlastInsufficientMemoryC:
+    return "memory object for matrix C is too small";
+  case CLBlastInsufficientMemoryX:
+    return "memory object for vector X is too small";
+  case CLBlastInsufficientMemoryY:
+    return "memory object for vector Y is too small";
+  case CLBlastInvalidLocalMemUsage:
+    return "not enough local memory on the device";
+  case CLBlastNoHalfPrecision:
+    return "float16 is not supported on this device";
+  case CLBlastNoDoublePrecision:
+    return "float64 is not supported on this device";
+  case CLBlastInvalidVectorScalar:
+    return "unit-sized vector is not a valid memory object";
+  case CLBlastInsufficientMemoryScalar:
+    return "memory object for unit-sized vector is too small";
+  case CLBlastDatabaseError:
+    return "device entry not in database";
+  case CLBlastUnknownError:
+    return "Unspecified error";
+  case CLBlastUnexpectedError:
+    return "Unexpected error";
+  default:
+    return "Unknown error";
+  }
+}
+/* Report err on e as GA_BLAS_ERROR with the text "msg: estr(err)". */
+static inline int error_clblast(error *e, const char *msg,
+                                CLBlastStatusCode err) {
+  return error_fmt(e, GA_BLAS_ERROR, "%s: %s", msg, estr(err));
+}
+/* Run cmd once; if not kSuccess, `return` error_clblast() from the caller. */
+#define CLBT_CHECK(e, cmd) do {                 \
+    CLBlastStatusCode err = (cmd);              \
+    if (err != kSuccess)                        \
+      return error_clblast(e, #cmd, err);       \
+  } while (0)
+/* BLAS setup hook for CLBlast: does nothing and reports GA_NO_ERROR. */
+static int setup(gpucontext *ctx) {
+  return GA_NO_ERROR;
+}
+/* BLAS teardown hook for CLBlast: empty, mirroring the no-op setup(). */
+static void teardown(gpucontext *ctx) {
+}
+/* Block until the event attached to A (if any) has completed. */
+#define ARRAY_INIT(A) do {                      \
+    if ((A)->ev != NULL)                        \
+      clWaitForEvents(1, &(A)->ev);             \
+  } while (0)
+/* Swap A's event for `ev` (a local of the caller) and retain it. */
+#define ARRAY_FINI(A) do {                      \
+    if ((A)->ev != NULL) clReleaseEvent((A)->ev); \
+    (A)->ev = ev; clRetainEvent((A)->ev);       \
+  } while (0)
+/* Batched HGEMM: one CLBlastHgemm call per entry; no-op for an empty batch. */
+static int hgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
+                      size_t M, size_t N, size_t K, float alpha,
+                      gpudata **A, size_t *offA, size_t lda,
+                      gpudata **B, size_t *offB, size_t ldb,
+                      float beta, gpudata **C, size_t *offC, size_t ldc,
+                      size_t batchCount) {
+  cl_event ev;
+  size_t i;
+
+  for (i = 0; i < batchCount; i++) {
+    cl_ctx *ctx = A[i]->ctx;  /* not A[0]: batchCount may be 0 */
+    ARRAY_INIT(A[i]);
+    ARRAY_INIT(B[i]);
+    ARRAY_INIT(C[i]);
+    CLBT_CHECK(ctx->err, CLBlastHgemm(convO(order), convT(transA),
+                                      convT(transB), M, N, K,
+                                      float_to_half(alpha),
+                                      A[i]->buf, offA[i], lda,
+                                      B[i]->buf, offB[i], ldb,
+                                      float_to_half(beta),
+                                      C[i]->buf, offC[i], ldc, &ctx->q, &ev));
+    ARRAY_FINI(A[i]);
+    ARRAY_FINI(B[i]);
+    ARRAY_FINI(C[i]);
+    clReleaseEvent(ev);
+  }
+
+  return GA_NO_ERROR;
+}
+/* Batched SGEMM: one CLBlastSgemm call per entry; no-op for an empty batch. */
+static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
+                      size_t M, size_t N, size_t K, float alpha,
+                      gpudata **A, size_t *offA, size_t lda,
+                      gpudata **B, size_t *offB, size_t ldb,
+                      float beta, gpudata **C, size_t *offC, size_t ldc,
+                      size_t batchCount) {
+  cl_event ev;
+  size_t i;
+
+  for (i = 0; i < batchCount; i++) {
+    cl_ctx *ctx = A[i]->ctx;  /* not A[0]: batchCount may be 0 */
+    ARRAY_INIT(A[i]);
+    ARRAY_INIT(B[i]);
+    ARRAY_INIT(C[i]);
+    CLBT_CHECK(ctx->err, CLBlastSgemm(convO(order), convT(transA),
+                                      convT(transB), M, N, K,
+                                      alpha, A[i]->buf, offA[i], lda,
+                                      B[i]->buf, offB[i], ldb, beta,
+                                      C[i]->buf, offC[i], ldc, &ctx->q, &ev));
+    ARRAY_FINI(A[i]);
+    ARRAY_FINI(B[i]);
+    ARRAY_FINI(C[i]);
+    clReleaseEvent(ev);
+  }
+
+  return GA_NO_ERROR;
+}
+/* Batched DGEMM: one CLBlastDgemm call per entry; no-op for an empty batch. */
+static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
+                      size_t M, size_t N, size_t K, double alpha,
+                      gpudata **A, size_t *offA, size_t lda,
+                      gpudata **B, size_t *offB, size_t ldb,
+                      double beta, gpudata **C, size_t *offC, size_t ldc,
+                      size_t batchCount) {
+  cl_event ev;
+  size_t i;
+
+  for (i = 0; i < batchCount; i++) {
+    cl_ctx *ctx = A[i]->ctx;  /* not A[0]: batchCount may be 0 */
+    ARRAY_INIT(A[i]);
+    ARRAY_INIT(B[i]);
+    ARRAY_INIT(C[i]);
+    CLBT_CHECK(ctx->err, CLBlastDgemm(convO(order), convT(transA),
+                                      convT(transB), M, N, K,
+                                      alpha, A[i]->buf, offA[i], lda,
+                                      B[i]->buf, offB[i], ldb, beta,
+                                      C[i]->buf, offC[i], ldc, &ctx->q, &ev));
+    ARRAY_FINI(A[i]);
+    ARRAY_FINI(B[i]);
+    ARRAY_FINI(C[i]);
+    clReleaseEvent(ev);
+  }
+
+  return GA_NO_ERROR;
+}
+/* Run CLBlastHdot on X, Y and Z through the queue of X's context. */
+static int hdot(
+        size_t N,
+        gpudata *X, size_t offX, size_t incX,
+        gpudata *Y, size_t offY, size_t incY,
+        gpudata *Z, size_t offZ) {
+  cl_ctx *ctx = X->ctx;
+  cl_event ev;  /* passed to CLBlastHdot; ARRAY_FINI refers to it by name */
+
+  ARRAY_INIT(X);  /* wait for any event already attached to each buffer */
+  ARRAY_INIT(Y);
+  ARRAY_INIT(Z);
+
+  CLBT_CHECK(ctx->err, CLBlastHdot(N, Z->buf, offZ, X->buf, offX, incX,
+                                   Y->buf, offY, incY, &ctx->q, &ev));
+
+  ARRAY_FINI(X);  /* attach ev to each buffer (one retain apiece) */
+  ARRAY_FINI(Y);
+  ARRAY_FINI(Z);
+
+  clReleaseEvent(ev);  /* give up this function's own reference */
+
+  return GA_NO_ERROR;
+}
+/* Run CLBlastSdot on X, Y and Z through the queue of X's context. */
+static int sdot(
+        size_t N,
+        gpudata *X, size_t offX, size_t incX,
+        gpudata *Y, size_t offY, size_t incY,
+        gpudata *Z, size_t offZ) {
+  cl_ctx *ctx = X->ctx;
+  cl_event ev;  /* passed to CLBlastSdot; ARRAY_FINI refers to it by name */
+
+  ARRAY_INIT(X);  /* wait for any event already attached to each buffer */
+  ARRAY_INIT(Y);
+  ARRAY_INIT(Z);
+
+  CLBT_CHECK(ctx->err, CLBlastSdot(N, Z->buf, offZ, X->buf, offX, incX,
+                                   Y->buf, offY, incY, &ctx->q, &ev));
+
+  ARRAY_FINI(X);  /* attach ev to each buffer (one retain apiece) */
+  ARRAY_FINI(Y);
+  ARRAY_FINI(Z);
+
+  clReleaseEvent(ev);  /* give up this function's own reference */
+
+  return GA_NO_ERROR;
+}
+/* Run CLBlastDdot on X, Y and Z through the queue of X's context. */
+static int ddot(
+        size_t N,
+        gpudata *X, size_t offX, size_t incX,
+        gpudata *Y, size_t offY, size_t incY,
+        gpudata *Z, size_t offZ) {
+  cl_ctx *ctx = X->ctx;
+  cl_event ev;  /* passed to CLBlastDdot; ARRAY_FINI refers to it by name */
+
+  ARRAY_INIT(X);  /* wait for any event already attached to each buffer */
+  ARRAY_INIT(Y);
+  ARRAY_INIT(Z);
+
+  CLBT_CHECK(ctx->err, CLBlastDdot(N, Z->buf, offZ, X->buf, offX, incX,
+                                   Y->buf, offY, incY, &ctx->q, &ev));
+
+  ARRAY_FINI(X);  /* attach ev to each buffer (one retain apiece) */
+  ARRAY_FINI(Y);
+  ARRAY_FINI(Z);
+
+  clReleaseEvent(ev);  /* give up this function's own reference */
+
+  return GA_NO_ERROR;
+}
+/* Run CLBlastHgemv on A, X and Y through the queue of A's context. */
+static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
+                 float alpha, gpudata *A, size_t offA, size_t lda,
+                 gpudata *X, size_t offX, int incX, float beta,
+                 gpudata *Y, size_t offY, int incY) {
+  cl_ctx *ctx = A->ctx;
+  cl_event ev;  /* passed to CLBlastHgemv; ARRAY_FINI refers to it by name */
+
+  ARRAY_INIT(A);  /* wait for any event already attached to each buffer */
+  ARRAY_INIT(X);
+  ARRAY_INIT(Y);
+
+  CLBT_CHECK(ctx->err, CLBlastHgemv(convO(order), convT(transA), M, N,
+                                    float_to_half(alpha),
+                                    A->buf, offA, lda, X->buf, offX, incX,
+                                    float_to_half(beta),
+                                    Y->buf, offY, incY, &ctx->q, &ev));
+
+  ARRAY_FINI(A);  /* attach ev to each buffer (one retain apiece) */
+  ARRAY_FINI(X);
+  ARRAY_FINI(Y);
+
+  clReleaseEvent(ev);  /* give up this function's own reference */
+
+  return GA_NO_ERROR;
+}
+/* Run CLBlastSgemv on A, X and Y through the queue of A's context. */
+static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
+                 float alpha, gpudata *A, size_t offA, size_t lda,
+                 gpudata *X, size_t offX, int incX, float beta,
+                 gpudata *Y, size_t offY, int incY) {
+  cl_ctx *ctx = A->ctx;
+  cl_event ev;  /* passed to CLBlastSgemv; ARRAY_FINI refers to it by name */
+
+  ARRAY_INIT(A);  /* wait for any event already attached to each buffer */
+  ARRAY_INIT(X);
+  ARRAY_INIT(Y);
+
+  CLBT_CHECK(ctx->err, CLBlastSgemv(convO(order), convT(transA), M, N, alpha,
+                                    A->buf, offA, lda, X->buf, offX, incX,
+                                    beta, Y->buf, offY, incY, &ctx->q, &ev));
+
+  ARRAY_FINI(A);  /* attach ev to each buffer (one retain apiece) */
+  ARRAY_FINI(X);
+  ARRAY_FINI(Y);
+
+  clReleaseEvent(ev);  /* give up this function's own reference */
+
+  return GA_NO_ERROR;
+}
+/* Run CLBlastDgemv on A, X and Y through the queue of A's context. */
+static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
+                 double alpha, gpudata *A, size_t offA, size_t lda,
+                 gpudata *X, size_t offX, int incX, double beta,
+                 gpudata *Y, size_t offY, int incY) {
+  cl_ctx *ctx = A->ctx;
+  cl_event ev;  /* passed to CLBlastDgemv; ARRAY_FINI refers to it by name */
+
+  ARRAY_INIT(A);  /* wait for any event already attached to each buffer */
+  ARRAY_INIT(X);
+  ARRAY_INIT(Y);
+
+  CLBT_CHECK(ctx->err, CLBlastDgemv(convO(order), convT(transA), M, N, alpha,
+                                    A->buf, offA, lda, X->buf, offX, incX,
+                                    beta, Y->buf, offY, incY, &ctx->q, &ev));
+
+  ARRAY_FINI(A);  /* attach ev to each buffer (one retain apiece) */
+  ARRAY_FINI(X);
+  ARRAY_FINI(Y);
+
+  clReleaseEvent(ev);  /* give up this function's own reference */
+
+  return GA_NO_ERROR;
+}
+/* Run CLBlastHgemm on A, B and C through the queue of A's context. */
+static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB,
+                 size_t M, size_t N, size_t K, float alpha,
+                 gpudata *A, size_t offA, size_t lda,
+                 gpudata *B, size_t offB, size_t ldb, float beta,
+                 gpudata *C, size_t offC, size_t ldc) {
+  cl_ctx *ctx = A->ctx;
+  cl_event ev;  /* passed to CLBlastHgemm; ARRAY_FINI refers to it by name */
+
+  ARRAY_INIT(A);  /* wait for any event already attached to each buffer */
+  ARRAY_INIT(B);
+  ARRAY_INIT(C);
+
+  CLBT_CHECK(ctx->err, CLBlastHgemm(convO(order), convT(transA), convT(transB),
+                                    M, N, K, float_to_half(alpha),
+                                    A->buf, offA, lda, B->buf, offB, ldb,
+                                    float_to_half(beta), C->buf, offC, ldc,
+                                    &ctx->q, &ev));
+
+  ARRAY_FINI(A);  /* attach ev to each buffer (one retain apiece) */
+  ARRAY_FINI(B);
+  ARRAY_FINI(C);
+
+  clReleaseEvent(ev);  /* give up this function's own reference */
+
+  return GA_NO_ERROR;
+}
+/* Run CLBlastSgemm on A, B and C through the queue of A's context. */
+static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB,
+                 size_t M, size_t N, size_t K, float alpha,
+                 gpudata *A, size_t offA, size_t lda,
+                 gpudata *B, size_t offB, size_t ldb, float beta,
+                 gpudata *C, size_t offC, size_t ldc) {
+  cl_ctx *ctx = A->ctx;
+  cl_event ev;  /* passed to CLBlastSgemm; ARRAY_FINI refers to it by name */
+
+  ARRAY_INIT(A);  /* wait for any event already attached to each buffer */
+  ARRAY_INIT(B);
+  ARRAY_INIT(C);
+
+  CLBT_CHECK(ctx->err, CLBlastSgemm(convO(order), convT(transA), convT(transB),
+                                    M, N, K, alpha,
+                                    A->buf, offA, lda, B->buf, offB, ldb,
+                                    beta, C->buf, offC, ldc, &ctx->q, &ev));
+
+  ARRAY_FINI(A);  /* attach ev to each buffer (one retain apiece) */
+  ARRAY_FINI(B);
+  ARRAY_FINI(C);
+
+  clReleaseEvent(ev);  /* give up this function's own reference */
+
+  return GA_NO_ERROR;
+}
+/* Run CLBlastDgemm on A, B and C through the queue of A's context. */
+static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
+                 size_t M, size_t N, size_t K, double alpha,
+                 gpudata *A, size_t offA, size_t lda,
+                 gpudata *B, size_t offB, size_t ldb, double beta,
+                 gpudata *C, size_t offC, size_t ldc) {
+  cl_ctx *ctx = A->ctx;
+  cl_event ev;  /* passed to CLBlastDgemm; ARRAY_FINI refers to it by name */
+
+  ARRAY_INIT(A);  /* wait for any event already attached to each buffer */
+  ARRAY_INIT(B);
+  ARRAY_INIT(C);
+
+  CLBT_CHECK(ctx->err, CLBlastDgemm(convO(order), convT(transA), convT(transB),
+                                    M, N, K, alpha,
+                                    A->buf, offA, lda, B->buf, offB, ldb,
+                                    beta, C->buf, offC, ldc, &ctx->q, &ev));
+
+  ARRAY_FINI(A);  /* attach ev to each buffer (one retain apiece) */
+  ARRAY_FINI(B);
+  ARRAY_FINI(C);
+
+  clReleaseEvent(ev);  /* give up this function's own reference */
+
+  return GA_NO_ERROR;
+}
+/* Run CLBlastHger on X, Y and A through the queue of X's context. */
+static int hger(cb_order order, size_t M, size_t N, float alpha,
+                gpudata *X, size_t offX, int incX,
+                gpudata *Y, size_t offY, int incY,
+                gpudata *A, size_t offA, size_t lda) {
+  cl_ctx *ctx = X->ctx;
+  cl_event ev;  /* passed to CLBlastHger; ARRAY_FINI refers to it by name */
+
+  ARRAY_INIT(X);  /* wait for any event already attached to each buffer */
+  ARRAY_INIT(Y);
+  ARRAY_INIT(A);
+
+  CLBT_CHECK(ctx->err, CLBlastHger(convO(order), M, N, float_to_half(alpha),
+                                   X->buf, offX, incX, Y->buf, offY, incY,
+                                   A->buf, offA, lda, &ctx->q, &ev));
+
+  ARRAY_FINI(X);  /* attach ev to each buffer (one retain apiece) */
+  ARRAY_FINI(Y);
+  ARRAY_FINI(A);
+
+  clReleaseEvent(ev);  /* give up this function's own reference */
+
+  return GA_NO_ERROR;
+}
+/* Run CLBlastSger on X, Y and A through the queue of X's context. */
+static int sger(cb_order order, size_t M, size_t N, float alpha,
+                gpudata *X, size_t offX, int incX,
+                gpudata *Y, size_t offY, int incY,
+                gpudata *A, size_t offA, size_t lda) {
+  cl_ctx *ctx = X->ctx;
+  cl_event ev;  /* passed to CLBlastSger; ARRAY_FINI refers to it by name */
+
+  ARRAY_INIT(X);  /* wait for any event already attached to each buffer */
+  ARRAY_INIT(Y);
+  ARRAY_INIT(A);
+
+  CLBT_CHECK(ctx->err, CLBlastSger(convO(order), M, N, alpha,
+                                   X->buf, offX, incX, Y->buf, offY, incY,
+                                   A->buf, offA, lda, &ctx->q, &ev));
+
+  ARRAY_FINI(X);  /* attach ev to each buffer (one retain apiece) */
+  ARRAY_FINI(Y);
+  ARRAY_FINI(A);
+
+  clReleaseEvent(ev);  /* give up this function's own reference */
+
+  return GA_NO_ERROR;
+}
+/* Run CLBlastDger on X, Y and A through the queue of X's context. */
+static int dger(cb_order order, size_t M, size_t N, double alpha,
+                gpudata *X, size_t offX, int incX,
+                gpudata *Y, size_t offY, int incY,
+                gpudata *A, size_t offA, size_t lda) {
+  cl_ctx *ctx = X->ctx;
+  cl_event ev;  /* passed to CLBlastDger; ARRAY_FINI refers to it by name */
+
+  ARRAY_INIT(X);  /* wait for any event already attached to each buffer */
+  ARRAY_INIT(Y);
+  ARRAY_INIT(A);
+
+  CLBT_CHECK(ctx->err, CLBlastDger(convO(order), M, N, alpha,
+                                   X->buf, offX, incX, Y->buf, offY, incY,
+                                   A->buf, offA, lda, &ctx->q, &ev));
+
+  ARRAY_FINI(X);  /* attach ev to each buffer (one retain apiece) */
+  ARRAY_FINI(Y);
+  ARRAY_FINI(A);
+
+  clReleaseEvent(ev);  /* give up this function's own reference */
+
+  return GA_NO_ERROR;
+}
+/* CLBlast operations table (positional); NULL marks an op with no wrapper. */
+gpuarray_blas_ops clblast_ops = {
+  setup,
+  teardown,
+  hdot,
+  sdot,
+  ddot,
+  hgemv,
+  sgemv,
+  dgemv,
+  hgemm,
+  sgemm,
+  dgemm,
+  hger,
+  sger,
+  dger,
+  hgemmBatch,
+  sgemmBatch,
+  dgemmBatch,
+  NULL, /* hgemvBatch */
+  NULL, /* sgemvBatch */
+  NULL, /* dgemvBatch */
+  NULL, /* hgerBatch */
+  NULL, /* sgerBatch */
+  NULL, /* dgerBatch */
+  NULL, /* hgemm3D */
+  NULL, /* sgemm3D */
+  NULL, /* dgemm3D */
+};
diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c
index c8fb2008de..9eb9ca0ce4 100644
--- a/src/gpuarray_buffer.c
+++ b/src/gpuarray_buffer.c
@@ -6,31 +6,24 @@
 #include "gpuarray/buffer_collectives.h"
 #include "gpuarray/error.h"
 
+#include "util/error.h"
 #include "private.h"
 
-#ifdef WITH_CUDA
 extern const gpuarray_buffer_ops cuda_ops;
-#endif
-#ifdef WITH_OPENCL
 extern const gpuarray_buffer_ops opencl_ops;
-#endif
 
 const gpuarray_buffer_ops *gpuarray_get_ops(const char *name) {
-#ifdef WITH_CUDA
   if (strcmp("cuda", name) == 0) return &cuda_ops;
-#endif
-#ifdef WITH_OPENCL
   if (strcmp("opencl", name) == 0) return &opencl_ops;
-#endif
   return NULL;
 }
 
-#define FAIL(v, e) { if (ret) *ret = e; return v; }
+#define FAIL(v, e) { if (ret) *ret = (e)->code; return v; }
 
 int gpu_get_platform_count(const char* name, unsigned int* platcount) {
   const gpuarray_buffer_ops* ops = gpuarray_get_ops(name);
   if (ops == NULL) {
-    return GA_INVALID_ERROR;
+    return error_set(global_err, GA_INVALID_ERROR, "Invalid platform");
   }
   return ops->get_platform_count(platcount);
 }
@@ -39,27 +32,91 @@ int gpu_get_device_count(const char* name, unsigned int platform,
                          unsigned int* devcount) {
   const gpuarray_buffer_ops* ops = gpuarray_get_ops(name);
   if (ops == NULL) {
-    return GA_INVALID_ERROR;
+    return error_set(global_err, GA_INVALID_ERROR, "Invalid platform");
   }
   return ops->get_device_count(platform, devcount);
 }
 
-gpucontext *gpucontext_init(const char *name, int dev, int flags, int *ret) {
-  gpucontext *res;
+int gpucontext_props_new(gpucontext_props **res) { /* allocate a props object filled with defaults; stores it in *res */
+  gpucontext_props *r = calloc(1, sizeof(gpucontext_props));
+  if (r == NULL) return error_sys(global_err, "calloc"); /* *res is left untouched on failure */
+  r->dev = -1; /* no device selected yet */
+  r->sched = GA_CTX_SCHED_AUTO;
+  r->flags = 0;
+  r->kernel_cache_path = NULL;
+  r->initial_cache_size = 0;
+  r->max_cache_size = (size_t)-1; /* largest size_t value */
+  *res = r;
+  return GA_NO_ERROR;
+}
+
+int gpucontext_props_cuda_dev(gpucontext_props *p, int devno) { /* record the device number; no validation is done here */
+  p->dev = devno;
+  return GA_NO_ERROR;
+}
+
+int gpucontext_props_opencl_dev(gpucontext_props *p, int platno, int devno) { /* pack platform and device into the single dev field */
+  p->dev = (platno << 16) | devno; /* platform shifted above bit 16, device in the low bits; assumes 0 <= devno < 65536 - TODO confirm callers */
+  return GA_NO_ERROR;
+}
+
+int gpucontext_props_sched(gpucontext_props *p, int sched) { /* set the scheduling mode and keep GA_CTX_MULTI_THREAD in sync with it */
+  switch (sched) {
+  case GA_CTX_SCHED_MULTI:
+  case GA_CTX_SCHED_AUTO:
+  case GA_CTX_SCHED_SINGLE:
+    p->sched = sched;
+    break;
+  default: /* any other value is rejected and p is left unmodified */
+    return error_fmt(global_err, GA_INVALID_ERROR, "Invalid value for sched: %d", sched);
+  }
+
+  if (sched == GA_CTX_SCHED_MULTI) /* the flag is set only for MULTI and cleared for AUTO and SINGLE */
+    FLSET(p->flags, GA_CTX_MULTI_THREAD);
+  else
+    FLCLR(p->flags, GA_CTX_MULTI_THREAD);
+
+  return GA_NO_ERROR;
+}
+
+int gpucontext_props_set_single_stream(gpucontext_props *p) { /* turn on GA_CTX_SINGLE_STREAM; other flags are preserved */
+  p->flags |= GA_CTX_SINGLE_STREAM;
+  return GA_NO_ERROR;
+}
+
+int gpucontext_props_kernel_cache(gpucontext_props *p, const char *path) { /* record the kernel cache path */
+  p->kernel_cache_path = path; /* the pointer is stored, not copied: the string must stay valid while the props are in use */
+  return GA_NO_ERROR;
+}
+
+int gpucontext_props_alloc_cache(gpucontext_props *p, size_t initial, size_t max) { /* set the initial and maximum allocation cache sizes */
+  if (initial > max) /* rejected without modifying p */
+    return error_set(global_err, GA_VALUE_ERROR, "Initial size can't be bigger than max size");
+  p->initial_cache_size = initial;
+  p->max_cache_size = max;
+  return GA_NO_ERROR;
+}
+
+void gpucontext_props_del(gpucontext_props *p) { /* free a props object; NULL is accepted since free(NULL) is a no-op */
+  free(p); /* kernel_cache_path is not freed here */
+}
+
+int gpucontext_init(gpucontext **res, const char *name, gpucontext_props *p) { /* create a context for backend `name`; p (may be NULL for defaults) is always freed here */
   const gpuarray_buffer_ops *ops = gpuarray_get_ops(name);
-  if (ops == NULL)
-    FAIL(NULL, GA_INVALID_ERROR);
-  res = ops->buffer_init(dev, flags, ret);
-  if (res == NULL)
-    return NULL;
-  res->ops = ops;
-  if (gpucontext_property(res, GA_CTX_PROP_BLAS_OPS, &res->blas_ops) != GA_NO_ERROR)
-    res->blas_ops = NULL;
-  res->blas_handle = NULL;
-  if (gpucontext_property(res, GA_CTX_PROP_COMM_OPS, &res->comm_ops) != GA_NO_ERROR)
-    res->comm_ops = NULL;
-  res->extcopy_cache = NULL;
-  return res;
+  gpucontext *r;
+  if (ops == NULL) { /* gpuarray_get_ops() does not record an error itself, so set one here */
+    gpucontext_props_del(p);
+    return error_set(global_err, GA_INVALID_ERROR, "Invalid platform");
+  }
+  if (p == NULL && gpucontext_props_new(&p) != GA_NO_ERROR) /* no props given: build defaults */
+    return global_err->code;
+  r = ops->buffer_init(p);
+  gpucontext_props_del(p); /* props are released whether or not buffer_init succeeded */
+  if (r == NULL) return global_err->code; /* *res is left untouched on failure */
+  r->ops = ops;
+  r->extcopy_cache = NULL;
+  *res = r;
+  return GA_NO_ERROR;
 }
 
 void gpucontext_deref(gpucontext *ctx) {
@@ -77,22 +134,17 @@ int gpucontext_property(gpucontext *ctx, int prop_id, void *res) {
 }
 
 const char *gpucontext_error(gpucontext *ctx, int err) {
-  if (ctx != NULL) {
-    switch (err) {
-    case GA_IMPL_ERROR:
-      return ctx->ops->ctx_error(ctx);
-    case GA_BLAS_ERROR:
-      return gpublas_error(ctx);
-    case GA_COMM_ERROR:
-      return gpucomm_error(ctx);
-    }
-  }
-  return gpuarray_error_str(err);
+  if (ctx == NULL)
+    return global_err->msg;
+  else
+    return ctx->ops->ctx_error(ctx);
 }
 
 gpudata *gpudata_alloc(gpucontext *ctx, size_t sz, void *data, int flags,
                        int *ret) {
-  return ctx->ops->buffer_alloc(ctx, sz, data, flags, ret);
+  gpudata *res = ctx->ops->buffer_alloc(ctx, sz, data, flags);
+  if (res == NULL && ret) *ret = ctx->err->code;
+  return res;
 }
 
 void gpudata_retain(gpudata *b) {
@@ -100,11 +152,15 @@ void gpudata_retain(gpudata *b) {
 }
 
 void gpudata_release(gpudata *b) {
-  ((partial_gpudata *)b)->ctx->ops->buffer_release(b);
+  if (b)
+    ((partial_gpudata *)b)->ctx->ops->buffer_release(b);
 }
 
 int gpudata_share(gpudata *a, gpudata *b, int *ret) {
-  return ((partial_gpudata *)a)->ctx->ops->buffer_share(a, b, ret);
+  int res = ((partial_gpudata *)a)->ctx->ops->buffer_share(a, b);
+  if (res == -1 && ret)
+    *ret = ((partial_gpudata *)a)->ctx->err->code;
+  return res;
 }
 
 int gpudata_move(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff,
@@ -131,8 +187,10 @@ int gpudata_transfer(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff,
 
   /* Fallback to host copy */
   tmp = malloc(sz);
-  if (tmp == NULL)
-    return GA_MEMORY_ERROR;
+  if (tmp == NULL) {
+    error_sys(src_ctx->err, "malloc");
+    return error_sys(dst_ctx->err, "malloc");
+  }
   res = src_ctx->ops->buffer_read(tmp, src, srcoff, sz);
   if (res != GA_NO_ERROR) {
     free(tmp);
@@ -170,8 +228,13 @@ gpukernel *gpukernel_init(gpucontext *ctx, unsigned int count,
                           const char *fname, unsigned int numargs,
                           const int *typecodes, int flags, int *ret,
                           char **err_str) {
-  return ctx->ops->kernel_alloc(ctx, count, strings, lengths, fname, numargs,
-                                typecodes, flags, ret, err_str);
+  gpukernel *res = NULL;
+  int err;
+  err = ctx->ops->kernel_alloc(&res, ctx, count, strings, lengths, fname,
+                               numargs, typecodes, flags, err_str);
+  if (err != GA_NO_ERROR && ret != NULL)
+    *ret = ctx->err->code;
+  return res;
 }
 
 void gpukernel_retain(gpukernel *k) {
@@ -186,16 +249,12 @@ int gpukernel_setarg(gpukernel *k, unsigned int i, void *a) {
   return ((partial_gpukernel *)k)->ctx->ops->kernel_setarg(k, i, a);
 }
 
-int gpukernel_call(gpukernel *k, unsigned int n, const size_t *ls,
-                   const size_t *gs, size_t shared, void **args) {
-  return ((partial_gpukernel *)k)->ctx->ops->kernel_call(k, n, ls, gs,
+int gpukernel_call(gpukernel *k, unsigned int n, const size_t *gs,
+                   const size_t *ls, size_t shared, void **args) {
+  return ((partial_gpukernel *)k)->ctx->ops->kernel_call(k, n, gs, ls,
                                                          shared, args);
 }
 
-int gpukernel_binary(gpukernel *k, size_t *sz, void **obj) {
-  return ((partial_gpukernel *)k)->ctx->ops->kernel_binary(k, sz, obj);
-}
-
 int gpukernel_property(gpukernel *k, int prop_id, void *res) {
   return ((partial_gpukernel *)k)->ctx->ops->property(NULL, NULL, k, prop_id,
                                                       res);
diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c
index 417027e850..bbe4b96039 100644
--- a/src/gpuarray_buffer_blas.c
+++ b/src/gpuarray_buffer_blas.c
@@ -4,19 +4,57 @@
 
 int gpublas_setup(gpucontext *ctx) {
   if (ctx->blas_ops == NULL)
-    return GA_UNSUPPORTED_ERROR;
+    return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Missing Blas library");
   return ctx->blas_ops->setup(ctx);
 }
 
 void gpublas_teardown(gpucontext *ctx) {
   if (ctx->blas_ops != NULL)
-    return ctx->blas_ops->teardown(ctx);
+    ctx->blas_ops->teardown(ctx);
 }
 
 const char *gpublas_error(gpucontext *ctx) {
-  if (ctx->blas_ops != NULL)
-    return ctx->blas_ops->error(ctx);
-  return "No blas ops available, API error.";
+  return ctx->err->msg;
+}
+
+#define BLAS_OP(buf, name, args) /* body of a gpublas_* wrapper: dispatch to blas_ops->name or report it unsupported */ \
+  gpucontext *ctx = gpudata_context(buf); /* blas_ops may be NULL, as checked in gpublas_setup() */ \
+  if (ctx->blas_ops != NULL && ctx->blas_ops->name)                     \
+    return ctx->blas_ops->name args;                                    \
+  else                                                                  \
+    return error_fmt(ctx->err, GA_DEVSUP_ERROR, "Blas operation not supported by device or missing library: %s", #name)
+
+#define BLAS_OPF(buf, name, args) /* like BLAS_OP, but first rejects a non-zero `flags` argument of the enclosing function */ \
+  gpucontext *ctx = gpudata_context(buf); /* blas_ops may be NULL, as checked in gpublas_setup() */ \
+  if (flags != 0) return error_set(ctx->err, GA_INVALID_ERROR, "flags is not 0"); \
+  if (ctx->blas_ops != NULL && ctx->blas_ops->name)                     \
+    return ctx->blas_ops->name args;                                    \
+  else                                                                  \
+    return error_fmt(ctx->err, GA_DEVSUP_ERROR, "Blas operation not supported by device or missing library: %s", #name)
+
+
+int gpublas_hdot(
+        size_t N,
+        gpudata *X, size_t offX, size_t incX,
+        gpudata *Y, size_t offY, size_t incY,
+        gpudata *Z, size_t offZ) {
+  BLAS_OP(X, hdot, (N, X, offX, incX, Y, offY, incY, Z, offZ));
+}
+
+int gpublas_sdot(
+        size_t N,
+        gpudata *X, size_t offX, size_t incX,
+        gpudata *Y, size_t offY, size_t incY,
+        gpudata *Z, size_t offZ) {
+  BLAS_OP(X, sdot, (N, X, offX, incX, Y, offY, incY, Z, offZ));
+}
+
+int gpublas_ddot(
+        size_t N,
+        gpudata *X, size_t offX, size_t incX,
+        gpudata *Y, size_t offY, size_t incY,
+        gpudata *Z, size_t offZ) {
+  BLAS_OP(X, ddot, (N, X, offX, incX, Y, offY, incY, Z, offZ));
 }
 
 int gpublas_hgemv(cb_order order, cb_transpose transA,
@@ -25,9 +63,8 @@ int gpublas_hgemv(cb_order order, cb_transpose transA,
                   gpudata *X, size_t offX, int incX,
                   float beta,
                   gpudata *Y, size_t offY, int incY) {
-  return gpudata_context(A)->blas_ops->hgemv(
-    order, transA, M, N, alpha, A, offA, lda,
-    X, offX, incX, beta, Y, offY, incY);
+  BLAS_OP(A, hgemv, (order, transA, M, N, alpha, A, offA, lda,
+                     X, offX, incX, beta, Y, offY, incY));
 }
 
 int gpublas_sgemv(cb_order order, cb_transpose transA,
@@ -36,9 +73,8 @@ int gpublas_sgemv(cb_order order, cb_transpose transA,
                   gpudata *X, size_t offX, int incX,
                   float beta,
                   gpudata *Y, size_t offY, int incY) {
-  return gpudata_context(A)->blas_ops->sgemv(
-    order, transA, M, N, alpha, A, offA, lda,
-    X, offX, incX, beta, Y, offY, incY);
+  BLAS_OP(A, sgemv, (order, transA, M, N, alpha, A, offA, lda,
+                     X, offX, incX, beta, Y, offY, incY));
 }
 
 int gpublas_dgemv(cb_order order, cb_transpose transA,
@@ -47,9 +83,8 @@ int gpublas_dgemv(cb_order order, cb_transpose transA,
                   gpudata *X, size_t offX, int incX,
                   double beta,
                   gpudata *Y, size_t offY, int incY) {
-  return gpudata_context(A)->blas_ops->dgemv(
-    order, transA, M, N, alpha, A, offA, lda,
-    X, offX, incX, beta, Y, offY, incY);
+  BLAS_OP(A, dgemv, (order, transA, M, N, alpha, A, offA, lda,
+                     X, offX, incX, beta, Y, offY, incY));
 }
 
 int gpublas_hgemm(cb_order order, cb_transpose transA, cb_transpose transB,
@@ -57,9 +92,8 @@ int gpublas_hgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                   gpudata *A, size_t offA, size_t lda,
                   gpudata *B, size_t offB, size_t ldb,
                   float beta, gpudata *C, size_t offC, size_t ldc) {
-  return gpudata_context(A)->blas_ops->hgemm(
-    order, transA, transB, M, N, K, alpha, A, offA, lda,
-    B, offB, ldb, beta, C, offC, ldc);
+  BLAS_OP(A, hgemm, (order, transA, transB, M, N, K, alpha, A, offA, lda,
+                     B, offB, ldb, beta, C, offC, ldc));
 }
 
 int gpublas_sgemm(cb_order order, cb_transpose transA, cb_transpose transB,
@@ -67,9 +101,8 @@ int gpublas_sgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                   gpudata *A, size_t offA, size_t lda,
                   gpudata *B, size_t offB, size_t ldb,
                   float beta, gpudata *C, size_t offC, size_t ldc) {
-  return gpudata_context(A)->blas_ops->sgemm(
-    order, transA, transB, M, N, K, alpha, A, offA, lda,
-    B, offB, ldb, beta, C, offC, ldc);
+  BLAS_OP(A, sgemm, (order, transA, transB, M, N, K, alpha, A, offA, lda,
+                     B, offB, ldb, beta, C, offC, ldc));
 }
 
 int gpublas_dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
@@ -77,47 +110,63 @@ int gpublas_dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                   gpudata *A, size_t offA, size_t lda,
                   gpudata *B, size_t offB, size_t ldb,
                   double beta, gpudata *C, size_t offC, size_t ldc) {
-  return gpudata_context(A)->blas_ops->dgemm(
-    order, transA, transB, M, N, K, alpha, A, offA, lda,
-    B, offB, ldb, beta, C, offC, ldc);
+  BLAS_OP(A, dgemm, (order, transA, transB, M, N, K, alpha, A, offA, lda,
+                     B, offB, ldb, beta, C, offC, ldc));
 }
 
 int gpublas_hger(cb_order order, size_t M, size_t N, float alpha,
                  gpudata *X, size_t offX, int incX,
                  gpudata *Y, size_t offY, int incY,
                  gpudata *A, size_t offA, size_t lda) {
-  return gpudata_context(X)->blas_ops->hger(
-    order, M, N, alpha, X, offX, incX, Y, offY, incY, A, offA, lda);
+  BLAS_OP(X, hger,
+          (order, M, N, alpha, X, offX, incX, Y, offY, incY, A, offA, lda));
 }
 
 int gpublas_sger(cb_order order, size_t M, size_t N, float alpha,
                  gpudata *X, size_t offX, int incX,
                  gpudata *Y, size_t offY, int incY,
                  gpudata *A, size_t offA, size_t lda) {
-  return gpudata_context(X)->blas_ops->sger(
-    order, M, N, alpha, X, offX, incX, Y, offY, incY, A, offA, lda);
+  BLAS_OP(X, sger,
+          (order, M, N, alpha, X, offX, incX, Y, offY, incY, A, offA, lda));
 }
 
 int gpublas_dger(cb_order order, size_t M, size_t N, double alpha,
                  gpudata *X, size_t offX, int incX,
                  gpudata *Y, size_t offY, int incY,
                  gpudata *A, size_t offA, size_t lda) {
-  return gpudata_context(X)->blas_ops->dger(
-    order, M, N, alpha, X, offX, incX, Y, offY, incY, A, offA, lda);
+  BLAS_OP(X, dger,
+          (order, M, N, alpha, X, offX, incX, Y, offY, incY, A, offA, lda));
 }
 
+#define BLAS_OPB(l, name, args) /* batched wrapper body: context comes from the first buffer of array l */ \
+  gpucontext *ctx;                                                      \
+  if (batchCount == 0) return GA_NO_ERROR; /* checked first so l[0] is never read for an empty batch */ \
+  ctx = gpudata_context(l[0]); /* blas_ops may be NULL, as checked in gpublas_setup() */ \
+  if (ctx->blas_ops != NULL && ctx->blas_ops->name)                     \
+    return ctx->blas_ops->name args;                                    \
+  else                                                                  \
+    return error_fmt(ctx->err, GA_DEVSUP_ERROR, "Blas operation not supported by library in use: %s", #name)
+
+#define BLAS_OPBF(l, name, args) /* like BLAS_OPB, but also rejects a non-zero `flags` argument of the enclosing function */ \
+  gpucontext *ctx;                                                      \
+  if (batchCount == 0) return GA_NO_ERROR; /* checked first so l[0] is never read; flags are not validated for an empty batch */ \
+  ctx = gpudata_context(l[0]); /* blas_ops may be NULL, as checked in gpublas_setup() */ \
+  if (flags != 0) return error_set(ctx->err, GA_INVALID_ERROR, "flags is not 0"); \
+  if (ctx->blas_ops != NULL && ctx->blas_ops->name)                     \
+    return ctx->blas_ops->name args;                                    \
+  else                                                                  \
+    return error_fmt(ctx->err, GA_DEVSUP_ERROR, "Blas operation not supported by library in use: %s", #name)
+
 int gpublas_hgemmBatch(
-  cb_order order, cb_transpose transA, cb_transpose transB,
-  size_t M, size_t N, size_t K, float alpha,
-  gpudata **A, size_t *offA, size_t lda,
-  gpudata **B, size_t *offB, size_t ldb,
-  float beta, gpudata **C, size_t *offC, size_t ldc,
-  size_t batchCount, int flags) {
-  if (flags != 0) return GA_INVALID_ERROR;
-  if (batchCount == 0) return GA_NO_ERROR;
-  return gpudata_context(A[0])->blas_ops->hgemmBatch(
-    order, transA, transB, M, N, K, alpha, A, offA, lda,
-    B, offB, ldb, beta, C, offC, ldc, batchCount);
+    cb_order order, cb_transpose transA, cb_transpose transB,
+    size_t M, size_t N, size_t K, float alpha,
+    gpudata **A, size_t *offA, size_t lda,
+    gpudata **B, size_t *offB, size_t ldb,
+    float beta, gpudata **C, size_t *offC, size_t ldc,
+    size_t batchCount, int flags) {
+  BLAS_OPBF(A, hgemmBatch,
+            (order, transA, transB, M, N, K, alpha, A, offA, lda,
+             B, offB, ldb, beta, C, offC, ldc, batchCount));
 }
 
 int gpublas_sgemmBatch(
@@ -127,11 +176,9 @@ int gpublas_sgemmBatch(
   gpudata **B, size_t *offB, size_t ldb,
   float beta, gpudata **C, size_t *offC, size_t ldc,
   size_t batchCount, int flags) {
-  if (flags != 0) return GA_INVALID_ERROR;
-  if (batchCount == 0) return GA_NO_ERROR;
-  return gpudata_context(A[0])->blas_ops->sgemmBatch(
-    order, transA, transB, M, N, K, alpha, A, offA, lda,
-    B, offB, ldb, beta, C, offC, ldc, batchCount);
+  BLAS_OPBF(A, sgemmBatch,
+            (order, transA, transB, M, N, K, alpha, A, offA, lda,
+             B, offB, ldb, beta, C, offC, ldc, batchCount));
 }
 
 int gpublas_dgemmBatch(
@@ -141,11 +188,9 @@ int gpublas_dgemmBatch(
   gpudata **B, size_t *offB, size_t ldb,
   double beta, gpudata **C, size_t *offC, size_t ldc,
   size_t batchCount, int flags) {
-  if (flags != 0) return GA_INVALID_ERROR;
-  if (batchCount == 0) return GA_NO_ERROR;
-  return gpudata_context(A[0])->blas_ops->dgemmBatch(
-    order, transA, transB, M, N, K, alpha, A, offA, lda,
-    B, offB, ldb, beta, C, offC, ldc, batchCount);
+  BLAS_OPBF(A, dgemmBatch,
+            (order, transA, transB, M, N, K, alpha, A, offA, lda,
+             B, offB, ldb, beta, C, offC, ldc, batchCount));
 }
 
 int gpublas_hgemvBatch(
@@ -155,10 +200,9 @@ int gpublas_hgemvBatch(
   gpudata **x, size_t *offX, size_t incX,
   float beta, gpudata **y, size_t *offY, size_t incY,
   size_t batchCount, int flags) {
-  if (batchCount == 0) return GA_NO_ERROR;
-  return gpudata_context(A[0])->blas_ops->hgemvBatch(
-    order, transA, M, N, alpha, A, offA, lda, x, offX, incX,
-    beta, y, offY, incY, batchCount, flags);
+  BLAS_OPB(A, hgemvBatch,
+           (order, transA, M, N, alpha, A, offA, lda, x, offX, incX,
+            beta, y, offY, incY, batchCount, flags));
 }
 
 int gpublas_sgemvBatch(
@@ -168,10 +212,9 @@ int gpublas_sgemvBatch(
   gpudata **x, size_t *offX, size_t incX,
   float beta, gpudata **y, size_t *offY, size_t incY,
   size_t batchCount, int flags) {
-  if (batchCount == 0) return GA_NO_ERROR;
-  return gpudata_context(A[0])->blas_ops->sgemvBatch(
-    order, transA, M, N, alpha, A, offA, lda, x, offX, incX,
-    beta, y, offY, incY, batchCount, flags);
+  BLAS_OPB(A, sgemvBatch,
+           (order, transA, M, N, alpha, A, offA, lda, x, offX, incX,
+            beta, y, offY, incY, batchCount, flags));
 }
 
 int gpublas_dgemvBatch(
@@ -181,10 +224,9 @@ int gpublas_dgemvBatch(
   gpudata **x, size_t *offX, size_t incX,
   double beta, gpudata **y, size_t *offY, size_t incY,
   size_t batchCount, int flags) {
-  if (batchCount == 0) return GA_NO_ERROR;
-  return gpudata_context(A[0])->blas_ops->dgemvBatch(
-    order, transA, M, N, alpha, A, offA, lda, x, offX, incX,
-    beta, y, offY, incY, batchCount, flags);
+  BLAS_OPB(A, dgemvBatch,
+           (order, transA, M, N, alpha, A, offA, lda, x, offX, incX,
+            beta, y, offY, incY, batchCount, flags));
 }
 
 int gpublas_hgerBatch(cb_order order, size_t M, size_t N, float alpha,
@@ -192,10 +234,9 @@ int gpublas_hgerBatch(cb_order order, size_t M, size_t N, float alpha,
                       gpudata **y, size_t *offY, size_t incY,
                       gpudata **A, size_t *offA, size_t lda,
                       size_t batchCount, int flags) {
-  if (batchCount == 0) return GA_NO_ERROR;
-  return gpudata_context(x[0])->blas_ops->hgerBatch(
-    order, M, N, alpha, x, offX, incX, y, offY, incY,
-    A, offA, lda, batchCount, flags);
+  BLAS_OPB(x, hgerBatch,
+           (order, M, N, alpha, x, offX, incX, y, offY, incY,
+            A, offA, lda, batchCount, flags));
 }
 
 int gpublas_sgerBatch(cb_order order, size_t M, size_t N, float alpha,
@@ -203,10 +244,9 @@ int gpublas_sgerBatch(cb_order order, size_t M, size_t N, float alpha,
                       gpudata **y, size_t *offY, size_t incY,
                       gpudata **A, size_t *offA, size_t lda,
                       size_t batchCount, int flags) {
-  if (batchCount == 0) return GA_NO_ERROR;
-  return gpudata_context(x[0])->blas_ops->sgerBatch(
-    order, M, N, alpha, x, offX, incX, y, offY, incY,
-    A, offA, lda, batchCount, flags);
+  BLAS_OPB(x, sgerBatch,
+           (order, M, N, alpha, x, offX, incX, y, offY, incY,
+            A, offA, lda, batchCount, flags));
 }
 
 int gpublas_dgerBatch(cb_order order, size_t M, size_t N, double alpha,
@@ -214,8 +254,54 @@ int gpublas_dgerBatch(cb_order order, size_t M, size_t N, double alpha,
                       gpudata **y, size_t *offY, size_t incY,
                       gpudata **A, size_t *offA, size_t lda,
                       size_t batchCount, int flags) {
-  if (batchCount == 0) return GA_NO_ERROR;
-  return gpudata_context(x[0])->blas_ops->dgerBatch(
-    order, M, N, alpha, x, offX, incX, y, offY, incY,
-    A, offA, lda, batchCount, flags);
+  BLAS_OPB(x, dgerBatch,
+           (order, M, N, alpha, x, offX, incX, y, offY, incY,
+            A, offA, lda, batchCount, flags));
+}
+
+
+#define BLAS_OP3F(b, name, args) /* strided-batch wrapper body: context comes from the single buffer b */ \
+  gpucontext *ctx;                                                      \
+  if (batchCount == 0) return GA_NO_ERROR; /* an empty batch succeeds before flags are validated */ \
+  ctx = gpudata_context(b); /* blas_ops may be NULL, as checked in gpublas_setup() */ \
+  if (flags != 0) return error_set(ctx->err, GA_INVALID_ERROR, "flags is not 0"); \
+  if (ctx->blas_ops != NULL && ctx->blas_ops->name)                     \
+    return ctx->blas_ops->name args;                                    \
+  else                                                                  \
+    return error_fmt(ctx->err, GA_DEVSUP_ERROR, "Blas operation not supported by library in use: %s", #name)
+
+int gpublas_hgemm3D(
+    cb_order order, cb_transpose transA, cb_transpose transB,
+    size_t M, size_t N, size_t K, float alpha,
+    gpudata *A, size_t offA, size_t lda, ssize_t strideA,
+    gpudata *B, size_t offB, size_t ldb, ssize_t strideB,
+    float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC,
+    size_t batchCount, int flags) {
+  BLAS_OP3F(A, hgemm3D,
+            (order, transA, transB, M, N, K, alpha, A, offA, lda, strideA,
+             B, offB, ldb, strideB, beta, C, offC, ldc, strideC, batchCount));
+}
+
+int gpublas_sgemm3D(
+    cb_order order, cb_transpose transA, cb_transpose transB,
+    size_t M, size_t N, size_t K, float alpha,
+    gpudata *A, size_t offA, size_t lda, ssize_t strideA,
+    gpudata *B, size_t offB, size_t ldb, ssize_t strideB,
+    float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC,
+    size_t batchCount, int flags) {
+  BLAS_OP3F(A, sgemm3D,
+            (order, transA, transB, M, N, K, alpha, A, offA, lda, strideA,
+             B, offB, ldb, strideB, beta, C, offC, ldc, strideC, batchCount));
+}
+
+int gpublas_dgemm3D(
+    cb_order order, cb_transpose transA, cb_transpose transB,
+    size_t M, size_t N, size_t K, double alpha,
+    gpudata *A, size_t offA, size_t lda, ssize_t strideA,
+    gpudata *B, size_t offB, size_t ldb, ssize_t strideB,
+    double beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC,
+    size_t batchCount, int flags) {
+  BLAS_OP3F(A, dgemm3D,
+            (order, transA, transB, M, N, K, alpha, A, offA, lda, strideA,
+             B, offB, ldb, strideB, beta, C, offC, ldc, strideC, batchCount));
 }
diff --git a/src/gpuarray_buffer_collectives.c b/src/gpuarray_buffer_collectives.c
index 3a55c4fcfb..803055bef3 100644
--- a/src/gpuarray_buffer_collectives.c
+++ b/src/gpuarray_buffer_collectives.c
@@ -8,22 +8,21 @@ int gpucomm_new(gpucomm** comm, gpucontext* ctx, gpucommCliqueId comm_id,
                 int ndev, int rank) {
   if (ctx->comm_ops == NULL) {
     *comm = NULL;
-    return GA_UNSUPPORTED_ERROR;
+    return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Collectives unavailable");
   }
   return ctx->comm_ops->comm_new(comm, ctx, comm_id, ndev, rank);
 }
 
 void gpucomm_free(gpucomm* comm) {
-  gpucontext* ctx = gpucomm_context(comm);
+  gpucontext* ctx;
+  if (comm == NULL) return;
+  ctx = gpucomm_context(comm);
   if (ctx->comm_ops != NULL)
     ctx->comm_ops->comm_free(comm);
 }
 
 const char* gpucomm_error(gpucontext* ctx) {
-  if (ctx->comm_ops != NULL)
-    return ctx->error_msg;
-  return "No collective ops available, API error. Is a collectives library "
-         "installed?";
+  return ctx->err->msg;
 }
 
 gpucontext* gpucomm_context(gpucomm* comm) {
@@ -31,21 +30,21 @@ gpucontext* gpucomm_context(gpucomm* comm) {
 }
 int gpucomm_gen_clique_id(gpucontext* ctx, gpucommCliqueId* comm_id) {
   if (ctx->comm_ops == NULL)
-    return GA_COMM_ERROR;
+    return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable");
   return ctx->comm_ops->generate_clique_id(ctx, comm_id);
 }
 
 int gpucomm_get_count(gpucomm* comm, int* gpucount) {
   gpucontext* ctx = gpucomm_context(comm);
   if (ctx->comm_ops == NULL)
-    return GA_COMM_ERROR;
+    return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable");
   return ctx->comm_ops->get_count(comm, gpucount);
 }
 
 int gpucomm_get_rank(gpucomm* comm, int* rank) {
   gpucontext* ctx = gpucomm_context(comm);
   if (ctx->comm_ops == NULL)
-    return GA_COMM_ERROR;
+    return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable");
   return ctx->comm_ops->get_rank(comm, rank);
 }
 
@@ -54,7 +53,7 @@ int gpucomm_reduce(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest,
                    gpucomm* comm) {
   gpucontext* ctx = gpucomm_context(comm);
   if (ctx->comm_ops == NULL)
-    return GA_COMM_ERROR;
+    return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable");
   return ctx->comm_ops->reduce(src, offsrc, dest, offdest, count, typecode,
                                opcode, root, comm);
 }
@@ -64,7 +63,7 @@ int gpucomm_all_reduce(gpudata* src, size_t offsrc, gpudata* dest,
                        gpucomm* comm) {
   gpucontext* ctx = gpucomm_context(comm);
   if (ctx->comm_ops == NULL)
-    return GA_COMM_ERROR;
+    return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable");
   return ctx->comm_ops->all_reduce(src, offsrc, dest, offdest, count, typecode,
                                    opcode, comm);
 }
@@ -74,7 +73,7 @@ int gpucomm_reduce_scatter(gpudata* src, size_t offsrc, gpudata* dest,
                            int opcode, gpucomm* comm) {
   gpucontext* ctx = gpucomm_context(comm);
   if (ctx->comm_ops == NULL)
-    return GA_COMM_ERROR;
+    return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable");
   return ctx->comm_ops->reduce_scatter(src, offsrc, dest, offdest, count,
                                        typecode, opcode, comm);
 }
@@ -83,7 +82,7 @@ int gpucomm_broadcast(gpudata* array, size_t offset, size_t count, int typecode,
                       int root, gpucomm* comm) {
   gpucontext* ctx = gpucomm_context(comm);
   if (ctx->comm_ops == NULL)
-    return GA_COMM_ERROR;
+    return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable");
   return ctx->comm_ops->broadcast(array, offset, count, typecode, root, comm);
 }
 
@@ -92,7 +91,7 @@ int gpucomm_all_gather(gpudata* src, size_t offsrc, gpudata* dest,
                        gpucomm* comm) {
   gpucontext* ctx = gpucomm_context(comm);
   if (ctx->comm_ops == NULL)
-    return GA_COMM_ERROR;
+    return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable");
   return ctx->comm_ops->all_gather(src, offsrc, dest, offdest, count, typecode,
                                    comm);
 }
diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c
index cfbc1e672f..772e9757f0 100644
--- a/src/gpuarray_buffer_cuda.c
+++ b/src/gpuarray_buffer_cuda.c
@@ -3,6 +3,9 @@
 #include "private.h"
 #include "private_cuda.h"
 
+#include "loaders/libnvrtc.h"
+#include "loaders/libcublas.h"
+
 #include <sys/types.h>
 
 #include <assert.h>
@@ -16,39 +19,185 @@
 #include "gpuarray/buffer.h"
 #include "gpuarray/util.h"
 #include "gpuarray/error.h"
-#include "gpuarray/extension.h"
 #include "gpuarray/buffer_blas.h"
 
+#include "gpuarray/extension.h"
+
+#include "cluda_cuda.h.c"
+
+STATIC_ASSERT(DONTFREE == GPUARRAY_CUDA_CTX_NOFREE, cuda_nofree_eq);
+STATIC_ASSERT(CUDA_WAIT_READ == GPUARRAY_CUDA_WAIT_READ, cuda_wait_read_eq);
+STATIC_ASSERT(CUDA_WAIT_WRITE == GPUARRAY_CUDA_WAIT_WRITE, cuda_wait_write_eq);
+STATIC_ASSERT(sizeof(GpuArrayIpcMemHandle) == sizeof(CUipcMemHandle), cuda_ipcmem_eq);
+
 /* Allocations will be made in blocks of at least this size */
 #define BLOCK_SIZE (4 * 1024 * 1024)
 
-/* No returned allocations will be smaller than this size.
-   Also, they will be aligned to this size. */
+/* No returned allocations will be smaller than this size.  Also, they
+ * will be aligned to this size.
+ *
+ * Some libraries depend on this value and will crash if it's smaller.
+ */
 #define FRAG_SIZE (64)
 
-static CUresult err;
-static int init_done = 0;
+extern gpuarray_blas_ops cublas_ops;
+extern gpuarray_comm_ops nccl_ops;
 
-GPUARRAY_LOCAL const gpuarray_buffer_ops cuda_ops;
+const gpuarray_buffer_ops cuda_ops;
 
 static void cuda_freekernel(gpukernel *);
 static int cuda_property(gpucontext *, gpudata *, gpukernel *, int, void *);
 static int cuda_waits(gpudata *, int, CUstream);
 static int cuda_records(gpudata *, int, CUstream);
+static gpudata *cuda_alloc(gpucontext *c, size_t size, void *data, int flags);
+static void cuda_free(gpudata *);
 
-static int detect_arch(const char *prefix, char *ret, CUresult *err);
+static int detect_arch(const char *prefix, char *ret, error *e);
 static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size);
 
-static int strb_eq(void *_k1, void *_k2) {
-  strb *k1 = (strb *)_k1;
-  strb *k2 = (strb *)_k2;
+typedef struct _disk_key {
+  uint8_t version;   /* key format version; disk_read() only accepts 0 */
+  uint8_t debug;
+  uint8_t major;
+  uint8_t minor;
+  uint32_t reserved;
+  char bin_id[64];
+  strb src;          /* variable-length part, excluded from DISK_KEY_MM */
+} disk_key;
+
+typedef struct _kernel_key {
+  const char *fname;
+  strb src;
+} kernel_key;
+
+/* Size of the disk_key that we can memcopy to duplicate */
+#define DISK_KEY_MM (sizeof(disk_key) - sizeof(strb))
+
+static void disk_free(cache_key_t _k) {
+  disk_key *k = (disk_key *)_k;
+  strb_clear(&k->src);
+  free(k);
+}
+
+static int strb_eq(strb *k1, strb *k2) {
   return (k1->l == k2->l &&
           memcmp(k1->s, k2->s, k1->l) == 0);
 }
 
-static uint32_t strb_hash(void *_k) {
-  strb *k = (strb *)_k;
-  return XXH32(k->s, k->l, 42);
+static int kernel_eq(kernel_key *k1, kernel_key *k2) {
+  return (strcmp(k1->fname, k2->fname) == 0 &&
+          strb_eq(&k1->src, &k2->src));
+}
+
+static uint32_t kernel_hash(kernel_key *k) {
+  XXH32_state_t state;
+  XXH32_reset(&state, 42);
+  XXH32_update(&state, k->fname, strlen(k->fname));
+  XXH32_update(&state, k->src.s, k->src.l);
+  return XXH32_digest(&state);
+}
+
+static void kernel_free(kernel_key *k) {
+  free((void *)k->fname);
+  strb_clear(&k->src);
+  free(k);
+}
+
+static int disk_eq(disk_key *k1, disk_key *k2) {
+  return (memcmp(k1, k2, DISK_KEY_MM) == 0 &&
+          strb_eq(&k1->src, &k2->src));
+}
+
+static uint32_t disk_hash(disk_key *k) {  /* same return type as kernel_hash() */
+  XXH32_state_t state;
+  XXH32_reset(&state, 42);
+  XXH32_update(&state, k, DISK_KEY_MM);      /* fixed-size leading fields */
+  XXH32_update(&state, k->src.s, k->src.l);  /* then the source bytes */
+  return XXH32_digest(&state);
+}
+
+static int disk_write(strb *res, disk_key *k) {
+  strb_appendn(res, (const char *)k, DISK_KEY_MM);
+  strb_appendb(res, &k->src);
+  return strb_error(res);
+}
+
+static disk_key *disk_read(const strb *b) {
+  disk_key *k;
+  if (b->l < DISK_KEY_MM) return NULL;  /* too short to hold the fixed-size part */
+  k = calloc(1, sizeof(*k));
+  if (k == NULL) return NULL;
+  memcpy(k, b->s, DISK_KEY_MM);  /* fixed-size fields, in the layout disk_write() emits */
+  if (k->version != 0) {  /* any version other than 0 is rejected */
+    free(k);
+    return NULL;
+  }
+  if (strb_ensure(&k->src, b->l - DISK_KEY_MM) != 0) {
+    strb_clear(&k->src);
+    free(k);
+    return NULL;
+  }
+  strb_appendn(&k->src, b->s + DISK_KEY_MM, b->l - DISK_KEY_MM);  /* the rest is the source */
+  return k;
+}
+
+static int kernel_write(strb *res, strb *bin) {
+  strb_appendb(res, bin);
+  return strb_error(res);
+}
+
+static strb *kernel_read(const strb *b) {
+  strb *res = strb_alloc(b->l);
+  if (res != NULL)
+    strb_appendb(res, b);
+  return res;
+}
+
+static int setup_done = 0;
+static int major = -1;
+static int minor = -1;
+static int setup_lib(void) {
+  CUresult err;
+  int res, tmp;
+  int orig_major, orig_minor;
+  /* One-time setup: load libcuda, init the driver, then find a loadable nvrtc. */
+  if (!setup_done) {
+    res = load_libcuda(global_err);
+    if (res != GA_NO_ERROR)
+      return res;
+    err = cuInit(0);
+    if (err != CUDA_SUCCESS)
+      return error_cuda(global_err, "cuInit", err);
+    err = cuDriverGetVersion(&tmp);
+    if (err != CUDA_SUCCESS)
+      return error_set(global_err, GA_IMPL_ERROR, "cuDriverGetVersion failed");
+    major = tmp / 1000;
+    minor = (tmp / 10) % 10;
+    /* Let's try to load a nvrtc corresponding to detected CUDA version. */
+    res = load_libnvrtc(major, minor, global_err);
+    orig_major = major;
+    orig_minor = minor;
+    if (res != GA_NO_ERROR) {
+      /* Else, let's try to find a nvrtc corresponding to supported CUDA versions. */
+      int versions[][2] = {{10, 0}, {9, 2}, {9, 1}, {9, 0}, {8, 0}, {7, 5}, {7, 0}};
+      int versions_length = sizeof(versions) / sizeof(versions[0]);
+      int i = 0;
+      /* Skip versions >= the driver version, never walking past the table */
+      while (i < versions_length && (versions[i][0] > major ||
+             (versions[i][0] == major && versions[i][1] >= minor))) i++;
+      while (res != GA_NO_ERROR && i < versions_length) {
+        major = versions[i][0];
+        minor = versions[i][1];
+        res = load_libnvrtc(major, minor, global_err);
+        i++;
+      }
+    }
+    if (res != GA_NO_ERROR)
+      // Return the error from the original attempt
+      return load_libnvrtc(orig_major, orig_minor, global_err);
+    setup_done = 1;
+  }
+  return GA_NO_ERROR;
 }
 
 static int cuda_get_platform_count(unsigned int* platcount) {
@@ -58,42 +207,73 @@ static int cuda_get_platform_count(unsigned int* platcount) {
 
 static int cuda_get_device_count(unsigned int platform,
                                  unsigned int* devcount) {
+  CUresult err;
   int dv;
   // platform number gets ignored in CUDA implementation
-  if (!init_done) {
-    err = cuInit(0);
-    if (err != CUDA_SUCCESS)
-      return GA_IMPL_ERROR;
-    init_done = 1;
-  }
+  GA_CHECK(setup_lib());
   err = cuDeviceGetCount(&dv);
   if (err != CUDA_SUCCESS)
-    return GA_IMPL_ERROR;
+    return error_cuda(global_err, "cuDeviceGetCount", err);
   *devcount = (unsigned int)dv;
   return GA_NO_ERROR;
 }
 
-cuda_context *cuda_make_ctx(CUcontext ctx, int flags) {
+cuda_context *cuda_make_ctx(CUcontext ctx, gpucontext_props *p) {
   cuda_context *res;
-  void *p;
+  cache *mem_cache;
+  const char *cache_path;
+  void *pp;
+  CUdevice dev;
+  CUresult err;
+  int cc_major, cc_minor;
+  int e;
+
+  e = setup_lib();
+  if (e != GA_NO_ERROR)
+    return NULL;
+
+  err = cuCtxGetDevice(&dev);
+  if (err != CUDA_SUCCESS) {
+    error_cuda(global_err, "cuCtxGetDevice", err);
+    return NULL;
+  }
+
+  e = get_cc(dev, &cc_major, &cc_minor, global_err);
+  if (e != GA_NO_ERROR)
+    return NULL;
+
+  if ((major >= 9 && cc_major <= 2) || (major >= 7 && cc_major <= 1)) {
+    error_set(global_err, GA_UNSUPPORTED_ERROR,
+              "GPU is too old for CUDA version");
+    return NULL;
+  }
 
   res = calloc(1, sizeof(*res));
-  if (res == NULL)
+  if (res == NULL) {
+    error_sys(global_err, "calloc");
     return NULL;
+  }
   res->ctx = ctx;
   res->ops = &cuda_ops;
-  res->err = CUDA_SUCCESS;
   res->refcnt = 1;
-  res->flags = flags;
+  res->flags = p->flags;
+  res->max_cache_size = p->max_cache_size;
   res->enter = 0;
+  res->major = major;
+  res->minor = minor;
   res->freeblocks = NULL;
-  if (detect_arch(ARCH_PREFIX, res->bin_id, &err)) {
+  if (error_alloc(&res->err)) {
+    error_set(global_err, GA_SYS_ERROR, "Could not create error context");
+    goto fail_errmsg;
+  }
+  if (detect_arch(ARCH_PREFIX, res->bin_id, global_err)) {
     goto fail_stream;
   }
   /* Don't add the nonblocking flags to help usage with other
      libraries that may do stuff on the NULL stream */
   err = cuStreamCreate(&res->s, 0);
   if (err != CUDA_SUCCESS) {
+    error_cuda(global_err, "cuStreamCreate", err);
     goto fail_stream;
   }
   if (ISSET(res->flags, GA_CTX_SINGLE_STREAM)) {
@@ -103,31 +283,80 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) {
        libraries that may do stuff on the NULL stream */
     err = cuStreamCreate(&res->mem_s, 0);
     if (err != CUDA_SUCCESS) {
+      error_cuda(global_err, "cuStreamCreate", err);
       goto fail_mem_stream;
     }
   }
-  res->kernel_cache = cache_twoq(64, 128, 64, 8, strb_eq, strb_hash,
-                                 (cache_freek_fn)strb_free,
-                                 (cache_freev_fn)cuda_freekernel);
-  if (res->kernel_cache == NULL)
+
+  res->kernel_cache = cache_twoq(64, 128, 64, 8,
+                                 (cache_eq_fn)kernel_eq,
+                                 (cache_hash_fn)kernel_hash,
+                                 (cache_freek_fn)kernel_free,
+                                 (cache_freev_fn)cuda_freekernel, global_err);
+  if (res->kernel_cache == NULL) {
+    /* cache_twoq() was handed global_err; don't clobber it with a stale CUresult */
     goto fail_cache;
-  err = cuMemAllocHost(&p, 16);
+  }
+
+  cache_path = p->kernel_cache_path;
+  if (cache_path == NULL)
+    cache_path = getenv("GPUARRAY_CACHE_PATH");
+  if (cache_path != NULL) {
+    mem_cache = cache_lru(64, 8,
+                          (cache_eq_fn)disk_eq,
+                          (cache_hash_fn)disk_hash,
+                          (cache_freek_fn)disk_free,
+                          (cache_freev_fn)strb_free,
+                          global_err);
+    if (mem_cache == NULL) {
+      fprintf(stderr, "Error initializing mem cache for disk: %s\n",
+              global_err->msg);
+      goto fail_disk_cache;
+    }
+    res->disk_cache = cache_disk(cache_path, mem_cache,
+                                 (kwrite_fn)disk_write,
+                                 (vwrite_fn)kernel_write,
+                                 (kread_fn)disk_read,
+                                 (vread_fn)kernel_read,
+                                 global_err);
+    if (res->disk_cache == NULL) {
+      fprintf(stderr, "Error initializing disk cache, disabling: %s\n",
+              global_err->msg);
+      cache_destroy(mem_cache);
+      goto fail_disk_cache;
+    }
+  } else {
+  fail_disk_cache:
+    res->disk_cache = NULL;
+  }
+
+  err = cuMemAllocHost(&pp, 16);
   if (err != CUDA_SUCCESS) {
+    error_cuda(global_err, "cuMemAllocHost", err);
     goto fail_errbuf;
   }
-  memset(p, 0, 16);
+  memset(pp, 0, 16);
   /* Need to tag for new_gpudata */
   TAG_CTX(res);
-  res->errbuf = new_gpudata(res, (CUdeviceptr)p, 16);
+  res->errbuf = new_gpudata(res, (CUdeviceptr)pp, 16);
   if (res->errbuf == NULL) {
-    err = res->err;
+    /* Copy the error from the context since we are getting rid of it */
+    error_set(global_err, res->err->code, res->err->msg);
     goto fail_end;
   }
   res->errbuf->flags |= CUDA_MAPPED_PTR;
+  /* Prime the cache */
+  if (p->initial_cache_size) {
+    gpudata *tmp = cuda_alloc((gpucontext *)res, p->initial_cache_size, NULL, 0);
+    if (tmp != NULL)
+      cuda_free(tmp);
+  }
   return res;
  fail_end:
-  cuMemFreeHost(p);
+  cuMemFreeHost(pp);
  fail_errbuf:
+  if (res->disk_cache)
+    cache_destroy(res->disk_cache);
   cache_destroy(res->kernel_cache);
  fail_cache:
   if (ISCLR(res->flags, GA_CTX_SINGLE_STREAM))
@@ -135,6 +364,8 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) {
  fail_mem_stream:
   cuStreamDestroy(res->s);
  fail_stream:
+  error_free(res->err);
+ fail_errmsg:
   free(res);
   return NULL;
 }
@@ -142,20 +373,15 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) {
 static void deallocate(gpudata *);
 
 static void cuda_free_ctx(cuda_context *ctx) {
-  gpuarray_blas_ops *blas_ops;
   gpudata *next, *curr;
-#if CUDA_VERSION >= 7000
   CUdevice dev;
-#endif
 
   ASSERT_CTX(ctx);
   ctx->refcnt--;
   if (ctx->refcnt == 0) {
     assert(ctx->enter == 0 && "Context was active when freed!");
     if (ctx->blas_handle != NULL) {
-      ctx->err = cuda_property((gpucontext *)ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS,
-                               &blas_ops);
-      blas_ops->teardown((gpucontext *)ctx);
+      ctx->blas_ops->teardown((gpucontext *)ctx);
     }
     cuMemFreeHost((void *)ctx->errbuf->ptr);
     deallocate(ctx->errbuf);
@@ -171,16 +397,15 @@ static void cuda_free_ctx(cuda_context *ctx) {
       deallocate(curr);
     }
     cache_destroy(ctx->kernel_cache);
+    if (ctx->disk_cache)
+      cache_destroy(ctx->disk_cache);
+    error_free(ctx->err);
 
     if (!(ctx->flags & DONTFREE)) {
-#if CUDA_VERSION < 7000
-      cuCtxDestroy(ctx->ctx);
-#else
       cuCtxPushCurrent(ctx->ctx);
       cuCtxGetDevice(&dev);
       cuCtxPopCurrent(NULL);
       cuDevicePrimaryCtxRelease(dev);
-#endif
     }
     CLEAR(ctx);
     free(ctx);
@@ -209,10 +434,14 @@ void cuda_exit(cuda_context *ctx) {
 
 static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size) {
   gpudata *res;
+  CUresult err;
   int fl = CU_EVENT_DISABLE_TIMING;
 
   res = malloc(sizeof(*res));
-  if (res == NULL) return NULL;
+  if (res == NULL) {
+    error_sys(ctx->err, "malloc");
+    return NULL;
+  }
 
   res->refcnt = 0;
   res->sz = size;
@@ -224,15 +453,17 @@ static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size) {
 
   if (ctx->flags & GA_CTX_MULTI_THREAD)
     fl |= CU_EVENT_BLOCKING_SYNC;
-  ctx->err = cuEventCreate(&res->rev, fl);
-  if (ctx->err != CUDA_SUCCESS) {
+  err = cuEventCreate(&res->rev, fl);
+  if (err != CUDA_SUCCESS) {
+    error_cuda(ctx->err, "cuEventCreate", err);
     cuda_exit(ctx);
     free(res);
     return NULL;
   }
 
-  ctx->err = cuEventCreate(&res->wev, fl);
-  if (ctx->err != CUDA_SUCCESS) {
+  err = cuEventCreate(&res->wev, fl);
+  if (err != CUDA_SUCCESS) {
+    error_cuda(ctx->err, "cuEventCreate", err);
     cuEventDestroy(res->rev);
     cuda_exit(ctx);
     free(res);
@@ -263,135 +494,108 @@ gpudata *cuda_make_buf(cuda_context *ctx, CUdeviceptr p, size_t sz) {
 
 size_t cuda_get_sz(gpudata *g) { ASSERT_BUF(g); return g->sz; }
 
-#define FAIL(v, e) { if (ret) *ret = e; return v; }
-#define CHKFAIL(v) if (err != CUDA_SUCCESS) FAIL(v, GA_IMPL_ERROR)
-
-static const char CUDA_PREAMBLE[] =
-    "#define local_barrier() __syncthreads()\n"
-    "#define WITHIN_KERNEL extern \"C\" __device__\n"
-    "#define KERNEL extern \"C\" __global__\n"
-    "#define GLOBAL_MEM /* empty */\n"
-    "#define LOCAL_MEM __shared__\n"
-    "#define LOCAL_MEM_ARG /* empty */\n"
-    "#define REQD_WG_SIZE(X,Y,Z) __launch_bounds__(X*Y, Z)\n"
-    "#ifdef NAN\n"
-    "#undef NAN\n"
-    "#endif\n"
-    "#define NAN __int_as_float(0x7fffffff)\n"
-    "#define LID_0 threadIdx.x\n"
-    "#define LID_1 threadIdx.y\n"
-    "#define LID_2 threadIdx.z\n"
-    "#define LDIM_0 blockDim.x\n"
-    "#define LDIM_1 blockDim.y\n"
-    "#define LDIM_2 blockDim.z\n"
-    "#define GID_0 blockIdx.x\n"
-    "#define GID_1 blockIdx.y\n"
-    "#define GID_2 blockIdx.z\n"
-    "#define GDIM_0 gridDim.x\n"
-    "#define GDIM_1 gridDim.y\n"
-    "#define GDIM_2 gridDim.z\n"
-    "#define ga_bool unsigned char\n"
-    "#define ga_byte signed char\n"
-    "#define ga_ubyte unsigned char\n"
-    "#define ga_short short\n"
-    "#define ga_ushort unsigned short\n"
-    "#define ga_int int\n"
-    "#define ga_uint unsigned int\n"
-    "#define ga_long long long\n"
-    "#define ga_ulong unsigned long long\n"
-    "#define ga_float float\n"
-    "#define ga_double double\n"
-    "#define ga_half ga_ushort\n"
-    "#define ga_size size_t\n"
-    "#define ga_ssize ptrdiff_t\n"
-    "#define load_half(p) __half2float(*(p))\n"
-    "#define store_half(p, v) (*(p) = __float2half_rn(v))\n"
-    "#define GA_DECL_SHARED_PARAM(type, name)\n"
-    "#define GA_DECL_SHARED_BODY(type, name) extern __shared__ type name[];\n"
-    "#define GA_WARP_SIZE warpSize\n";
-
-/* XXX: add complex, quads, longlong */
-/* XXX: add vector types */
-
-static cuda_context *do_init(CUdevice dev, int flags, int *ret) {
-    cuda_context *res;
-    CUcontext ctx;
-    unsigned int fl = CU_CTX_SCHED_AUTO;
-#if CUDA_VERSION >= 7000
-    unsigned int cur_fl;
-    int act;
-#endif
-    int i;
+#define CHKFAIL(e, n, v)      \
+  if (err != CUDA_SUCCESS) { \
+    error_cuda(e, n, err);   \
+    return v;                \
+  }
 
-    CHKFAIL(NULL);
-    if (flags & GA_CTX_SINGLE_THREAD)
-      fl = CU_CTX_SCHED_SPIN;
-    if (flags & GA_CTX_MULTI_THREAD)
-      fl = CU_CTX_SCHED_YIELD;
-    err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
-    CHKFAIL(NULL);
-    if (i != 1)
-      FAIL(NULL, GA_UNSUPPORTED_ERROR);
-#if CUDA_VERSION < 7000
-    err = cuCtxCreate(&ctx, fl, dev);
-    CHKFAIL(NULL);
-#else
-    err = cuDevicePrimaryCtxGetState(dev, &cur_fl, &act);
-    CHKFAIL(NULL);
-    if (act == 1) {
-      if ((cur_fl & fl) != fl)
-        FAIL(NULL, GA_INVALID_ERROR);
-    } else {
-      err = cuDevicePrimaryCtxSetFlags(dev, fl);
-      CHKFAIL(NULL);
-    }
-    err = cuDevicePrimaryCtxRetain(&ctx, dev);
-    CHKFAIL(NULL);
-    err = cuCtxPushCurrent(ctx);
-    CHKFAIL(NULL);
-#endif
-    res = cuda_make_ctx(ctx, flags);
-    if (res == NULL) {
-#if CUDA_VERSION < 7000
-      cuCtxDestroy(ctx);
-#else
-      cuDevicePrimaryCtxRelease(dev);
-#endif
-      FAIL(NULL, GA_IMPL_ERROR);
+static cuda_context *do_init(CUdevice dev, gpucontext_props *p, error *e) {
+  cuda_context *res;
+  CUcontext ctx;
+  CUresult err;
+  unsigned int fl = 0;
+  unsigned int cur_fl;
+  int act;
+  int i;
+
+  switch (p->sched) {
+  case GA_CTX_SCHED_AUTO:
+    fl = CU_CTX_SCHED_AUTO;
+    break;
+  case GA_CTX_SCHED_SINGLE:
+    fl = CU_CTX_SCHED_SPIN;
+    break;
+  case GA_CTX_SCHED_MULTI:
+    fl = CU_CTX_SCHED_BLOCKING_SYNC;
+    break;
+  }
+  err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
+  CHKFAIL(e, "cuDeviceGetAttribute", NULL);
+  if (i != 1) {
+    error_set(e, GA_UNSUPPORTED_ERROR, "device does not support unified addressing");
+    return NULL;
+  }
+  err = cuDevicePrimaryCtxGetState(dev, &cur_fl, &act);
+  CHKFAIL(e, "cuDevicePrimaryCtxGetState", NULL);
+  if (act == 1) {
+    if ((cur_fl & fl) != fl) {
+      error_set(e, GA_INVALID_ERROR, "device is already active and has unsupported flags");
+      return NULL;
     }
-    /* Don't leave the context on the thread stack */
-    cuCtxPopCurrent(NULL);
+  } else {
+    err = cuDevicePrimaryCtxSetFlags(dev, fl);
+    CHKFAIL(e, "cuDevicePrimaryCtxSetFlags", NULL);
+  }
+  err = cuDevicePrimaryCtxRetain(&ctx, dev);
+  CHKFAIL(e, "cuDevicePrimaryCtxRetain", NULL);
+  err = cuCtxPushCurrent(ctx);
+  CHKFAIL(e, "cuCtxPushCurrent", NULL);
+  res = cuda_make_ctx(ctx, p);
+  if (res == NULL) {
+    cuDevicePrimaryCtxRelease(dev);
+    if (e != global_err)
+      error_set(e, global_err->code, global_err->msg);
+    return NULL;
+  }
 
-    return res;
+  res->blas_handle = NULL;
+  /* If we can't load cublas, then we have no blas */
+  if (!load_libcublas(major, minor, res->err)) {
+    res->blas_ops = &cublas_ops;
+  } else {
+    res->blas_ops = NULL;
+  }
+
+  res->comm_ops = &nccl_ops;
+
+  /* Don't leave the context on the thread stack */
+  cuCtxPopCurrent(NULL);
+
+  return res;
 }
-static gpucontext *cuda_init(int ord, int flags, int *ret) {
+
+static gpucontext *cuda_init(gpucontext_props *p) {
     CUdevice dev;
     cuda_context *res;
+    CUresult err;
+    int r;
 
-    if (!init_done) {
-      err = cuInit(0);
-      CHKFAIL(NULL);
-      init_done = 1;
+    r = setup_lib();
+    if (r != GA_NO_ERROR) {
+      return NULL;
     }
 
-    if (ord == -1) {
+    if (p->dev == -1) {
       int i, c;
       err = cuDeviceGetCount(&c);
-      CHKFAIL(NULL);
+      CHKFAIL(global_err, "cuDeviceGetCount", NULL);
       for (i = 0; i < c; i++) {
         err = cuDeviceGet(&dev, i);
-        CHKFAIL(NULL);
-        res = do_init(dev, flags, NULL);
+        CHKFAIL(global_err, "cuDeviceGet", NULL);
+        res = do_init(dev, p, global_err);
         if (res != NULL)
           return (gpucontext *)res;
       }
-      FAIL(NULL, GA_NODEV_ERROR);
+      error_set(global_err, GA_NODEV_ERROR, "No cuda device available");
+      return NULL;
     } else {
-      err = cuDeviceGet(&dev, ord);
-      CHKFAIL(NULL);
-      return (gpucontext *)do_init(dev, flags, ret);
+      err = cuDeviceGet(&dev, p->dev);
+      CHKFAIL(global_err, "cuDeviceGet", NULL);
+      return (gpucontext *)do_init(dev, p, global_err);
     }
 }
+
 static void cuda_deinit(gpucontext *c) {
   cuda_free_ctx((cuda_context *)c);
 }
@@ -414,6 +618,21 @@ static void find_best(cuda_context *ctx, gpudata **best, gpudata **prev,
   }
 }
 
+static size_t largest_size(cuda_context *ctx) {
+  /* Estimate the largest single allocation: max(free/4, biggest free block). */
+  gpudata *temp;
+  size_t sz = 0, dummy = 0;
+  cuda_enter(ctx);
+  if (cuMemGetInfo(&sz, &dummy) != CUDA_SUCCESS) sz = 0; /* don't use an unset value */
+  cuda_exit(ctx);
+  /* We guess that we can allocate at least a quarter of the free size
+     in a single block. This might be wrong though. */
+  sz /= 4;
+  for (temp = ctx->freeblocks; temp; temp = temp->next)
+    if (temp->sz > sz) sz = temp->sz;
+  return sz;
+}
+
 /*
  * Allocate a new block and place in on the freelist. Will allocate
  * the bigger of the requested size and BLOCK_SIZE to avoid allocating
@@ -423,17 +642,22 @@ static int allocate(cuda_context *ctx, gpudata **res, gpudata **prev,
                     size_t size) {
   CUdeviceptr ptr;
   gpudata *next;
+  CUresult err;
+
   *prev = NULL;
 
-  if (!(ctx->flags & GA_CTX_DISABLE_ALLOCATION_CACHE))
+  if (ctx->max_cache_size != 0) {
     if (size < BLOCK_SIZE) size = BLOCK_SIZE;
+    if (ctx->cache_size + size > ctx->max_cache_size)
+      return error_set(ctx->err, GA_VALUE_ERROR, "Maximum cache size reached");
+  }
 
   cuda_enter(ctx);
 
-  ctx->err = cuMemAlloc(&ptr, size);
-  if (ctx->err != CUDA_SUCCESS) {
+  err = cuMemAlloc(&ptr, size);
+  if (err != CUDA_SUCCESS) {
     cuda_exit(ctx);
-    return GA_IMPL_ERROR;
+    return error_cuda(ctx->err, "cuMemAlloc", err);
   }
 
   *res = new_gpudata(ctx, ptr, size);
@@ -442,9 +666,11 @@ static int allocate(cuda_context *ctx, gpudata **res, gpudata **prev,
 
   if (*res == NULL) {
     cuMemFree(ptr);
-    return GA_MEMORY_ERROR;
+    return ctx->err->code;
   }
 
+  ctx->cache_size += size;
+
   (*res)->flags |= CUDA_HEAD_ALLOC;
 
   /* Now that the block is allocated, enter it in the freelist */
@@ -478,7 +704,7 @@ static int extract(gpudata *curr, gpudata *prev, size_t size) {
   } else {
     split = new_gpudata(curr->ctx, curr->ptr + size, remaining);
     if (split == NULL)
-      return GA_MEMORY_ERROR;
+      return curr->ctx->err->code;
     /* Make sure the chain keeps going */
     split->next = curr->next;
     curr->next = NULL;
@@ -496,7 +722,6 @@ static int extract(gpudata *curr, gpudata *prev, size_t size) {
   return GA_NO_ERROR;
 }
 
-static void cuda_free(gpudata *);
 static int cuda_write(gpudata *dst, size_t dstoff, const void *src,
                       size_t sz);
 
@@ -504,56 +729,90 @@ static inline size_t roundup(size_t s, size_t m) {
   return ((s + (m - 1)) / m) * m;
 }
 
-static gpudata *cuda_alloc(gpucontext *c, size_t size, void *data, int flags,
-			   int *ret) {
+static gpudata *cuda_alloc(gpucontext *c, size_t size, void *data, int flags) {
   gpudata *res = NULL, *prev = NULL;
   cuda_context *ctx = (cuda_context *)c;
   size_t asize;
-  int err;
 
-  if ((flags & GA_BUFFER_INIT) && data == NULL) FAIL(NULL, GA_VALUE_ERROR);
+  if (size == 0) size = 1;
+
+  if ((flags & GA_BUFFER_INIT) && data == NULL) {
+    error_set(ctx->err, GA_VALUE_ERROR, "Requested buffer initialisation but no data given");
+    return NULL;
+  }
   if ((flags & (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY)) ==
-      (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY)) FAIL(NULL, GA_VALUE_ERROR);
+      (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY)) {
+    error_set(ctx->err, GA_VALUE_ERROR, "Invalid flags combinaison WRITE_ONLY and READ_ONLY");
+    return NULL;
+  }
 
   /* TODO: figure out how to make this work */
-  if (flags & GA_BUFFER_HOST) FAIL(NULL, GA_DEVSUP_ERROR);
+  if (flags & GA_BUFFER_HOST) {
+    error_set(ctx->err, GA_DEVSUP_ERROR, "Host mapped allocations are not supported yet");
+    return NULL;
+  }
 
   /* We don't want to manage really small allocations so we round up
    * to a multiple of FRAG_SIZE.  This also ensures that if we split a
    * block, the next block starts properly aligned for any data type.
    */
-  if (!(ctx->flags & GA_CTX_DISABLE_ALLOCATION_CACHE)) {
+  if (ctx->max_cache_size != 0) {
     asize = roundup(size, FRAG_SIZE);
     find_best(ctx, &res, &prev, asize);
   } else {
     asize = size;
   }
 
-  if (res == NULL) {
-    err = allocate(ctx, &res, &prev, asize);
-    if (err != GA_NO_ERROR)
-      FAIL(NULL, err);
-  }
+  if (res == NULL && allocate(ctx, &res, &prev, asize) != GA_NO_ERROR)
+    return NULL;
+
+  if (extract(res, prev, asize) != GA_NO_ERROR)
+    return NULL;
 
-  err = extract(res, prev, asize);
-  if (err != GA_NO_ERROR)
-    FAIL(NULL, err);
   /* It's out of the freelist, so add a ref */
   res->ctx->refcnt++;
   /* We consider this buffer allocated and ready to go */
   res->refcnt = 1;
 
   if (flags & GA_BUFFER_INIT) {
-    err = cuda_write(res, 0, data, size);
-    if (err != GA_NO_ERROR) {
+    if (cuda_write(res, 0, data, size) != GA_NO_ERROR) {
       cuda_free(res);
-      FAIL(NULL, err);
+      return NULL;
     }
   }
 
   return res;
 }
 
+int cuda_get_ipc_handle(gpudata *d, GpuArrayIpcMemHandle *h) {
+  ASSERT_BUF(d);
+  cuda_enter(d->ctx);
+  CUDA_EXIT_ON_ERROR(d->ctx,
+                     cuIpcGetMemHandle((CUipcMemHandle *)h, d->ptr));
+  cuda_exit(d->ctx);
+  return GA_NO_ERROR;
+}
+
+gpudata *cuda_open_ipc_handle(gpucontext *c, GpuArrayIpcMemHandle *h, size_t sz) {
+  CUdeviceptr p;
+  cuda_context *ctx = (cuda_context *)c;
+  gpudata *d = NULL;
+  CUresult err;
+  /* Map the IPC handle h into ctx and wrap it in a buffer; NULL on failure. */
+  cuda_enter(ctx);
+  err = cuIpcOpenMemHandle(&p, *((CUipcMemHandle *)h),
+                           CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
+  if (err != CUDA_SUCCESS) {
+    cuda_exit(ctx);
+    error_cuda(ctx->err, "cuIpcOpenMemHandle", err);
+    return NULL;
+  }
+  d = cuda_make_buf(ctx, p, sz);
+  cuda_exit(ctx);  /* was missing: the success path left the context entered */
+  if (d != NULL) d->flags |= CUDA_IPC_MEMORY;
+  return d;
+}
+
 static void cuda_retain(gpudata *d) {
   ASSERT_BUF(d);
   d->refcnt++;
@@ -579,7 +838,10 @@ static void cuda_free(gpudata *d) {
     if (d->flags & DONTFREE) {
       /* This is the path for "external" buffers */
       deallocate(d);
-    } else if (ctx->flags & GA_CTX_DISABLE_ALLOCATION_CACHE) {
+    } else if (d->flags & CUDA_IPC_MEMORY) {
+      cuIpcCloseMemHandle(d->ptr);
+      deallocate(d);
+    } else if (ctx->max_cache_size == 0) {
       /* Just free the pointer */
       cuMemFree(d->ptr);
       deallocate(d);
@@ -611,8 +873,8 @@ static void cuda_free(gpudata *d) {
           d->ptr + d->sz == next->ptr) {
         d->sz = d->sz + next->sz;
         d->next = next->next;
-        cuda_wait(next, CUDA_WAIT_ALL);
-        cuda_record(d, CUDA_WAIT_ALL);
+        cuda_waits(next, CUDA_WAIT_ALL, d->ls);
+        cuda_records(d, CUDA_WAIT_ALL, d->ls);
         deallocate(next);
       } else {
         d->next = next;
@@ -626,7 +888,7 @@ static void cuda_free(gpudata *d) {
   }
 }
 
-static int cuda_share(gpudata *a, gpudata *b, int *ret) {
+static int cuda_share(gpudata *a, gpudata *b) {
   ASSERT_BUF(a);
   ASSERT_BUF(b);
   return (a->ctx == b->ctx && a->sz != 0 && b->sz != 0 &&
@@ -689,12 +951,15 @@ static int cuda_move(gpudata *dst, size_t dstoff, gpudata *src,
     int res = GA_NO_ERROR;
     ASSERT_BUF(dst);
     ASSERT_BUF(src);
-    if (src->ctx != dst->ctx) return GA_VALUE_ERROR;
+    if (src->ctx != dst->ctx) return error_set(ctx->err, GA_VALUE_ERROR,
+                                               "Cannot move between contexts");
 
     if (sz == 0) return GA_NO_ERROR;
 
-    if ((dst->sz - dstoff) < sz || (src->sz - srcoff) < sz)
-        return GA_VALUE_ERROR;
+    if ((dst->sz - dstoff) < sz)
+      return error_set(ctx->err, GA_VALUE_ERROR, "Destination is smaller than requested transfer size");
+    if ((src->sz - srcoff) < sz)
+      return error_set(ctx->err, GA_VALUE_ERROR, "Source is smaller than requested transfer size");
 
     cuda_enter(ctx);
 
@@ -723,19 +988,17 @@ static int cuda_read(void *dst, gpudata *src, size_t srcoff, size_t sz) {
     if (sz == 0) return GA_NO_ERROR;
 
     if ((src->sz - srcoff) < sz)
-        return GA_VALUE_ERROR;
+      return error_set(ctx->err, GA_VALUE_ERROR, "source is smaller than the read size");
 
     cuda_enter(ctx);
 
     if (src->flags & CUDA_MAPPED_PTR) {
+
       if (ISSET(ctx->flags, GA_CTX_SINGLE_STREAM))
-        ctx->err = cuStreamSynchronize(ctx->s);
+        CUDA_EXIT_ON_ERROR(ctx, cuStreamSynchronize(ctx->s));
       else
-        ctx->err = cuEventSynchronize(src->wev);
-      if (ctx->err != CUDA_SUCCESS) {
-        cuda_exit(ctx);
-        return GA_IMPL_ERROR;
-      }
+        CUDA_EXIT_ON_ERROR(ctx, cuEventSynchronize(src->wev));
+
       memcpy(dst, (void *)(src->ptr + srcoff), sz);
     } else {
       GA_CUDA_EXIT_ON_ERROR(ctx,
@@ -760,19 +1023,17 @@ static int cuda_write(gpudata *dst, size_t dstoff, const void *src,
     if (sz == 0) return GA_NO_ERROR;
 
     if ((dst->sz - dstoff) < sz)
-        return GA_VALUE_ERROR;
+      return error_set(ctx->err, GA_VALUE_ERROR, "Destination is smaller than the write size");
 
     cuda_enter(ctx);
 
     if (dst->flags & CUDA_MAPPED_PTR) {
+
       if (ISSET(ctx->flags, GA_CTX_SINGLE_STREAM))
-        ctx->err = cuStreamSynchronize(ctx->s);
+        CUDA_EXIT_ON_ERROR(ctx, cuStreamSynchronize(ctx->s));
       else
-        ctx->err = cuEventSynchronize(dst->rev);
-      if (ctx->err != CUDA_SUCCESS) {
-        cuda_exit(ctx);
-        return GA_IMPL_ERROR;
-      }
+        CUDA_EXIT_ON_ERROR(ctx, cuEventSynchronize(dst->rev));
+
       memcpy((void *)(dst->ptr + dstoff), src, sz);
     } else {
       GA_CUDA_EXIT_ON_ERROR(ctx,
@@ -809,260 +1070,226 @@ static int cuda_memset(gpudata *dst, size_t dstoff, int data) {
     return GA_NO_ERROR;
 }
 
-static CUresult get_cc(CUdevice dev, int *maj, int *min) {
-#if CUDA_VERSION < 6500
-  return cuDeviceComputeCapability(maj, min, dev);
-#else
-  CUresult lerr;
-  lerr = cuDeviceGetAttribute(maj,
-                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-                              dev);
-  if (lerr != CUDA_SUCCESS)
-    return lerr;
-  return cuDeviceGetAttribute(min,
-                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
-                              dev);
-#endif
+int get_cc(CUdevice dev, int *maj, int *min, error *e) { /* read dev's compute capability into *maj / *min; failures are recorded in e */
+  CUresult err; /* raw driver status, passed to error_cuda() on failure */
+  err = cuDeviceGetAttribute(maj,
+                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+                             dev);
+  if (err != CUDA_SUCCESS)
+    return error_cuda(e, "cuDeviceGetAttribute", err); /* major query failed; the minor query is not attempted */
+  err = cuDeviceGetAttribute(min,
+                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
+                             dev);
+  if (err != CUDA_SUCCESS)
+    return error_cuda(e, "cuDeviceGetAttribute", err); /* minor query failed */
+  return GA_NO_ERROR; /* both attribute queries succeeded */
 }
 
-static int detect_arch(const char *prefix, char *ret, CUresult *err) {
+static int detect_arch(const char *prefix, char *ret, error *e) { /* write "<prefix><major><minor>" for the current device into ret; failures are recorded in e */
   CUdevice dev;
+  CUresult err; /* raw driver status from cuCtxGetDevice */
   int major, minor;
   int res;
   size_t sz = strlen(prefix) + 3;
-  *err = cuCtxGetDevice(&dev);
-  if (*err != CUDA_SUCCESS) return GA_IMPL_ERROR;
-  *err = get_cc(dev, &major, &minor);
-  if (*err != CUDA_SUCCESS) return GA_IMPL_ERROR;
+  err = cuCtxGetDevice(&dev);
+  if (err != CUDA_SUCCESS) return error_cuda(e, "cuCtxGetDevice", err);
+  GA_CHECK(get_cc(dev, &major, &minor, e));
   res = snprintf(ret, sz, "%s%d%d", prefix, major, minor);
-  if (res == -1 || res > sz) return GA_UNSUPPORTED_ERROR;
+  if (res < 0) return error_sys(e, "snprintf"); /* snprintf reports output errors with any negative value */
+  if (res >= (ssize_t)sz) return error_set(e, GA_UNSUPPORTED_ERROR, /* res excludes the NUL, so res == sz already means truncation */
+                                           "detect_arch: arch id is too large");
   return GA_NO_ERROR;
 }
 
-#ifdef WITH_NVRTC
-
-#include <nvrtc.h>
+static inline int error_nvrtc(error *e, const char *msg, nvrtcResult err) { /* record an NVRTC failure in e and return error_fmt()'s result */
+  return error_fmt(e, GA_IMPL_ERROR, "%s: %s", msg, nvrtcGetErrorString(err)); /* message is "<msg>: <NVRTC error text>" */
+}
 
-static void *call_compiler(const char *src, size_t len, const char *arch_arg,
-                           size_t *bin_len, char **log, size_t *log_len,
-                           int *ret) {
+static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { /* NVRTC-compile src; PTX is appended to ptx, the build log to log */
   nvrtcProgram prog;
-  void *buf = NULL;
   size_t buflen;
-  const char *opts[4] = {
+  const char *heads[1] = {"cluda.h"}; /* include name handed to NVRTC */
+  const char *hsrc[1]; /* source text paired with heads[0] */
+  const char *opts[] = {
     "-arch", ""
+#ifdef DEBUG
     , "-G", "-lineinfo"
+#endif
   };
-  nvrtcResult err, err2;
+  nvrtcResult err;
+
+  opts[1] = ctx->bin_id; /* value for the "-arch" option */
 
-  opts[1] = arch_arg;
+  hsrc[0] = cluda_cuda_h;
+  err = nvrtcCreateProgram(&prog, src->s, NULL, 1, hsrc, heads);
+  if (err != NVRTC_SUCCESS)
+    return error_nvrtc(ctx->err, "nvrtcCreateProgram", err); /* no program was created, nothing to destroy */
 
-  err = nvrtcCreateProgram(&prog, src, NULL, 0, NULL, NULL);
-  if (err != NVRTC_SUCCESS) FAIL(NULL, GA_SYS_ERROR);
+  err = nvrtcCompileProgram(prog, sizeof(opts)/sizeof(char *), opts);
 
-  err = nvrtcCompileProgram(prog,
+  /* Get the log before handling the error */
+  if (nvrtcGetProgramLogSize(prog, &buflen) == NVRTC_SUCCESS) {
+    strb_appends(log, "NVRTC compile log::\n");
+    if (strb_ensure(log, buflen) == 0)
+      if (nvrtcGetProgramLog(prog, log->s+log->l) == NVRTC_SUCCESS)
+        log->l += buflen - 1; // Remove the final NUL
+    strb_appendc(log, '\n');
+  }
+
+  if (err != NVRTC_SUCCESS) {
+    nvrtcDestroyProgram(&prog);
 #ifdef DEBUG
-                            4,
-#else
-                            2,
+    strb_dump(src, stderr);
+    strb_dump(log, stderr);
 #endif
-                            opts);
-  if (log != NULL) {
-    err2 = nvrtcGetProgramLogSize(prog, &buflen);
-    if (err2 != NVRTC_SUCCESS) goto end2;
-    buf = malloc(buflen);
-    if (buf == NULL) goto end2;
-    err2 = nvrtcGetProgramLog(prog, (char *)buf);
-    if (err2 != NVRTC_SUCCESS) goto end2;
-    if (log_len != NULL) *log_len = buflen;
-    *log = (char *)buf;
-    buf = NULL;
+    return error_nvrtc(ctx->err, "nvrtcCompileProgram", err);
   }
-end2:
-  if (err != NVRTC_SUCCESS) goto end;
 
   err = nvrtcGetPTXSize(prog, &buflen);
-  if (err != NVRTC_SUCCESS) goto end;
-
-  buf = malloc(buflen);
-  if (buf == NULL) {
+  if (err != NVRTC_SUCCESS) {
     nvrtcDestroyProgram(&prog);
-    FAIL(NULL, GA_MEMORY_ERROR);
+    return error_nvrtc(ctx->err, "nvrtcGetPTXSize", err);
   }
 
-  err = nvrtcGetPTX(prog, (char *)buf);
-  if (err != NVRTC_SUCCESS) goto end;
-
-  *bin_len = buflen;
-
-end:
-  nvrtcDestroyProgram(&prog);
-  if (err != NVRTC_SUCCESS) {
-    free(buf);
-    FAIL(NULL, GA_SYS_ERROR);
+  if (strb_ensure(ptx, buflen) == 0) { /* if this fails ptx is left as-is and err stays NVRTC_SUCCESS (unchanged behavior) */
+    err = nvrtcGetPTX(prog, ptx->s+ptx->l);
+    if (err == NVRTC_SUCCESS)
+      ptx->l += buflen; /* commit the bytes NVRTC just wrote */
   }
-  return buf;
-}
 
-#else /* WITH_NVRTC */
-
-#include <sys/stat.h>
-
-#include <fcntl.h>
-#include <limits.h>
-
-#ifdef _WIN32
-#include <process.h>
-/* I am really tired of hunting through online docs
- * to find where the define is.  256 seem to be the
- * consensus for the value so there it is.
- */
-#define PATH_MAX 256
-#else
-#include <sys/param.h>
-#include <sys/wait.h>
-#endif
+  nvrtcDestroyProgram(&prog); /* release the program on the success path too, not only on errors */
+  if (err != NVRTC_SUCCESS)
+    return error_nvrtc(ctx->err, "nvrtcGetPTX", err);
+  return GA_NO_ERROR;
+}
 
-#ifdef _MSC_VER
-#include <io.h>
-#define read _read
-#define write _write
-#define close _close
-#define unlink _unlink
-#define fstat _fstat
-#define open _open
+static int make_bin(cuda_context *ctx, const strb *ptx, strb *bin, strb *log) { /* link ptx with the CUDA linker; the image is appended to bin, linker logs to log */
+  char info_log[2048] = ""; /* handed to the linker as CU_JIT_INFO_LOG_BUFFER */
+  char error_log[2048] = ""; /* handed to the linker as CU_JIT_ERROR_LOG_BUFFER */
+  void *out;
+  size_t out_size;
+  CUlinkState st;
+  CUjit_option cujit_opts[] = { /* entries pair up positionally with cujit_opt_vals */
+    CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
+    CU_JIT_INFO_LOG_BUFFER,
+    CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
+    CU_JIT_ERROR_LOG_BUFFER,
+    CU_JIT_LOG_VERBOSE,
+    CU_JIT_GENERATE_DEBUG_INFO,
+    CU_JIT_GENERATE_LINE_INFO,
+  };
+  void *cujit_opt_vals[] = {
+    (void *)sizeof(info_log), info_log,
+    (void *)sizeof(error_log), error_log,
+#ifdef DEBUG
+    (void *)1, (void *)1, (void *)1
 #else
-#include <unistd.h>
+    (void *)0, (void *)0, (void *)0
 #endif
+  };
+  CUresult err;
+  int res = GA_NO_ERROR; /* stays GA_NO_ERROR unless a link step fails */
 
-static const char *TMP_VAR_NAMES[] = {"GPUARRAY_TMPDIR", "TMPDIR", "TMP",
-                                      "TEMP", "USERPROFILE"};
-
+  err = cuLinkCreate(sizeof(cujit_opts)/sizeof(cujit_opts[0]),
+                          cujit_opts, cujit_opt_vals, &st);
+  if (err != CUDA_SUCCESS)
+    return error_cuda(ctx->err, "cuLinkCreate", err); /* no link state exists, so the logs are not appended on this path */
+  err = cuLinkAddData(st, CU_JIT_INPUT_PTX, ptx->s, ptx->l,
+                           "kernel code", 0, NULL, NULL);
+  if (err != CUDA_SUCCESS) {
+    res = error_cuda(ctx->err, "cuLinkAddData", err);
+    goto out;
+  }
+  err = cuLinkComplete(st, &out, &out_size);
+  if (err != CUDA_SUCCESS) {
+    res = error_cuda(ctx->err, "cuLinkComplete", err);
+    goto out;
+  }
+  strb_appendn(bin, out, out_size); /* copy the image out before the link state is destroyed below */
+out:
+  cuLinkDestroy(st); /* shared cleanup for success and for failures after cuLinkCreate */
+  strb_appends(log, "Link info log::\n");
+  strb_appends(log, info_log);
+  strb_appends(log, "\nLink error log::\n");
+  strb_appends(log, error_log);
+  strb_appendc(log, '\n');
+  return res;
+}
 
-static void *call_compiler(const char *src, size_t len, const char *arch_arg,
-                           size_t *bin_len, char **log, size_t *log_len,
-                           int *ret) {
-    char namebuf[PATH_MAX];
-    char outbuf[PATH_MAX];
-    char *tmpdir;
-    struct stat st;
-    ssize_t s;
-#ifndef _WIN32
-    pid_t p;
-#endif
-    unsigned int i;
-    int sys_err;
-    int fd;
-    char *buf;
+static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { /* src -> device binary appended to bin, going through the disk cache when one is configured */
+  strb ptx = STRB_STATIC_INIT; /* intermediate PTX; always cleared before returning */
+  strb *cbin;
+  disk_key k, *pk; /* k: stack lookup key; pk: heap copy handed to the cache */
+  int res; /* status of the compile + link steps */
 
-    for (i = 0; i < sizeof(TMP_VAR_NAMES)/sizeof(TMP_VAR_NAMES[0]); i++) {
-        tmpdir = getenv(TMP_VAR_NAMES[i]);
-        if (tmpdir != NULL) break;
-    }
-    if (tmpdir == NULL) {
-#ifdef _WIN32
-      tmpdir = ".";
-#else
-      tmpdir = "/tmp";
+  memset(&k, 0, sizeof(k));
+  k.version = 0;
+#ifdef DEBUG
+  k.debug = 1;
 #endif
+  k.major = ctx->major;
+  k.minor = ctx->minor;
+  memcpy(k.bin_id, ctx->bin_id, 64);
+  memcpy(&k.src, src, sizeof(strb)); /* shallow copy: k.src borrows src's buffer for the lookup */
+  // Look up the binary in the disk cache
+  if (ctx->disk_cache) {
+    cbin = cache_get(ctx->disk_cache, &k);
+    if (cbin != NULL) {
+      strb_appendb(bin, cbin);
+      return GA_NO_ERROR;
     }
+  }
 
-    strlcpy(namebuf, tmpdir, sizeof(namebuf));
-    strlcat(namebuf, "/gpuarray.cuda.XXXXXXXX", sizeof(namebuf));
-
-    fd = mkstemp(namebuf);
-    if (fd == -1) FAIL(NULL, GA_SYS_ERROR);
-
-    strlcpy(outbuf, namebuf, sizeof(outbuf));
-    strlcat(outbuf, ".cubin", sizeof(outbuf));
-
-    /* Don't want to write the final NUL */
-    s = write(fd, src, len-1);
-    close(fd);
-    /* fd is not non-blocking; should have complete write */
-    if (s == -1) {
-        unlink(namebuf);
-        FAIL(NULL, GA_SYS_ERROR);
-    }
+  res = call_compiler(ctx, src, &ptx, log);
 
-    /* This block executes nvcc on the written-out file */
-#ifdef DEBUG
-#define NVCC_ARGS NVCC_BIN, "-g", "-G", "-arch", arch_arg, "-x", "cu", \
-      "--cubin", namebuf, "-o", outbuf
-#else
-#define NVCC_ARGS NVCC_BIN, "-arch", arch_arg, "-x", "cu", \
-      "--cubin", namebuf, "-o", outbuf
-#endif
-#ifdef _WIN32
-    sys_err = _spawnl(_P_WAIT, NVCC_BIN, NVCC_ARGS, NULL);
-    unlink(namebuf);
-    if (sys_err == -1) FAIL(NULL, GA_SYS_ERROR);
-    if (sys_err != 0) FAIL(NULL, GA_RUN_ERROR);
-#else
-    p = fork();
-    if (p == 0) {
-        execl(NVCC_BIN, NVCC_ARGS, NULL);
-        exit(1);
-    }
-    if (p == -1) {
-        unlink(namebuf);
-        FAIL(NULL, GA_SYS_ERROR);
-    }
+  if (res == GA_NO_ERROR) res = make_bin(ctx, &ptx, bin, log);
 
-    /* We need to wait until after the waitpid for the unlink because otherwise
-       we might delete the input file before nvcc is finished with it. */
-    if (waitpid(p, &sys_err, 0) == -1) {
-        unlink(namebuf);
-        unlink(outbuf);
-        FAIL(NULL, GA_SYS_ERROR);
-    } else {
-#ifdef DEBUG
-      /* Only cleanup if GPUARRAY_NOCLEANUP is not set */
-      if (getenv("GPUARRAY_NOCLEANUP") == NULL)
-#endif
-	unlink(namebuf);
-    }
+  strb_clear(&ptx); /* runs on the failure paths too, so the PTX buffer is never leaked */
+  if (res != GA_NO_ERROR) return res;
 
-    if (WIFSIGNALED(sys_err) || WEXITSTATUS(sys_err) != 0) {
-        unlink(outbuf);
-        FAIL(NULL, GA_RUN_ERROR);
+  if (ctx->disk_cache) { /* best effort: failures below are reported on stderr and the build still succeeds */
+    pk = calloc(sizeof(disk_key), 1);
+    if (pk == NULL) {
+      error_sys(ctx->err, "calloc");
+      fprintf(stderr, "Error adding kernel to disk cache: %s\n",
+              ctx->err->msg);
+      return GA_NO_ERROR;
     }
-#endif
-
-    fd = open(outbuf, O_RDONLY);
-    if (fd == -1) {
-        unlink(outbuf);
-        FAIL(NULL, GA_SYS_ERROR);
+    memcpy(pk, &k, DISK_KEY_MM);
+    strb_appendb(&pk->src, src); /* deep copy of the source for the cached key */
+    if (strb_error(&pk->src)) {
+      error_sys(ctx->err, "strb_appendb");
+      fprintf(stderr, "Error adding kernel to disk cache %s\n",
+              ctx->err->msg);
+      disk_free((cache_key_t)pk);
+      return GA_NO_ERROR;
     }
-
-    if (fstat(fd, &st) == -1) {
-        close(fd);
-        unlink(outbuf);
-        FAIL(NULL, GA_SYS_ERROR);
+    cbin = strb_alloc(bin->l);
+    if (cbin == NULL) {
+      error_sys(ctx->err, "strb_alloc");
+      fprintf(stderr, "Error adding kernel to disk cache: %s\n",
+              ctx->err->msg);
+      disk_free((cache_key_t)pk);
+      return GA_NO_ERROR;
     }
-
-    buf = malloc((size_t)st.st_size);
-    if (buf == NULL) {
-        close(fd);
-        unlink(outbuf);
-        FAIL(NULL, GA_SYS_ERROR);
+    strb_appendb(cbin, bin);
+    if (strb_error(cbin)) {
+      error_sys(ctx->err, "strb_appendb");
+      fprintf(stderr, "Error adding kernel to disk cache %s\n",
+              ctx->err->msg);
+      disk_free((cache_key_t)pk);
+      strb_free(cbin);
+      return GA_NO_ERROR;
     }
-
-    s = read(fd, buf, (size_t)st.st_size);
-    close(fd);
-    unlink(outbuf);
-    /* fd is blocking; should have complete read */
-    if (s == -1) {
-      free(buf);
-      FAIL(NULL, GA_SYS_ERROR);
+    if (cache_add(ctx->disk_cache, pk, cbin)) {
+      // TODO use better error messages
+      fprintf(stderr, "Error adding kernel to disk cache\n");
     }
+  }
 
-    *bin_len = (size_t)st.st_size;
-    return buf;
+  return GA_NO_ERROR;
 }
 
-#endif /* WITH_NVRTC */
-
 static void _cuda_freekernel(gpukernel *k) {
   k->refcnt--;
   if (k->refcnt == 0) {
@@ -1080,190 +1307,178 @@ static void _cuda_freekernel(gpukernel *k) {
   }
 }
 
-static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count,
-                                 const char **strings, const size_t *lengths,
-                                 const char *fname, unsigned int argcount,
-                                 const int *types, int flags, int *ret,
-                                 char **err_str) {
+static int cuda_newkernel(gpukernel **k, gpucontext *c, unsigned int count, /* build or fetch the kernel fname from the given source strings; result in *k */
+                          const char **strings, const size_t *lengths,
+                          const char *fname, unsigned int argcount,
+                          const int *types, int flags, char **err_str) { /* err_str, when non-NULL, receives the source listing + log if compilation fails */
     cuda_context *ctx = (cuda_context *)c;
-    strb sb = STRB_STATIC_INIT;
-    strb *psb;
-    char *bin, *log = NULL;
+    strb src = STRB_STATIC_INIT; /* concatenated, NUL-terminated kernel source */
+    strb bin = STRB_STATIC_INIT; /* compiled binary; its buffer is handed to res->bin */
+    strb log = STRB_STATIC_INIT; /* compiler and linker output */
     gpukernel *res;
-    size_t bin_len = 0, log_len = 0;
+    kernel_key k_key, *p_key; /* k_key: stack lookup key; p_key: heap copy stored in the kernel cache */
     CUdevice dev;
+    CUresult err;
     unsigned int i;
     int major, minor;
 
-    if (count == 0) FAIL(NULL, GA_VALUE_ERROR);
+    if (count == 0)
+      return error_set(ctx->err, GA_VALUE_ERROR, "String count is 0");
 
     if (flags & GA_USE_OPENCL)
-      FAIL(NULL, GA_DEVSUP_ERROR);
-
-    if (flags & GA_USE_BINARY) {
-      // GA_USE_BINARY is exclusive
-      if (flags & ~GA_USE_BINARY)
-        FAIL(NULL, GA_INVALID_ERROR);
-      // We need the length for binary data and there is only one blob.
-      if (count != 1 || lengths == NULL || lengths[0] == 0)
-        FAIL(NULL, GA_VALUE_ERROR);
-    }
+      return error_set(ctx->err, GA_DEVSUP_ERROR, "OpenCL kernel not supported on cuda devices");
 
     cuda_enter(ctx);
 
-    ctx->err = cuCtxGetDevice(&dev);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      FAIL(NULL, GA_IMPL_ERROR);
-    }
-    ctx->err = cuDeviceComputeCapability(&major, &minor, dev);
-    if (ctx->err != CUDA_SUCCESS) {
+    err = cuCtxGetDevice(&dev);
+    if (err != CUDA_SUCCESS) {
       cuda_exit(ctx);
-      FAIL(NULL, GA_IMPL_ERROR);
+      return error_cuda(ctx->err, "cuCtxGetDevice", err);
     }
 
-    // GA_USE_CLUDA is done later
+    if (get_cc(dev, &major, &minor, ctx->err) != GA_NO_ERROR) {
+      cuda_exit(ctx); /* balance cuda_enter() on this error path as well */
+      return ctx->err->code;
+    }
+
     // GA_USE_SMALL will always work
+    // GA_USE_HALF should always work
     if (flags & GA_USE_DOUBLE) {
       if (major < 1 || (major == 1 && minor < 3)) {
         cuda_exit(ctx);
-        FAIL(NULL, GA_DEVSUP_ERROR);
+        return error_set(ctx->err, GA_DEVSUP_ERROR, "Requested double support and current device doesn't support them");
       }
     }
     if (flags & GA_USE_COMPLEX) {
       // just for now since it is most likely broken
       cuda_exit(ctx);
-      FAIL(NULL, GA_DEVSUP_ERROR);
+      return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Complex support is not there yet.");
     }
-    // GA_USE_HALF should always work
 
-    if (flags & GA_USE_BINARY) {
-      bin = memdup(strings[0], lengths[0]);
-      bin_len = lengths[0];
-      if (bin == NULL) {
-        cuda_exit(ctx);
-        FAIL(NULL, GA_MEMORY_ERROR);
-      }
+    if (lengths == NULL) { /* no lengths given: every string is NUL-terminated */
+      for (i = 0; i < count; i++)
+        strb_appends(&src, strings[i]);
     } else {
-      if (flags & GA_USE_CLUDA) {
-        strb_appends(&sb, CUDA_PREAMBLE);
+      for (i = 0; i < count; i++) {
+        if (lengths[i] == 0) /* a zero length also means "NUL-terminated" */
+          strb_appends(&src, strings[i]);
+        else
+          strb_appendn(&src, strings[i], lengths[i]);
       }
+    }
 
-      if (lengths == NULL) {
-        for (i = 0; i < count; i++)
-        strb_appends(&sb, strings[i]);
-      } else {
-        for (i = 0; i < count; i++) {
-          if (lengths[i] == 0)
-            strb_appends(&sb, strings[i]);
-          else
-            strb_appendn(&sb, strings[i], lengths[i]);
-        }
-      }
+    strb_append0(&src);
 
-      strb_append0(&sb);
+    if (strb_error(&src)) {
+      strb_clear(&src);
+      cuda_exit(ctx);
+      return error_sys(ctx->err, "strb");
+    }
 
-      if (strb_error(&sb)) {
-        strb_clear(&sb);
-        cuda_exit(ctx);
-        FAIL(NULL, GA_MEMORY_ERROR);
-      }
+    k_key.fname = fname;
+    k_key.src = src; /* shallow copy: the key shares src's buffer */
 
-      res = (gpukernel *)cache_get(ctx->kernel_cache, &sb);
-      if (res != NULL) {
-        res->refcnt++;
-        strb_clear(&sb);
-        return res;
-      }
-      bin = call_compiler(sb.s, sb.l, ctx->bin_id, &bin_len,
-                          &log, &log_len, ret);
-      if (bin == NULL) {
-        if (err_str != NULL) {
-          strb debug_msg = STRB_STATIC_INIT;
-
-          // We're substituting debug_msg for a string with this first line:
-          strb_appends(&debug_msg, "CUDA kernel build failure ::\n");
-
-          /* Delete the final NUL */
-          sb.l--;
-          gpukernel_source_with_line_numbers(1, (const char **)&sb.s,
-                                             &sb.l, &debug_msg);
-
-          if (log != NULL) {
-            strb_appends(&debug_msg, "\nCompiler log:\n");
-            strb_appendn(&debug_msg, log, log_len);
-            free(log);
-          }
-          *err_str = strb_cstr(&debug_msg);
-          // *err_str will be free()d by the caller (see docs in kernel.h)
-        }
-        strb_clear(&sb);
-        cuda_exit(ctx);
-        return NULL;
+    res = (gpukernel *)cache_get(ctx->kernel_cache, &k_key);
+    if (res != NULL) {
+      res->refcnt++;
+      strb_clear(&src);
+      cuda_exit(ctx); /* the cache-hit path must leave the context too */
+      *k = res;
+      return GA_NO_ERROR;
+    }
+
+    if (compile(ctx, &src, &bin, &log) != GA_NO_ERROR) {
+      if (err_str != NULL) {
+        strb debug_msg = STRB_STATIC_INIT;
+        strb_appends(&debug_msg, "CUDA kernel compile failure ::\n");
+        src.l--; /* drop the final NUL before listing the source */
+        gpukernel_source_with_line_numbers(1, (const char **)&src.s,
+                                           &src.l, &debug_msg);
+        strb_appends(&debug_msg, "\nCompile log:\n");
+        strb_appendb(&debug_msg, &log);
+        *err_str = strb_cstr(&debug_msg);
       }
+      strb_clear(&src);
+      strb_clear(&bin);
+      strb_clear(&log);
+      cuda_exit(ctx);
+      return ctx->err->code;
+    }
+    strb_clear(&log);
+    if (strb_error(&bin)) {
+      strb_clear(&src);
+      strb_clear(&bin);
+      cuda_exit(ctx);
+      return error_sys(ctx->err, "strb");
     }
 
     res = calloc(1, sizeof(*res));
     if (res == NULL) {
-      free(bin);
-      strb_clear(&sb);
+      strb_clear(&src);
+      strb_clear(&bin);
       cuda_exit(ctx);
-      FAIL(NULL, GA_SYS_ERROR);
+      return error_sys(ctx->err, "calloc");
     }
 
-    res->bin_sz = bin_len;
-    res->bin = bin;
-
+    /* Don't clear bin after this */
+    res->bin_sz = bin.l;
+    res->bin = bin.s;
     res->refcnt = 1;
     res->argcount = argcount;
     res->types = calloc(argcount, sizeof(int));
     if (res->types == NULL) {
       _cuda_freekernel(res);
-      strb_clear(&sb);
+      strb_clear(&src);
       cuda_exit(ctx);
-      FAIL(NULL, GA_MEMORY_ERROR);
+      return error_sys(ctx->err, "calloc");
     }
     memcpy(res->types, types, argcount*sizeof(int));
     res->args = calloc(argcount, sizeof(void *));
     if (res->args == NULL) {
       _cuda_freekernel(res);
-      strb_clear(&sb);
+      strb_clear(&src);
       cuda_exit(ctx);
-      FAIL(NULL, GA_MEMORY_ERROR);
+      return error_sys(ctx->err, "calloc");
     }
 
-    ctx->err = cuModuleLoadData(&res->m, bin);
-
-    if (ctx->err != CUDA_SUCCESS) {
+    err = cuModuleLoadData(&res->m, bin.s);
+    if (err != CUDA_SUCCESS) { /* the error is recorded once, by the return statement below */
       _cuda_freekernel(res);
-      strb_clear(&sb);
+      strb_clear(&src);
       cuda_exit(ctx);
-      FAIL(NULL, GA_IMPL_ERROR);
+      return error_cuda(ctx->err, "cuModuleLoadData", err);
     }
 
-    ctx->err = cuModuleGetFunction(&res->k, res->m, fname);
-    if (ctx->err != CUDA_SUCCESS) {
+    err = cuModuleGetFunction(&res->k, res->m, fname);
+    if (err != CUDA_SUCCESS) {
       _cuda_freekernel(res);
-      strb_clear(&sb);
+      strb_clear(&src);
       cuda_exit(ctx);
-      FAIL(NULL, GA_IMPL_ERROR);
+      return error_cuda(ctx->err, "cuModuleGetFunction", err);
     }
 
     res->ctx = ctx;
     ctx->refcnt++;
     cuda_exit(ctx);
     TAG_KER(res);
-    psb = memdup(&sb, sizeof(strb));
-    if (psb == NULL) {
-      cuda_freekernel(res);
-      strb_clear(&sb);
-      FAIL(NULL, GA_MEMORY_ERROR);
+    p_key = memdup(&k_key, sizeof(kernel_key)); /* caching is best effort: on failure the kernel is still returned */
+    if (p_key != NULL) {
+      p_key->fname = strdup(fname);
+      if (p_key->fname != NULL) {
+        /* One of the refs is for the cache */
+        res->refcnt++;
+        /* If this fails, it will free the key and remove a ref from the
+           kernel. */
+        cache_add(ctx->kernel_cache, p_key, res);
+      } else {
+        free(p_key);
+        strb_clear(&src);
+      }
+    } else {
+      strb_clear(&src);
     }
-    /* One of the refs is for the cache */
-    res->refcnt++;
-    /* If this fails, it will free the key and remove a ref from the kernel. */
-    cache_add(ctx->kernel_cache, psb, res);
-    return res;
+    *k = res;
+    return GA_NO_ERROR;
 }
 
 static void cuda_retainkernel(gpukernel *k) {
@@ -1277,14 +1492,15 @@ static void cuda_freekernel(gpukernel *k) {
 }
 
 static int cuda_kernelsetarg(gpukernel *k, unsigned int i, void *arg) {
+  ASSERT_KER(k); /* NOTE(review): presumably checks the tag written by TAG_KER - confirm against the macro */
   if (i >= k->argcount)
-    return GA_VALUE_ERROR;
+    return error_set(k->ctx->err, GA_VALUE_ERROR, "index is beyond the last argument"); /* valid indices are 0 .. argcount-1; the error goes to the kernel's context */
   k->args[i] = arg;
   return GA_NO_ERROR;
 }
 
 static int cuda_callkernel(gpukernel *k, unsigned int n,
-                           const size_t *bs, const size_t *gs,
+                           const size_t *gs, const size_t *ls,
                            size_t shared, void **args) {
     cuda_context *ctx = k->ctx;
     unsigned int i;
@@ -1305,24 +1521,22 @@ static int cuda_callkernel(gpukernel *k, unsigned int n,
 
     switch (n) {
     case 1:
-      ctx->err = cuLaunchKernel(k->k, gs[0], 1, 1, bs[0], 1, 1, shared,
-                                ctx->s, args, NULL);
+      CUDA_EXIT_ON_ERROR(ctx, cuLaunchKernel(k->k, gs[0], 1, 1, ls[0], 1, 1,
+                                             shared, ctx->s, args, NULL));
       break;
     case 2:
-      ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, bs[0], bs[1], 1, shared,
-                                ctx->s, args, NULL);
+      CUDA_EXIT_ON_ERROR(ctx, cuLaunchKernel(k->k, gs[0], gs[1], 1,
+                                             ls[0], ls[1], 1, shared,
+                                             ctx->s, args, NULL));
       break;
     case 3:
-      ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], gs[2], bs[0], bs[1], bs[2],
-                                shared, ctx->s, args, NULL);
+      CUDA_EXIT_ON_ERROR(ctx, cuLaunchKernel(k->k, gs[0], gs[1], gs[2],
+                                             ls[0], ls[1], ls[2], shared,
+                                             ctx->s, args, NULL));
       break;
     default:
       cuda_exit(ctx);
-      return GA_VALUE_ERROR;
-    }
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
+      return error_set(ctx->err, GA_VALUE_ERROR, "Call with more than 3 dimensions");
     }
 
     for (i = 0; i < k->argcount; i++) {
@@ -1337,28 +1551,18 @@ static int cuda_callkernel(gpukernel *k, unsigned int n,
     return GA_NO_ERROR;
 }
 
-static int cuda_kernelbin(gpukernel *k, size_t *sz, void **obj) {
-  void *res = malloc(k->bin_sz);
-  if (res == NULL)
-    return GA_MEMORY_ERROR;
-  memcpy(res, k->bin, k->bin_sz);
-  *sz = k->bin_sz;
-  *obj = res;
-  return GA_NO_ERROR;
-}
-
 static int cuda_sync(gpudata *b) {
   cuda_context *ctx = (cuda_context *)b->ctx;
   int err = GA_NO_ERROR;
 
   ASSERT_BUF(b);
   cuda_enter(ctx);
-  ctx->err = cuEventSynchronize(b->wev);
-  if (ctx->err != CUDA_SUCCESS)
-    err = GA_IMPL_ERROR;
-  ctx->err = cuEventSynchronize(b->rev);
-  if (ctx->err != CUDA_SUCCESS)
-    err = GA_IMPL_ERROR;
+  if (ctx->flags & GA_CTX_SINGLE_STREAM) { /* single-stream contexts wait on the context stream instead of the buffer's events */
+    CUDA_EXIT_ON_ERROR(ctx, cuStreamSynchronize(ctx->s)); /* NOTE(review): macro presumably calls cuda_exit() and returns on failure - confirm */
+  } else {
+    CUDA_EXIT_ON_ERROR(ctx, cuEventSynchronize(b->wev)); /* wait on the buffer's wev event */
+    CUDA_EXIT_ON_ERROR(ctx, cuEventSynchronize(b->rev)); /* then on its rev event */
+  }
   cuda_exit(ctx);
   return err;
 }
@@ -1403,13 +1607,6 @@ static int cuda_transfer(gpudata *dst, size_t dstoff,
   return GA_NO_ERROR;
 }
 
-#ifdef WITH_CUDA_CUBLAS
-extern gpuarray_blas_ops cublas_ops;
-#endif  // WITH_CUDA_CUBLAS
-#ifdef WITH_CUDA_NCCL
-extern gpuarray_comm_ops nccl_ops;
-#endif  // WITH_CUDA_NCCL
-
 static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
                          void *res) {
   cuda_context *ctx = NULL;
@@ -1426,129 +1623,56 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
 
   if (prop_id < GA_BUFFER_PROP_START) {
     if (ctx == NULL)
-      return GA_VALUE_ERROR;
+      return error_set(global_err, GA_VALUE_ERROR,
+                       "Attempting to get a context property with no context");
   } else if (prop_id < GA_KERNEL_PROP_START) {
     if (buf == NULL)
-      return GA_VALUE_ERROR;
+      return error_set(ctx ? ctx->err : global_err, GA_VALUE_ERROR,
+                       "Attempting to get a buffer property with no buffer");
   } else {
     if (k == NULL)
-      return GA_VALUE_ERROR;
+      return error_set(ctx ? ctx->err : global_err, GA_VALUE_ERROR,
+                       "Attempting to get a kernel property with no kernel");
   }
 
+#define GETPROP(prop, type) do {                                   \
+    cuda_enter(ctx);                                               \
+    CUDA_EXIT_ON_ERROR(ctx, cuCtxGetDevice(&id));                  \
+    CUDA_EXIT_ON_ERROR(ctx, cuDeviceGetAttribute(&i, (prop), id)); \
+    cuda_exit(ctx);                                                \
+    *((type *)res) = i;                                            \
+  } while(0)
+
   switch (prop_id) {
-    char *s;
     CUdevice id;
     int i;
     size_t sz;
 
   case GA_CTX_PROP_DEVNAME:
     cuda_enter(ctx);
-    ctx->err = cuCtxGetDevice(&id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    /* 256 is what the CUDA API uses so it's good enough for me */
-    s = malloc(256);
-    if (s == NULL) {
-      cuda_exit(ctx);
-      return GA_MEMORY_ERROR;
-    }
-    ctx->err = cuDeviceGetName(s, 256, id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    *((char **)res) = s;
+    CUDA_EXIT_ON_ERROR(ctx, cuCtxGetDevice(&id));
+    CUDA_EXIT_ON_ERROR(ctx, cuDeviceGetName((char *)res, 256, id));
     cuda_exit(ctx);
     return GA_NO_ERROR;
 
-  case GA_CTX_PROP_MAXLSIZE:
+  case GA_CTX_PROP_UNIQUE_ID:
     cuda_enter(ctx);
-    ctx->err = cuCtxGetDevice(&id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
-                                    id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    *((size_t *)res) = i;
-    cuda_exit(ctx);
-    return GA_NO_ERROR;
-
-  case GA_CTX_PROP_LMEMSIZE:
-    cuda_enter(ctx);
-    ctx->err = cuCtxGetDevice(&id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
-                                    id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    *((size_t *)res) = i;
+    CUDA_EXIT_ON_ERROR(ctx, cuCtxGetDevice(&id));
+    CUDA_EXIT_ON_ERROR(ctx, cuDeviceGetPCIBusId((char *)res, 13, id));
     cuda_exit(ctx);
     return GA_NO_ERROR;
 
-  case GA_CTX_PROP_NUMPROCS:
-    cuda_enter(ctx);
-    ctx->err = cuCtxGetDevice(&id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    ctx->err = cuDeviceGetAttribute(&i,
-                                    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-                                    id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    *((unsigned int *)res) = i;
-    cuda_exit(ctx);
+  case GA_CTX_PROP_LARGEST_MEMBLOCK:
+    *((size_t *)res) = largest_size(ctx);
     return GA_NO_ERROR;
 
-  case GA_CTX_PROP_MAXGSIZE:
-    cuda_enter(ctx);
-    ctx->err = cuCtxGetDevice(&id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
-                                    id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    *((size_t *)res) = i;
-    cuda_exit(ctx);
+  case GA_CTX_PROP_LMEMSIZE:
+    GETPROP(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, size_t);
     return GA_NO_ERROR;
 
-  case GA_CTX_PROP_BLAS_OPS:
-#ifdef WITH_CUDA_CUBLAS
-    *((gpuarray_blas_ops **)res) = &cublas_ops;
+  case GA_CTX_PROP_NUMPROCS:
+    GETPROP(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, unsigned int);
     return GA_NO_ERROR;
-#else
-    *((void **)res) = NULL;
-    return GA_DEVSUP_ERROR;
-#endif  // WITH_CUDA_CUBLAS
-
-  case GA_CTX_PROP_COMM_OPS:
-#ifdef WITH_CUDA_NCCL
-      *((gpuarray_comm_ops**)res) = &nccl_ops;
-      return GA_NO_ERROR;
-#else
-      *((void**) res) = NULL;
-      return GA_DEVSUP_ERROR;
-#endif  // WITH_CUDA_NCCL
 
   case GA_CTX_PROP_BIN_ID:
     *((const char **)res) = ctx->bin_id;
@@ -1560,15 +1684,15 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
 
   case GA_CTX_PROP_TOTAL_GMEM:
     cuda_enter(ctx);
-    ctx->err = cuMemGetInfo(&sz, (size_t *)res);
+    CUDA_EXIT_ON_ERROR(ctx, cuMemGetInfo(&sz, (size_t *)res));
     cuda_exit(ctx);
-    return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR;
+    return GA_NO_ERROR;
 
   case GA_CTX_PROP_FREE_GMEM:
     cuda_enter(ctx);
-    ctx->err = cuMemGetInfo((size_t *)res, &sz);
+    CUDA_EXIT_ON_ERROR(ctx, cuMemGetInfo((size_t *)res, &sz));
     cuda_exit(ctx);
-    return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR;
+    return GA_NO_ERROR;
 
   case GA_CTX_PROP_NATIVE_FLOAT16:
     /* We claim that nobody supports this for now */
@@ -1576,99 +1700,27 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
     return CUDA_SUCCESS;
 
   case GA_CTX_PROP_MAXGSIZE0:
-    cuda_enter(ctx);
-    ctx->err = cuCtxGetDevice(&id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    cuda_exit(ctx);
-    *((size_t *)res) = i;
+    GETPROP(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, size_t);
     return GA_NO_ERROR;
 
   case GA_CTX_PROP_MAXGSIZE1:
-    cuda_enter(ctx);
-    ctx->err = cuCtxGetDevice(&id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    cuda_exit(ctx);
-    *((size_t *)res) = i;
+    GETPROP(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, size_t);
     return GA_NO_ERROR;
 
   case GA_CTX_PROP_MAXGSIZE2:
-    cuda_enter(ctx);
-    ctx->err = cuCtxGetDevice(&id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    cuda_exit(ctx);
-    *((size_t *)res) = i;
+    GETPROP(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, size_t);
     return GA_NO_ERROR;
 
   case GA_CTX_PROP_MAXLSIZE0:
-    cuda_enter(ctx);
-    ctx->err = cuCtxGetDevice(&id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    cuda_exit(ctx);
-    *((size_t *)res) = i;
+    GETPROP(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, size_t);
     return GA_NO_ERROR;
 
   case GA_CTX_PROP_MAXLSIZE1:
-    cuda_enter(ctx);
-    ctx->err = cuCtxGetDevice(&id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    cuda_exit(ctx);
-    *((size_t *)res) = i;
+    GETPROP(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, size_t);
     return GA_NO_ERROR;
 
   case GA_CTX_PROP_MAXLSIZE2:
-    cuda_enter(ctx);
-    ctx->err = cuCtxGetDevice(&id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    cuda_exit(ctx);
-    *((size_t *)res) = i;
+    GETPROP(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, size_t);
     return GA_NO_ERROR;
 
   case GA_BUFFER_PROP_REFCNT:
@@ -1686,27 +1738,15 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
 
   case GA_KERNEL_PROP_MAXLSIZE:
     cuda_enter(ctx);
-    ctx->err = cuFuncGetAttribute(&i,
-                                  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
-                                  k->k);
+    CUDA_EXIT_ON_ERROR(ctx, cuFuncGetAttribute(&i, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, k->k));
     cuda_exit(ctx);
-    if (ctx->err != CUDA_SUCCESS)
-      return GA_IMPL_ERROR;
     *((size_t *)res) = i;
     return GA_NO_ERROR;
 
   case GA_KERNEL_PROP_PREFLSIZE:
     cuda_enter(ctx);
-    ctx->err = cuCtxGetDevice(&id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
-    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_WARP_SIZE, id);
-    if (ctx->err != CUDA_SUCCESS) {
-      cuda_exit(ctx);
-      return GA_IMPL_ERROR;
-    }
+    CUDA_EXIT_ON_ERROR(ctx, cuCtxGetDevice(&id));
+    CUDA_EXIT_ON_ERROR(ctx, cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_WARP_SIZE, id));
     cuda_exit(ctx);
     *((size_t *)res) = i;
     return GA_NO_ERROR;
@@ -1720,7 +1760,7 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
     return GA_NO_ERROR;
 
   default:
-    return GA_INVALID_ERROR;
+    return error_fmt(ctx->err, GA_INVALID_ERROR, "Invalid property: %d", prop_id);
   }
 }
 
@@ -1728,13 +1768,12 @@ static const char *cuda_error(gpucontext *c) {
   cuda_context *ctx = (cuda_context *)c;
-  const char *errstr = NULL;
+  /* Return the message held by the context's error record, or by the
+     global error record when no context is supplied. */
   if (ctx == NULL)
-    cuGetErrorString(err, &errstr);
+    return global_err->msg;
   else
-    cuGetErrorString(ctx->err, &errstr);
-  return errstr;
+    return ctx->err->msg;
 }
 
-GPUARRAY_LOCAL
 const gpuarray_buffer_ops cuda_ops = {cuda_get_platform_count,
                                       cuda_get_device_count,
                                       cuda_init,
@@ -1752,7 +1791,6 @@ const gpuarray_buffer_ops cuda_ops = {cuda_get_platform_count,
                                       cuda_freekernel,
                                       cuda_kernelsetarg,
                                       cuda_callkernel,
-                                      cuda_kernelbin,
                                       cuda_sync,
                                       cuda_transfer,
                                       cuda_property,
diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c
index d073880112..75da423c3e 100644
--- a/src/gpuarray_buffer_opencl.c
+++ b/src/gpuarray_buffer_opencl.c
@@ -12,38 +12,44 @@
 #include <string.h>
 #include <limits.h>
 
-#ifdef _MSC_VER
-#define strdup _strdup
-#endif
+#include "loaders/libclblas.h"
+#include "loaders/libclblast.h"
+
+#include "cluda_opencl.h.c"
 
 #define _unused(x) ((void)x)
 #define SSIZE_MIN (-(SSIZE_MAX-1))
 
-static cl_int err;
-
-#define FAIL(v, e) { if (ret) *ret = e; return v; }
-#define CHKFAIL(v) if (err != CL_SUCCESS) FAIL(v, GA_IMPL_ERROR)
-
+extern gpuarray_blas_ops clblas_ops;
+extern gpuarray_blas_ops clblast_ops;
 
-GPUARRAY_LOCAL const gpuarray_buffer_ops opencl_ops;
+const gpuarray_buffer_ops opencl_ops;
 
 static int cl_property(gpucontext *c, gpudata *b, gpukernel *k, int p, void *r);
-static gpudata *cl_alloc(gpucontext *c, size_t size, void *data, int flags,
-                         int *ret);
+static gpudata *cl_alloc(gpucontext *c, size_t size, void *data, int flags);
 static void cl_release(gpudata *b);
 static void cl_free_ctx(cl_ctx *ctx);
-static gpukernel *cl_newkernel(gpucontext *ctx, unsigned int count,
-                               const char **strings, const size_t *lengths,
-                               const char *fname, unsigned int argcount,
-                               const int *types, int flags, int *ret,
-                               char **err_str);
-static const char CL_CONTEXT_PREAMBLE[];
-
-static inline int cl_get_platform_count(unsigned int* platcount) {
+static int cl_newkernel(gpukernel **k, gpucontext *ctx, unsigned int count,
+                        const char **strings, const size_t *lengths,
+                        const char *fname, unsigned int argcount,
+                        const int *types, int flags, char **err_str);
+static const char CL_CONTEXT_PREAMBLE[] =
+"-D __GA_WARP_SIZE=%lu";  // to be filled by cl_make_ctx()
+
+static int setup_done = 0;  /* set to 1 by setup_lib() after the load_libopencl() call */
+static int setup_lib(error *e) {  /* load the OpenCL library on first use; e is handed to the loader */
+  if (setup_done)
+    return GA_NO_ERROR;  /* a previous call already went through the loader */
+  GA_CHECK(load_libopencl(e));  /* NOTE(review): GA_CHECK presumably returns early on a non-OK code; confirm */
+  setup_done = 1;
+  return GA_NO_ERROR;
+}
+
+static int cl_get_platform_count(unsigned int* platcount) {  /* stores the number of OpenCL platforms in *platcount */
   cl_uint nump;
-  err = clGetPlatformIDs(0, NULL, &nump);
-  if (err != CL_SUCCESS)
-    return GA_IMPL_ERROR;
+
+  GA_CHECK(setup_lib(global_err));  /* the library must be loaded before any cl* call is made */
+  CL_CHECK(global_err, clGetPlatformIDs(0, NULL, &nump));  /* count-only query: no output array is passed */
   *platcount = (unsigned int)nump;
   return GA_NO_ERROR;
 }
@@ -52,96 +58,139 @@ static int cl_get_device_count(unsigned int platform, unsigned int* devcount) {
   cl_platform_id *ps;
   cl_platform_id p;
   cl_uint numd;
-  unsigned int platcount;
+  cl_int err;
+  unsigned int platcount = 0;
+
+  /* This will load the library if needed */
   GA_CHECK(cl_get_platform_count(&platcount));
+  /* Reject indices past the platform list before ps[platform] is read;
+     this also covers the case where no platform is available. */
+  if (platform >= platcount)
+    return error_set(global_err, GA_VALUE_ERROR, "Platform ID out of range");
 
   ps = calloc(sizeof(*ps), platcount);
   if (ps == NULL)
-    return GA_MEMORY_ERROR;
+    return error_sys(global_err, "calloc");
   err = clGetPlatformIDs(platcount, ps, NULL);
   if (err != CL_SUCCESS) {
     free(ps);
-    return GA_IMPL_ERROR;
+    return error_cl(global_err, "clGetPlatformIDs", err);
   }
   p = ps[platform];
 
   err = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, 0, NULL, &numd);
   free(ps);
   if (err != CL_SUCCESS)
-    return GA_IMPL_ERROR;
+    return error_cl(global_err, "clGetDeviceIDs", err);
   *devcount = (unsigned int)numd;
   return GA_NO_ERROR;
 }
 
-static cl_device_id get_dev(cl_context ctx, int *ret) {
+static cl_device_id get_dev(cl_context ctx, error *e) {  /* first device of ctx; NULL on failure with details stored in e */
   size_t sz;
   cl_device_id res;
   cl_device_id *ids;
   cl_int err;
 
-  err = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, 0, NULL, &sz);
-  CHKFAIL(NULL);
+  CL_CHECKN(e, clGetContextInfo(ctx, CL_CONTEXT_DEVICES, 0, NULL, &sz));  /* size query: sz receives the byte size of the device list */
 
   ids = malloc(sz);
-  if (ids == NULL) FAIL(NULL, GA_MEMORY_ERROR);
+  if (ids == NULL) {
+    error_sys(e, "malloc");
+    return NULL;
+  }
 
   err = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, sz, ids, NULL);
+  if (err != CL_SUCCESS) {
+    free(ids);  /* release the list before reporting the failure */
+    error_cl(e, "clGetContextInfo", err);
+    return NULL;
+  }
   res = ids[0];
   free(ids);
-  CHKFAIL(NULL);
   return res;
 }
 
-cl_ctx *cl_make_ctx(cl_context ctx, int flags) {
+cl_ctx *cl_make_ctx(cl_context ctx, gpucontext_props *p) {
   cl_ctx *res;
   cl_device_id id;
   cl_command_queue_properties qprop;
   char vendor[32];
   char driver_version[64];
+  char *device_version = NULL;
+  size_t device_version_size = 0;
   cl_uint vendor_id;
+  cl_int err;
   size_t len;
   int64_t v = 0;
   int e = 0;
   size_t warp_size;
   int ret;
-  const char dummy_kern[] = "__kernel void kdummy() {}\n";
+  const char dummy_kern[] = "__kernel void kdummy(__global float *f) { f[0] = 0; }\n";
   strb context_preamble = STRB_STATIC_INIT;
   const char *rlk[1];
   gpukernel *m;
 
-  id = get_dev(ctx, NULL);
+  e = setup_lib(global_err);
+  if (e != GA_NO_ERROR)
+    return NULL;
+  id = get_dev(ctx, global_err);
   if (id == NULL) return NULL;
-  err = clGetDeviceInfo(id, CL_DEVICE_QUEUE_PROPERTIES, sizeof(qprop),
-                        &qprop, NULL);
-  if (err != CL_SUCCESS) return NULL;
 
-  err = clGetDeviceInfo(id, CL_DEVICE_VENDOR, sizeof(vendor), vendor, NULL);
-  if (err != CL_SUCCESS)
+  /* Query device version string size */
+  CL_CHECKN(global_err, clGetDeviceInfo(id, CL_DEVICE_VERSION,
+                                        0, NULL, &device_version_size));
+  if (device_version_size > 1024) {
+    error_set(global_err, GA_UNSUPPORTED_ERROR,
+              "device version buffer too large");
     return NULL;
-  err = clGetDeviceInfo(id, CL_DEVICE_VENDOR_ID, sizeof(vendor_id), &vendor_id,
-                        NULL);
-  if (err != CL_SUCCESS)
-    return NULL;
-  err = clGetDeviceInfo(id, CL_DRIVER_VERSION, sizeof(driver_version),
-                        driver_version, NULL);
-  if (err != CL_SUCCESS)
+  }
+
+  device_version = alloca(device_version_size);
+  CL_CHECKN(global_err, clGetDeviceInfo(id, CL_DEVICE_VERSION,
+                                        device_version_size,
+                                        device_version, NULL));
+  if (device_version[7] == '1' && device_version[9] < '2') {
+    error_set(global_err, GA_UNSUPPORTED_ERROR,
+              "We only support OpenCL 1.2 and up");
     return NULL;
+  }
+
+  CL_CHECKN(global_err, clGetDeviceInfo(id, CL_DEVICE_QUEUE_PROPERTIES,
+                                        sizeof(qprop), &qprop, NULL));
+  CL_CHECKN(global_err, clGetDeviceInfo(id, CL_DEVICE_VENDOR, sizeof(vendor),
+                                        vendor, NULL));
+  CL_CHECKN(global_err, clGetDeviceInfo(id, CL_DEVICE_VENDOR_ID,
+                                        sizeof(vendor_id), &vendor_id, NULL));
+  CL_CHECKN(global_err, clGetDeviceInfo(id, CL_DRIVER_VERSION,
+                                        sizeof(driver_version),
+                                        driver_version, NULL));
 
   res = malloc(sizeof(*res));
-  if (res == NULL) return NULL;
+  if (res == NULL) {
+    error_sys(global_err, "malloc");
+    return NULL;
+  }
 
   res->ctx = ctx;
   res->ops = &opencl_ops;
-  res->err = CL_SUCCESS;
+  if (error_alloc(&res->err)) {
+    error_set(global_err, GA_SYS_ERROR, "Could not create error context");
+    free(res);
+    return NULL;
+  }
+
   res->refcnt = 1;
   res->exts = NULL;
   res->blas_handle = NULL;
-  res->preamble = NULL;
+  res->options = NULL;
   res->q = clCreateCommandQueue(
     ctx, id,
-    ISSET(flags, GA_CTX_SINGLE_STREAM) ? 0 : qprop&CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
+    ISSET(p->flags, GA_CTX_SINGLE_STREAM) ? 0 : qprop&CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
     &err);
   if (res->q == NULL) {
+    error_cl(global_err, "clCreateCommandQueue", err);
+    error_free(res->err);
     free(res);
     return NULL;
   }
@@ -153,10 +198,9 @@ cl_ctx *cl_make_ctx(cl_context ctx, int flags) {
 
   clRetainContext(res->ctx);
   TAG_CTX(res);
-  res->errbuf = cl_alloc((gpucontext *)res, 8, &v, GA_BUFFER_INIT, &e);
-  if (e != GA_NO_ERROR) {
+  res->errbuf = cl_alloc((gpucontext *)res, 8, &v, GA_BUFFER_INIT);
+  if (res->errbuf == NULL)
     goto fail;
-  }
   res->refcnt--; /* Prevent ref loop */
 
   /* Create per-context OpenCL preamble */
@@ -165,8 +209,7 @@ cl_ctx *cl_make_ctx(cl_context ctx, int flags) {
   rlk[0] = dummy_kern;
   len = sizeof(dummy_kern);
   // this dummy kernel does not require a CLUDA preamble
-  m = cl_newkernel((gpucontext *)res, 1, rlk, &len, "kdummy", 0, NULL, 0, &ret, NULL);
-  if (m == NULL)
+  if (cl_newkernel(&m, (gpucontext *)res, 1, rlk, &len, "kdummy", 0, NULL, 0, NULL) != GA_NO_ERROR)
     goto fail;
   ret = cl_property((gpucontext *)res, NULL, m, GA_KERNEL_PROP_PREFLSIZE, &warp_size);
   if (ret != GA_NO_ERROR)
@@ -174,14 +217,25 @@ cl_ctx *cl_make_ctx(cl_context ctx, int flags) {
 
   // Write the preferred workgroup multiple as GA_WARP_SIZE in preamble
   strb_appendf(&context_preamble, CL_CONTEXT_PREAMBLE, (unsigned long)warp_size);
-  res->preamble = strb_cstr(&context_preamble);
-  if (res->preamble == NULL)
+  res->options = strb_cstr(&context_preamble);
+  if (res->options == NULL)
     goto fail;
 
+  res->blas_handle = NULL;
+  if (load_libclblas(res->err) == GA_NO_ERROR) {
+    res->blas_ops = &clblas_ops;
+  } else if (load_libclblast(res->err) == GA_NO_ERROR) {
+    res->blas_ops = &clblast_ops;
+  } else {
+    res->blas_ops = NULL;
+  }
+
+  res->comm_ops = NULL;
+
   return res;
 
-fail:
-  err = res->err;
+ fail:
+  error_set(global_err, res->err->code, res->err->msg);
   cl_free_ctx(res);
   return NULL;
 }
@@ -192,24 +246,20 @@ cl_command_queue cl_get_stream(gpucontext *ctx) {
 }
 
 static void cl_free_ctx(cl_ctx *ctx) {
-  gpuarray_blas_ops *blas_ops;
-
   ASSERT_CTX(ctx);
+
   assert(ctx->refcnt != 0);
   ctx->refcnt--;
   if (ctx->refcnt == 0) {
-    if (ctx->blas_handle != NULL) {
-      ctx->err = cl_property((gpucontext *)ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS, &blas_ops);
-      blas_ops->teardown((gpucontext *)ctx);
-    }
     if (ctx->errbuf != NULL) {
       ctx->refcnt = 2; /* Avoid recursive release */
       cl_release(ctx->errbuf);
     }
     clReleaseCommandQueue(ctx->q);
     clReleaseContext(ctx->ctx);
-    if (ctx->preamble != NULL)
-      free(ctx->preamble);
+    if (ctx->options != NULL)
+      free(ctx->options);
+    error_free(ctx->err);
     CLEAR(ctx);
     free(ctx);
   }
@@ -219,22 +269,29 @@ gpudata *cl_make_buf(gpucontext *c, cl_mem buf) {
   cl_ctx *ctx = (cl_ctx *)c;
   gpudata *res;
   cl_context buf_ctx;
+  cl_int err;
 
   ASSERT_CTX(ctx);
-  ctx->err = clGetMemObjectInfo(buf, CL_MEM_CONTEXT, sizeof(buf_ctx),
-                                &buf_ctx, NULL);
-  if (ctx->err != CL_SUCCESS) return NULL;
-  if (buf_ctx != ctx->ctx) return NULL;
+  CL_CHECKN(ctx->err, clGetMemObjectInfo(buf, CL_MEM_CONTEXT, sizeof(buf_ctx),
+                                         &buf_ctx, NULL));
+  if (buf_ctx != ctx->ctx) {
+    error_set(ctx->err, GA_VALUE_ERROR, "Requested context doesn't match object context");
+    return NULL;
+  }
 
   res = malloc(sizeof(*res));
-  if (res == NULL) return NULL;
+  if (res == NULL) {
+    error_sys(ctx->err, "malloc");
+    return NULL;
+  }
 
   res->buf = buf;
   res->ev = NULL;
   res->refcnt = 1;
-  ctx->err = clRetainMemObject(buf);
-  if (ctx->err != CL_SUCCESS) {
+  err = clRetainMemObject(buf);
+  if (err != CL_SUCCESS) {
     free(res);
+    error_cl(ctx->err, "clRetainMemObject", err);
     return NULL;
   }
   res->ctx = ctx;
@@ -254,59 +311,10 @@ cl_mem cl_get_buf(gpudata *g) { ASSERT_BUF(g); return g->buf; }
 
 static void cl_releasekernel(gpukernel *k);
 static int cl_callkernel(gpukernel *k, unsigned int n,
-                         const size_t *bs, const size_t *gs,
+                         const size_t *gs, const size_t *ls,
                          size_t shared, void **args);
 
-static const char CL_PREAMBLE[] =
-  "#define local_barrier() barrier(CLK_LOCAL_MEM_FENCE)\n"
-  "#define WITHIN_KERNEL /* empty */\n"
-  "#define KERNEL __kernel\n"
-  "#define GLOBAL_MEM __global\n"
-  "#define LOCAL_MEM __local\n"
-  "#define LOCAL_MEM_ARG __local\n"
-  "#define REQD_WG_SIZE(x, y, z) __attribute__((reqd_work_group_size(x, y, z)))\n"
-  "#ifndef NULL\n"
-  "  #define NULL ((void*)0)\n"
-  "#endif\n"
-  "#define LID_0 get_local_id(0)\n"
-  "#define LID_1 get_local_id(1)\n"
-  "#define LID_2 get_local_id(2)\n"
-  "#define LDIM_0 get_local_size(0)\n"
-  "#define LDIM_1 get_local_size(1)\n"
-  "#define LDIM_2 get_local_size(2)\n"
-  "#define GID_0 get_group_id(0)\n"
-  "#define GID_1 get_group_id(1)\n"
-  "#define GID_2 get_group_id(2)\n"
-  "#define GDIM_0 get_num_groups(0)\n"
-  "#define GDIM_1 get_num_groups(1)\n"
-  "#define GDIM_2 get_num_groups(2)\n"
-  "#define ga_bool uchar\n"
-  "#define ga_byte char\n"
-  "#define ga_ubyte uchar\n"
-  "#define ga_short short\n"
-  "#define ga_ushort ushort\n"
-  "#define ga_int int\n"
-  "#define ga_uint uint\n"
-  "#define ga_long long\n"
-  "#define ga_ulong ulong\n"
-  "#define ga_float float\n"
-  "#define ga_double double\n"
-  "#define ga_half half\n"
-  "#define ga_size ulong\n"
-  "#define ga_ssize long\n"
-  "#define load_half(p) vload_half(0, p)\n"
-  "#define store_half(p, v) vstore_half_rtn(v, 0, p)\n"
-  "#define GA_DECL_SHARED_PARAM(type, name) , __local type name[]\n"
-  "#define GA_DECL_SHARED_BODY(type, name)\n";
-
-/* XXX: add complex types, quad types, and longlong */
-/* XXX: add vector types */
-
-static const char CL_CONTEXT_PREAMBLE[] =
-  "#define GA_WARP_SIZE %lu\n";  // to be filled by cl_make_ctx()
-
-static const char *get_error_string(cl_int err) {
-  /* OpenCL 1.0 error codes */
+const char *cl_error_string(cl_int err) {
   switch (err) {
   case CL_SUCCESS:                        return "Success!";
   case CL_DEVICE_NOT_FOUND:               return "Device not found.";
@@ -321,10 +329,8 @@ static const char *get_error_string(cl_int err) {
   case CL_IMAGE_FORMAT_NOT_SUPPORTED:     return "Image format not supported";
   case CL_BUILD_PROGRAM_FAILURE:          return "Program build failure";
   case CL_MAP_FAILURE:                    return "Map failure";
-#ifdef CL_VERSION_1_1
   case CL_MISALIGNED_SUB_BUFFER_OFFSET:   return "Buffer offset improperly aligned";
   case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: return "Event in wait list has an error status";
-#endif
   case CL_INVALID_VALUE:                  return "Invalid value";
   case CL_INVALID_DEVICE_TYPE:            return "Invalid device type";
   case CL_INVALID_PLATFORM:               return "Invalid platform";
@@ -359,36 +365,24 @@ static const char *get_error_string(cl_int err) {
   case CL_INVALID_BUFFER_SIZE:            return "Invalid buffer size";
   case CL_INVALID_MIP_LEVEL:              return "Invalid mip-map level";
   case CL_INVALID_GLOBAL_WORK_SIZE:       return "Invalid global work size";
-#ifdef CL_VERSION_1_1
   case CL_INVALID_PROPERTY:               return "Invalid property";
-#endif
   default: return "Unknown error";
   }
 }
 
 static int check_ext(cl_ctx *ctx, const char *name) {
   cl_device_id dev;
-  size_t sz;
-  int res = 0;
 
   if (ctx->exts == NULL) {
-    dev = get_dev(ctx->ctx, &res);
-    if (dev == NULL) return res;
+    dev = get_dev(ctx->ctx, ctx->err);  /* the extension string is fetched only while ctx->exts is still NULL */
+    if (dev == NULL) return ctx->err->code;  /* get_dev() failed; hand back the code it recorded in ctx->err */
 
-    ctx->err = clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, 0, NULL, &sz);
-    if (ctx->err != CL_SUCCESS) return GA_IMPL_ERROR;
-
-    ctx->exts = malloc(sz);
-    if (ctx->exts == NULL) return GA_MEMORY_ERROR;
-
-    ctx->err = clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, sz, ctx->exts, NULL);
-    if (ctx->err != CL_SUCCESS) {
-      free(ctx->exts);
-      ctx->exts = NULL;
-      return GA_IMPL_ERROR;
-    }
+    CL_GET_PROP(ctx->err, clGetDeviceInfo, dev, CL_DEVICE_EXTENSIONS, ctx->exts);  /* NOTE(review): macro defined elsewhere; presumably allocates and fills ctx->exts and returns on error - confirm */
   }
-  return (strstr(ctx->exts, name) == NULL) ? GA_DEVSUP_ERROR : 0;
+  if (strstr(ctx->exts, name) == NULL)  /* plain substring search over the cached extension string */
+    return error_fmt(ctx->err, GA_DEVSUP_ERROR, "Unsupported extension %s", name);
+  else
+    return GA_NO_ERROR;
 }
 
 static void
@@ -399,8 +393,7 @@ errcb(const char *errinfo, const void *pi, size_t cb, void *u) {
   fprintf(stderr, "%s\n", errinfo);
 }
 
-static gpucontext *cl_init(int devno, int flags, int *ret) {
-  int platno;
+static gpucontext *cl_init(gpucontext_props *pp) {
   cl_device_id *ds;
   cl_device_id d;
   cl_platform_id *ps;
@@ -412,43 +405,69 @@ static gpucontext *cl_init(int devno, int flags, int *ret) {
   };
   cl_context ctx;
   cl_ctx *res;
+  cl_int err;
+  int platno;
+  int devno;
+  int e;
 
-  platno = devno >> 16;
-  devno &= 0xFFFF;
+  platno = pp->dev >> 16;
+  devno = pp->dev & 0xFFFF;
 
-  err = clGetPlatformIDs(0, NULL, &nump);
-  CHKFAIL(NULL);
+  e = setup_lib(global_err);
+  if (e != GA_NO_ERROR)
+    return NULL;
 
-  if ((unsigned int)platno >= nump || platno < 0) FAIL(NULL, GA_VALUE_ERROR);
+  CL_CHECKN(global_err, clGetPlatformIDs(0, NULL, &nump));
+
+  if ((unsigned int)platno >= nump || platno < 0) {
+    error_set(global_err, GA_VALUE_ERROR, "Platform ID out of range");
+    return NULL;
+  }
 
   ps = calloc(sizeof(*ps), nump);
-  if (ps == NULL) FAIL(NULL, GA_MEMORY_ERROR);
+  if (ps == NULL) {
+    error_sys(global_err, "calloc");
+    return NULL;
+  }
   err = clGetPlatformIDs(nump, ps, NULL);
   /* We may get garbage on failure here but it won't matter as we will
      not use it */
   p = ps[platno];
   free(ps);
-  CHKFAIL(NULL);
+  if (err != CL_SUCCESS) {
+    error_cl(global_err, "clGetPlatformIDs", err);
+    return NULL;
+  }
 
-  err = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, 0, NULL, &numd);
-  CHKFAIL(NULL);
+  CL_CHECKN(global_err, clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, 0, NULL, &numd));
 
-  if ((unsigned int)devno >= numd || devno < 0) FAIL(NULL, GA_VALUE_ERROR);
+  if ((unsigned int)devno >= numd || devno < 0) {
+    error_set(global_err, GA_VALUE_ERROR, "Device ID out of range");
+    return NULL;
+  }
 
   ds = calloc(sizeof(*ds), numd);
-  if (ds == NULL) FAIL(NULL, GA_MEMORY_ERROR);
+  if (ds == NULL) {
+    error_sys(global_err, "calloc");
+    return NULL;
+  }
   err = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, numd, ds, NULL);
   d = ds[devno];
   free(ds);
-  CHKFAIL(NULL);
+  if (err != CL_SUCCESS) {
+    error_cl(global_err, "clGetDeviceIDs", err);
+    return NULL;
+  }
 
   props[1] = (cl_context_properties)p;
   ctx = clCreateContext(props, 1, &d, errcb, NULL, &err);
-  CHKFAIL(NULL);
+  if (ctx == NULL) {
+    error_cl(global_err, "clCreateContext", err);
+    return NULL;
+  }
 
-  res = cl_make_ctx(ctx, flags);
+  res = cl_make_ctx(ctx, pp);
   clReleaseContext(ctx);
-  if (res == NULL) FAIL(NULL, GA_IMPL_ERROR);  // can also be a sys_error
   return (gpucontext *)res;
 }
 
@@ -457,17 +476,20 @@ static void cl_deinit(gpucontext *c) {
   cl_free_ctx((cl_ctx *)c);
 }
 
-static gpudata *cl_alloc(gpucontext *c, size_t size, void *data, int flags,
-                         int *ret) {
+static gpudata *cl_alloc(gpucontext *c, size_t size, void *data, int flags) {
   cl_ctx *ctx = (cl_ctx *)c;
   gpudata *res;
   void *hostp = NULL;
+  cl_int err;
   cl_mem_flags clflags = CL_MEM_READ_WRITE;
 
   ASSERT_CTX(ctx);
 
   if (flags & GA_BUFFER_INIT) {
-    if (data == NULL) FAIL(NULL, GA_VALUE_ERROR);
+    if (data == NULL) {
+      error_set(ctx->err, GA_VALUE_ERROR, "Requested initialization, but no data provided");
+      return NULL;
+    }
     hostp = data;
     clflags |= CL_MEM_COPY_HOST_PTR;
   }
@@ -477,17 +499,24 @@ static gpudata *cl_alloc(gpucontext *c, size_t size, void *data, int flags,
   }
 
   if (flags & GA_BUFFER_READ_ONLY) {
-    if (flags & GA_BUFFER_WRITE_ONLY) FAIL(NULL, GA_VALUE_ERROR);
+    if (flags & GA_BUFFER_WRITE_ONLY) {
+      error_set(ctx->err, GA_VALUE_ERROR, "Invalid combinaison: READ_ONLY and WRITE_ONLY");
+      return NULL;
+    }
+    clflags &= ~CL_MEM_READ_WRITE;
     clflags |= CL_MEM_READ_ONLY;
   }
 
   if (flags & GA_BUFFER_WRITE_ONLY) {
-    if (flags & GA_BUFFER_READ_ONLY) FAIL(NULL, GA_VALUE_ERROR);
+    clflags &= ~CL_MEM_READ_WRITE;
     clflags |= CL_MEM_WRITE_ONLY;
   }
 
   res = malloc(sizeof(*res));
-  if (res == NULL) FAIL(NULL, GA_SYS_ERROR);
+  if (res == NULL) {
+    error_sys(ctx->err, "malloc");
+    return NULL;
+  }
   res->refcnt = 1;
 
   if (size == 0) {
@@ -495,11 +524,12 @@ static gpudata *cl_alloc(gpucontext *c, size_t size, void *data, int flags,
     size = 1;
   }
 
-  res->buf = clCreateBuffer(ctx->ctx, clflags, size, hostp, &ctx->err);
+  res->buf = clCreateBuffer(ctx->ctx, clflags, size, hostp, &err);
   res->ev = NULL;
-  if (ctx->err != CL_SUCCESS) {
+  if (err != CL_SUCCESS) {
     free(res);
-    FAIL(NULL, GA_IMPL_ERROR);
+    error_cl(ctx->err, "clCreateBuffer", err);
+    return NULL;
   }
 
   res->ctx = ctx;
@@ -527,28 +557,32 @@ static void cl_release(gpudata *b) {
   }
 }
 
-static int cl_share(gpudata *a, gpudata *b, int *ret) {
-#ifdef CL_VERSION_1_1
+static int cl_share(gpudata *a, gpudata *b) {  /* 1 if a and b refer to the same memory object, 0 if not, -1 if a query fails */
   cl_ctx *ctx;
   cl_mem aa, bb;
-#endif
+  cl_int err;
+
   ASSERT_BUF(a);
   ASSERT_BUF(b);
   if (a->buf == b->buf) return 1;
-#ifdef CL_VERSION_1_1
   if (a->ctx != b->ctx) return 0;
   ctx = a->ctx;
   ASSERT_CTX(ctx);
-  ctx->err = clGetMemObjectInfo(a->buf, CL_MEM_ASSOCIATED_MEMOBJECT,
-				sizeof(aa), &aa, NULL);
-  CHKFAIL(-1);
-  ctx->err = clGetMemObjectInfo(b->buf, CL_MEM_ASSOCIATED_MEMOBJECT,
-				sizeof(bb), &bb, NULL);
-  CHKFAIL(-1);
+  err = clGetMemObjectInfo(a->buf, CL_MEM_ASSOCIATED_MEMOBJECT,  /* fetch the memory object a->buf is associated with */
+                           sizeof(aa), &aa, NULL);
+  if (err != CL_SUCCESS) {
+    error_cl(ctx->err, "clGetMemObjectInfo", err);
+    return -1;  /* failure is signalled as -1; details are recorded in ctx->err */
+  }
+  err = clGetMemObjectInfo(b->buf, CL_MEM_ASSOCIATED_MEMOBJECT,  /* same query for b->buf */
+                           sizeof(bb), &bb, NULL);
+  if (err != CL_SUCCESS) {
+    error_cl(ctx->err, "clGetMemObjectInfo", err);
+    return -1;
+  }
   if (aa == NULL) aa = a->buf;
   if (bb == NULL) bb = b->buf;
   if (aa == bb) return 1;
-#endif
   return 0;
 }
 
@@ -563,7 +597,10 @@ static int cl_move(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff,
   ASSERT_BUF(dst);
   ASSERT_BUF(src);
 
-  if (dst->ctx != src->ctx) return GA_VALUE_ERROR;
+  if (dst->ctx != src->ctx) {
+    error_set(src->ctx->err, GA_VALUE_ERROR, "Differing contexts for source and destination");
+    return error_set(dst->ctx->err, src->ctx->err->code, src->ctx->err->msg);
+  }
   ctx = dst->ctx;
 
   ASSERT_CTX(ctx);
@@ -578,11 +615,8 @@ static int cl_move(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff,
   if (num_ev > 0)
     evl = evw;
 
-  ctx->err = clEnqueueCopyBuffer(ctx->q, src->buf, dst->buf, srcoff, dstoff,
-				 sz, num_ev, evl, &ev);
-  if (ctx->err != CL_SUCCESS) {
-    return GA_IMPL_ERROR;
-  }
+  CL_CHECK(ctx->err, clEnqueueCopyBuffer(ctx->q, src->buf, dst->buf, srcoff,
+                                         dstoff, sz, num_ev, evl, &ev));
   if (src->ev != NULL)
     clReleaseEvent(src->ev);
   if (dst->ev != NULL && src != dst)
@@ -612,9 +646,9 @@ static int cl_read(void *dst, gpudata *src, size_t srcoff, size_t sz) {
     num_ev = 1;
   }
 
-  ctx->err = clEnqueueReadBuffer(ctx->q, src->buf, CL_TRUE, srcoff, sz, dst,
-				 num_ev, evl, NULL);
-  if (ctx->err != CL_SUCCESS) return GA_IMPL_ERROR;
+  CL_CHECK(ctx->err, clEnqueueReadBuffer(ctx->q, src->buf, CL_TRUE, srcoff, sz,
+                                         dst, num_ev, evl, NULL));
+
   if (src->ev != NULL) clReleaseEvent(src->ev);
   src->ev = NULL;
 
@@ -638,9 +672,9 @@ static int cl_write(gpudata *dst, size_t dstoff, const void *src, size_t sz) {
     num_ev = 1;
   }
 
-  ctx->err = clEnqueueWriteBuffer(ctx->q, dst->buf, CL_TRUE, dstoff, sz, src,
-				  num_ev, evl, NULL);
-  if (err != CL_SUCCESS) return GA_IMPL_ERROR;
+  CL_CHECK(ctx->err, clEnqueueWriteBuffer(ctx->q, dst->buf, CL_TRUE, dstoff,
+                                          sz, src, num_ev, evl, NULL));
+
   if (dst->ev != NULL) clReleaseEvent(dst->ev);
   dst->ev = NULL;
 
@@ -656,7 +690,7 @@ static int cl_memset(gpudata *dst, size_t offset, int data) {
   gpukernel *m;
   cl_mem_flags fl;
   int type;
-  int r, res = GA_IMPL_ERROR;
+  int r, res;
 
   unsigned char val = (unsigned char)data;
   cl_uint pattern = (cl_uint)val & (cl_uint)val >> 8 & \
@@ -665,14 +699,14 @@ static int cl_memset(gpudata *dst, size_t offset, int data) {
   ASSERT_BUF(dst);
   ASSERT_CTX(ctx);
 
-  ctx->err = clGetMemObjectInfo(dst->buf, CL_MEM_FLAGS, sizeof(fl), &fl, NULL);
-  if (ctx->err != CL_SUCCESS) return GA_IMPL_ERROR;
+  CL_CHECK(ctx->err, clGetMemObjectInfo(dst->buf, CL_MEM_FLAGS, sizeof(fl),
+                                        &fl, NULL));
 
-  if (fl & CL_MEM_READ_ONLY) return GA_READONLY_ERROR;
+  if (fl & CL_MEM_READ_ONLY)
+    return error_set(ctx->err, GA_READONLY_ERROR, "destination is read only");
 
-  ctx->err = clGetMemObjectInfo(dst->buf, CL_MEM_SIZE, sizeof(bytes), &bytes,
-				NULL);
-  if (ctx->err != CL_SUCCESS) return GA_IMPL_ERROR;
+  CL_CHECK(ctx->err, clGetMemObjectInfo(dst->buf, CL_MEM_SIZE, sizeof(bytes),
+                                        &bytes, NULL));
 
   bytes -= offset;
 
@@ -706,8 +740,7 @@ static int cl_memset(gpudata *dst, size_t offset, int data) {
                  "i += get_global_size(0)) {mem[i] = %u; }}",
                  offset, n, pattern);
   } else {
-    if (check_ext(ctx, CL_SMALL))
-      return GA_DEVSUP_ERROR;
+    GA_CHECK(check_ext(ctx, CL_SMALL));
     n = bytes;
     r = snprintf(local_kern, sizeof(local_kern),
                  "__kernel void kmemset(__global unsigned char *mem) {"
@@ -717,22 +750,23 @@ static int cl_memset(gpudata *dst, size_t offset, int data) {
                  offset, n, val);
   }
   /* If this assert fires, increase the size of local_kern above. */
-  assert(r <= sizeof(local_kern));
+  assert(r <= (int)sizeof(local_kern));
   _unused(r);
 
   sz = strlen(local_kern);
   rlk[0] = local_kern;
   type = GA_BUFFER;
 
-  m = cl_newkernel((gpucontext *)ctx, 1, rlk, &sz, "kmemset", 1, &type, 0, &res, NULL);
-  if (m == NULL) return res;
+  r = cl_newkernel(&m, (gpucontext *)ctx, 1, rlk, &sz, "kmemset", 1, &type, 0, NULL);
+  if (r != GA_NO_ERROR)
+    return r;
 
   /* Cheap kernel scheduling */
   res = cl_property(NULL, NULL, m, GA_KERNEL_PROP_MAXLSIZE, &ls);
   if (res != GA_NO_ERROR) goto fail;
   gs = ((n-1) / ls) + 1;
   args[0] = dst;
-  res = cl_callkernel(m, 1, &ls, &gs, 0, args);
+  res = cl_callkernel(m, 1, &gs, &ls, 0, args);
 
  fail:
   cl_releasekernel(m);
@@ -741,119 +775,124 @@ static int cl_memset(gpudata *dst, size_t offset, int data) {
 
+/*
+ * Append to `preamble` one pragma string per extension selected in `flags`,
+ * incrementing `*count` for each entry added.
+ *
+ * GA_USE_SMALL and GA_USE_DOUBLE require check_ext() to succeed for the
+ * matching extension. GA_USE_COMPLEX and GA_USE_CUDA are always rejected.
+ *
+ * Returns GA_NO_ERROR on success, otherwise an error code.
+ */
 static int cl_check_extensions(const char **preamble, unsigned int *count,
                                int flags, cl_ctx *ctx) {
-  if (flags & GA_USE_CLUDA) {
-    // add the common preamble
-    preamble[*count] = CL_PREAMBLE;
-    (*count)++;
-    // add the per-context preamble
-    preamble[*count] = ctx->preamble;
-    (*count)++;
-  }
   if (flags & GA_USE_SMALL) {
-    if (check_ext(ctx, CL_SMALL)) return GA_DEVSUP_ERROR;
+    GA_CHECK(check_ext(ctx, CL_SMALL));  /* presumably returns check_ext()'s code on failure -- confirm GA_CHECK */
     preamble[*count] = PRAGMA CL_SMALL ENABLE;
     (*count)++;
   }
   if (flags & GA_USE_DOUBLE) {
-    if (check_ext(ctx, CL_DOUBLE)) return GA_DEVSUP_ERROR;
+    if (check_ext(ctx, CL_DOUBLE) != GA_NO_ERROR) {
+      return error_set(ctx->err, GA_DEVSUP_ERROR, "This device does not support double precision (pygpu int/int, int32+float32, and floating point literals default to double precision)");
+    }
     preamble[*count] = PRAGMA CL_DOUBLE ENABLE;
     (*count)++;
   }
   if (flags & GA_USE_COMPLEX) {
-    return GA_DEVSUP_ERROR; // for now
-  }
-  // GA_USE_HALF should always work
-  /*
-  if (flags & GA_USE_HALF) {
-    if (check_ext(ctx, CL_HALF)) return GA_DEVSUP_ERROR;
-    preamble[*count] = PRAGMA CL_HALF ENABLE;
-    (*count)++;
+    return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Complex are not supported yet");
   }
-  */
   if (flags & GA_USE_CUDA) {
-    return GA_DEVSUP_ERROR;
+    return error_set(ctx->err, GA_DEVSUP_ERROR, "Cuda kernels not supported on opencl devices");
   }
   return GA_NO_ERROR;
 }
 
-static gpukernel *cl_newkernel(gpucontext *c, unsigned int count,
-                               const char **strings, const size_t *lengths,
-                               const char *fname, unsigned int argcount,
-                               const int *types, int flags, int *ret,
-                               char **err_str) {
+static int cl_newkernel(gpukernel **k, gpucontext *c, unsigned int count,
+                        const char **strings, const size_t *lengths,
+                        const char *fname, unsigned int argcount,
+                        const int *types, int flags, char **err_str) {
   cl_ctx *ctx = (cl_ctx *)c;
   gpukernel *res;
   cl_device_id dev;
   cl_program p;
+  cl_program cluda;
+  cl_program tmp;
   // Sync this table size with the number of flags that can add stuff
   // at the beginning
   const char *preamble[5];
+  const char *cluda_src[1];
+  const char *headers[1] = {"cluda.h"};
   size_t *newl = NULL;
   const char **news = NULL;
+  cl_int err;
   unsigned int n = 0;
-  int error;
   strb debug_msg = STRB_STATIC_INIT;
   size_t log_size;
 
   ASSERT_CTX(ctx);
 
-  if (count == 0) FAIL(NULL, GA_VALUE_ERROR);
-
-  dev = get_dev(ctx->ctx, ret);
-  if (dev == NULL) return NULL;
-
-  if (flags & GA_USE_BINARY) {
-    // GA_USE_BINARY is exclusive
-    if (flags & ~GA_USE_BINARY)
-      FAIL(NULL, GA_INVALID_ERROR);
-    // We need the length for binary data and there is only one blob.
-    if (count != 1 || lengths == NULL || lengths[0] == 0)
-      FAIL(NULL, GA_VALUE_ERROR);
-    p = clCreateProgramWithBinary(ctx->ctx, 1, &dev, lengths, (const unsigned char **)strings, NULL, &ctx->err);
-    if (ctx->err != CL_SUCCESS) {
-      clReleaseProgram(p);
-      FAIL(NULL, GA_IMPL_ERROR);
+  if (count == 0)
+    return error_set(ctx->err, GA_VALUE_ERROR, "Empty kernel source list");
+
+  dev = get_dev(ctx->ctx, ctx->err);
+  if (dev == NULL) return ctx->err->code;
+
+  if (cl_check_extensions(preamble, &n, flags, ctx))
+    return ctx->err->code;
+
+  if (n != 0) {
+    news = calloc(count+n, sizeof(const char *));
+    if (news == NULL)
+      return error_sys(ctx->err, "calloc");
+    memcpy(news, preamble, n*sizeof(const char *));
+    memcpy(news+n, strings, count*sizeof(const char *));
+    if (lengths == NULL) {
+      newl = NULL;
+    } else {
+      newl = calloc(count+n, sizeof(size_t));
+      if (newl == NULL) {
+        free(news);
+        return error_sys(ctx->err, "calloc");
+      }
+      memcpy(newl+n, lengths, count*sizeof(size_t));
     }
   } else {
+    news = strings;
+    newl = (size_t *)lengths;
+  }
 
-    error = cl_check_extensions(preamble, &n, flags, ctx);
-    if (error != GA_NO_ERROR) FAIL(NULL, error);
-
+  cluda_src[0] = cluda_opencl_h;
+  cluda = clCreateProgramWithSource(ctx->ctx, 1, cluda_src, NULL, &err);
+  if (err != CL_SUCCESS) {
     if (n != 0) {
-      news = calloc(count+n, sizeof(const char *));
-      if (news == NULL) {
-        FAIL(NULL, GA_SYS_ERROR);
-      }
-      memcpy(news, preamble, n*sizeof(const char *));
-      memcpy(news+n, strings, count*sizeof(const char *));
-      if (lengths == NULL) {
-        newl = NULL;
-      } else {
-        newl = calloc(count+n, sizeof(size_t));
-        if (newl == NULL) {
-          free(news);
-          FAIL(NULL, GA_MEMORY_ERROR);
-        }
-        memcpy(newl+n, lengths, count*sizeof(size_t));
-      }
-    } else {
-      news = strings;
-      newl = (size_t *)lengths;
+      free(news);
+      free(newl);
     }
+    return error_cl(ctx->err, "clCreateProgramWithSource (header)", err);
+  }
 
-    p = clCreateProgramWithSource(ctx->ctx, count+n, news, newl, &ctx->err);
-    if (ctx->err != CL_SUCCESS) {
-      if (n != 0) {
-        free(news);
-        free(newl);
-      }
-      FAIL(NULL, GA_IMPL_ERROR);
+  p = clCreateProgramWithSource(ctx->ctx, count+n, news, newl, &err);
+  if (err != CL_SUCCESS) {
+    clReleaseProgram(cluda);  /* always created above, independent of n */
+    if (n != 0) {
+      free(news);
+      free(newl);
     }
+    return error_cl(ctx->err, "clCreateProgramWithSource (kernel)", err);
   }
 
-  ctx->err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL);
-  if (ctx->err != CL_SUCCESS) {
-    if (ctx->err == CL_BUILD_PROGRAM_FAILURE && err_str!=NULL) {
+  err = clCompileProgram(p, 0, NULL, ctx->options, 1, &cluda, headers, NULL, NULL);
+  clReleaseProgram(cluda);  /* header program is only needed for the compile step */
+  if (err != CL_SUCCESS) goto compile_error;
+
+  tmp = clLinkProgram(ctx->ctx, 0, NULL, NULL, 1, &p, NULL, NULL, &err);
+  if (tmp != NULL) {
+    clReleaseProgram(p);
+    p = tmp;
+    tmp = NULL;
+  }
+ compile_error:
+  if (err != CL_SUCCESS) {
+    if ((err == CL_COMPILE_PROGRAM_FAILURE || err == CL_LINK_PROGRAM_FAILURE)
+        && err_str != NULL) {
       *err_str = NULL;  // Fallback, in case there's an error
 
       // We're substituting debug_msg for a string with this first line:
@@ -862,21 +892,17 @@ static gpukernel *cl_newkernel(gpucontext *c, unsigned int count,
       // Determine the size of the log
       clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
 
-      if(strb_ensure(&debug_msg, log_size)!=-1 && log_size>=1) { // Checks strb has enough space
+      if (strb_ensure(&debug_msg, log_size)!=-1 && log_size>=1) { // Checks strb has enough space
         // Get the log directly into the debug_msg
         clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size, debug_msg.s+debug_msg.l, NULL);
         debug_msg.l += (log_size-1); // Back off to before final '\0'
       }
 
-      if (flags & GA_USE_BINARY) {
-        // Not clear what to do with binary 'source' - the log will have to suffice
-      } else {
-        gpukernel_source_with_line_numbers(count+n, news, newl, &debug_msg);
-      }
+      gpukernel_source_with_line_numbers(count+n, news, newl, &debug_msg);
 
       strb_append0(&debug_msg); // Make sure a final '\0' is present
 
-      if(!strb_error(&debug_msg)) { // Make sure the strb is in a valid state
+      if (!strb_error(&debug_msg)) { // Make sure the strb is in a valid state
         *err_str = memdup(debug_msg.s, debug_msg.l);
         // If there's a memory alloc error, fall-through : announcing a compile error is more important
       }
@@ -889,7 +915,7 @@ static gpukernel *cl_newkernel(gpucontext *c, unsigned int count,
       free(news);
       free(newl);
     }
-    FAIL(NULL, GA_IMPL_ERROR);
+    return error_cl(ctx->err, "clCompileProgram/clLinkProgram", err);
   }
 
   if (n != 0) {
@@ -898,35 +924,38 @@ static gpukernel *cl_newkernel(gpucontext *c, unsigned int count,
   }
 
   res = malloc(sizeof(*res));
-  if (res == NULL) FAIL(NULL, GA_MEMORY_ERROR);
+  if (res == NULL)
+    return error_sys(ctx->err, "malloc");
+
   res->refcnt = 1;
   res->ev = NULL;
   res->argcount = argcount;
-  res->k = clCreateKernel(p, fname, &ctx->err);
+  res->k = clCreateKernel(p, fname, &err);
   res->types = NULL;  /* This avoids a crash in cl_releasekernel */
   res->evr = NULL;   /* This avoids a crash in cl_releasekernel */
   res->ctx = ctx;
   ctx->refcnt++;
   clReleaseProgram(p);
   TAG_KER(res);
-  if (ctx->err != CL_SUCCESS) {
+  if (err != CL_SUCCESS) {
     cl_releasekernel(res);
-    FAIL(NULL, GA_IMPL_ERROR);
+    return error_cl(ctx->err, "clCreateKernel", err);
   }
   res->types = calloc(argcount, sizeof(int));
   if (res->types == NULL) {
     cl_releasekernel(res);
-    FAIL(NULL, GA_IMPL_ERROR);
+    return error_sys(ctx->err, "calloc");
   }
   memcpy(res->types, types, argcount * sizeof(int));
 
   res->evr = calloc(argcount, sizeof(cl_event *));
   if (res->evr == NULL) {
     cl_releasekernel(res);
-    FAIL(NULL, GA_IMPL_ERROR);
+    return error_sys(ctx->err, "calloc");
   }
 
-  return res;
+  *k = res;
+  return GA_NO_ERROR;
 }
 
 static void cl_retainkernel(gpukernel *k) {
@@ -955,35 +984,30 @@ static int cl_setkernelarg(gpukernel *k, unsigned int i, void *a) {
   cl_ulong temp;
   cl_long stemp;
   switch (k->types[i]) {
-  case GA_POINTER:
-    return GA_DEVSUP_ERROR;
   case GA_BUFFER:
     btmp = (gpudata *)a;
-    ctx->err = clSetKernelArg(k->k, i, sizeof(cl_mem), &btmp->buf);
+    CL_CHECK(ctx->err, clSetKernelArg(k->k, i, sizeof(cl_mem), &btmp->buf));
     k->evr[i] = &btmp->ev;
     break;
   case GA_SIZE:
     temp = *((size_t *)a);
-    ctx->err = clSetKernelArg(k->k, i, gpuarray_get_elsize(GA_ULONG), &temp);
+    CL_CHECK(ctx->err, clSetKernelArg(k->k, i, gpuarray_get_elsize(GA_ULONG), &temp));
     k->evr[i] = NULL;
     break;
   case GA_SSIZE:
     stemp = *((ssize_t *)a);
-    ctx->err = clSetKernelArg(k->k, i, gpuarray_get_elsize(GA_LONG), &stemp);
+    CL_CHECK(ctx->err, clSetKernelArg(k->k, i, gpuarray_get_elsize(GA_LONG), &stemp));
     k->evr[i] = NULL;
     break;
   default:
-    ctx->err = clSetKernelArg(k->k, i, gpuarray_get_elsize(k->types[i]), a);
+    CL_CHECK(ctx->err, clSetKernelArg(k->k, i, gpuarray_get_elsize(k->types[i]), a));
     k->evr[i] = NULL;
   }
-  if (ctx->err != CL_SUCCESS) {
-    return GA_IMPL_ERROR;
-  }
   return GA_NO_ERROR;
 }
 
 static int cl_callkernel(gpukernel *k, unsigned int n,
-                         const size_t *ls, const size_t *gs,
+                         const size_t *gs, const size_t *ls,
                          size_t shared, void **args) {
   cl_ctx *ctx = k->ctx;
   size_t _gs[3];
@@ -992,33 +1016,31 @@ static int cl_callkernel(gpukernel *k, unsigned int n,
   cl_device_id dev;
   cl_uint num_ev;
   cl_uint i;
-  int res = 0;
+  cl_int err;
 
   ASSERT_KER(k);
   ASSERT_CTX(ctx);
 
   if (n > 3)
-    return GA_VALUE_ERROR;
+    return error_set(ctx->err, GA_VALUE_ERROR, "Call with more than 3 dimensions");
 
-  dev = get_dev(ctx->ctx, &res);
-  if (dev == NULL) return res;
+  dev = get_dev(ctx->ctx, ctx->err);
+  if (dev == NULL) return ctx->err->code;
 
   if (args != NULL) {
     for (i = 0; i < k->argcount; i++) {
-      err = cl_setkernelarg(k, i, args[i]);
-      if (err != GA_NO_ERROR) return err;
+      GA_CHECK(cl_setkernelarg(k, i, args[i]));
     }
   }
 
   if (shared != 0) {
     // the shared memory pointer must be the last argument
-    ctx->err = clSetKernelArg(k->k, k->argcount, shared, NULL);
-    if (ctx->err != CL_SUCCESS) return GA_IMPL_ERROR;
+    CL_CHECK(ctx->err, clSetKernelArg(k->k, k->argcount, shared, NULL));
   }
 
   evw = calloc(sizeof(cl_event), k->argcount);
   if (evw == NULL) {
-    return GA_MEMORY_ERROR;
+    return error_sys(ctx->err, "calloc");
   }
 
   num_ev = 0;
@@ -1041,10 +1063,11 @@ static int cl_callkernel(gpukernel *k, unsigned int n,
   case 1:
     _gs[0] = gs[0] * ls[0];
   }
-  ctx->err = clEnqueueNDRangeKernel(ctx->q, k->k, n, NULL, _gs, ls,
-				    num_ev, evw, &ev);
+  err = clEnqueueNDRangeKernel(ctx->q, k->k, n, NULL, _gs, ls,
+                                    num_ev, evw, &ev);
   free(evw);
-  if (ctx->err != CL_SUCCESS) return GA_IMPL_ERROR;
+  if (err != CL_SUCCESS)
+    return error_cl(ctx->err, "clEnqueueNDRangeKernel", err);
 
   for (i = 0; i < k->argcount; i++) {
     if (k->types[i] == GA_BUFFER) {
@@ -1061,34 +1084,6 @@ static int cl_callkernel(gpukernel *k, unsigned int n,
   return GA_NO_ERROR;
 }
 
-static int cl_kernelbin(gpukernel *k, size_t *sz, void **obj) {
-  cl_ctx *ctx = k->ctx;
-  cl_program p;
-  size_t rsz;
-  void *res;
-
-  ASSERT_KER(k);
-  ASSERT_CTX(ctx);
-
-  ctx->err = clGetKernelInfo(k->k, CL_KERNEL_PROGRAM, sizeof(p), &p, NULL);
-  if (ctx->err != CL_SUCCESS)
-    return GA_IMPL_ERROR;
-  ctx->err = clGetProgramInfo(p, CL_PROGRAM_BINARY_SIZES, sizeof(rsz), &rsz, NULL);
-  if (ctx->err != CL_SUCCESS)
-    return GA_IMPL_ERROR;
-  res = malloc(rsz);
-  if (res == NULL)
-    return GA_MEMORY_ERROR;
-  ctx->err = clGetProgramInfo(p, CL_PROGRAM_BINARIES, sizeof(res), &res, NULL);
-  if (ctx->err != CL_SUCCESS) {
-    free(res);
-    return GA_IMPL_ERROR;
-  }
-  *sz = rsz;
-  *obj = res;
-  return GA_NO_ERROR;
-}
-
 static int cl_sync(gpudata *b) {
   cl_ctx *ctx = (cl_ctx *)b->ctx;
 
@@ -1096,9 +1091,7 @@ static int cl_sync(gpudata *b) {
   ASSERT_CTX(ctx);
 
   if (b->ev != NULL) {
-    ctx->err = clWaitForEvents(1, &b->ev);
-    if (ctx->err != GA_NO_ERROR)
-      return GA_IMPL_ERROR;
+    CL_CHECK(ctx->err, clWaitForEvents(1, &b->ev));
     clReleaseEvent(b->ev);
     b->ev = NULL;
   }
@@ -1110,12 +1103,10 @@ static int cl_transfer(gpudata *dst, size_t dstoff,
   ASSERT_BUF(dst);
   ASSERT_BUF(src);
 
-  return GA_UNSUPPORTED_ERROR;
+  return error_set(dst->ctx->err, GA_UNSUPPORTED_ERROR, "Operation not supported");
 }
 
-#ifdef WITH_OPENCL_CLBLAS
-extern gpuarray_blas_ops clblas_ops;
-#endif
+#define clipto_sizet(x) (((x) < SIZE_MAX) ? (x) : SIZE_MAX)  /* caps x at SIZE_MAX; x is evaluated twice */
 
 static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
                        void *res) {
@@ -1133,124 +1124,48 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
 
   if (prop_id < GA_BUFFER_PROP_START) {
     if (ctx == NULL)
-      return GA_VALUE_ERROR;
+      return error_set(global_err, GA_VALUE_ERROR, "Requesting context property with no context");
   } else if (prop_id < GA_KERNEL_PROP_START) {
     if (buf == NULL)
-      return GA_VALUE_ERROR;
+      return error_set(ctx ? ctx->err : global_err, GA_VALUE_ERROR, "Requesting buffer property with no buffer");
   } else {
     if (k == NULL)
-      return GA_VALUE_ERROR;
+      return error_set(ctx ? ctx->err : global_err, GA_VALUE_ERROR, "Requesting kernel property with no kernel");
   }
 
   switch (prop_id) {
-    char *s;
     size_t sz;
     size_t *psz;
+    cl_ulong ul;
     cl_device_id id;
     cl_uint ui;
 
   case GA_CTX_PROP_DEVNAME:
-    ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id),
-                                &id, NULL);
-    if (ctx->err != CL_SUCCESS)
-      return GA_IMPL_ERROR;
-    ctx->err = clGetDeviceInfo(id, CL_DEVICE_NAME, 0, NULL, &sz);
-    if (ctx->err != CL_SUCCESS)
-      return GA_IMPL_ERROR;
-    s = malloc(sz);
-    if (s == NULL)
-      return GA_MEMORY_ERROR;
-    ctx->err = clGetDeviceInfo(id, CL_DEVICE_NAME, sz, s, NULL);
-    if (ctx->err != CL_SUCCESS) {
-      free(s);
-      return GA_IMPL_ERROR;
-    }
-    *((char **)res) = s;
+    CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES,
+                                        sizeof(id), &id, NULL));
+    CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_NAME, 256, (char *)res,
+                                       NULL));
     return GA_NO_ERROR;
 
-  case GA_CTX_PROP_MAXLSIZE:
-    ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id),
-                                &id, NULL);
-    if (ctx->err != CL_SUCCESS)
-      return GA_IMPL_ERROR;
-    ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_ITEM_SIZES, 0, NULL,
-                               &sz);
-    if (ctx->err != CL_SUCCESS)
-      return GA_IMPL_ERROR;
-    psz = malloc(sz);
-    if (psz == NULL)
-      return GA_MEMORY_ERROR;
-    ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_ITEM_SIZES, sz, psz, NULL);
-    if (ctx->err != CL_SUCCESS) {
-      free(psz);
-      return GA_IMPL_ERROR;
-    }
-    *((size_t *)res) = psz[0];
-    free(psz);
-    return GA_NO_ERROR;
+  case GA_CTX_PROP_UNIQUE_ID:
+    return error_set(ctx->err, GA_DEVSUP_ERROR, "Can't get unique ID on OpenCL");
 
   case GA_CTX_PROP_LMEMSIZE:
-    ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id),
-                                &id, NULL);
-    if (ctx->err != CL_SUCCESS)
-      return GA_IMPL_ERROR;
-    ctx->err = clGetDeviceInfo(id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(sz), &sz,
-                               NULL);
-    if (ctx->err != CL_SUCCESS)
-      return GA_IMPL_ERROR;
-    *((size_t *)res) = sz;
+    CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES,
+                                        sizeof(id), &id, NULL));
+    CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_LOCAL_MEM_SIZE,
+                                       sizeof(ul), &ul, NULL));
+    *((size_t *)res) = clipto_sizet(ul);
     return GA_NO_ERROR;
 
   case GA_CTX_PROP_NUMPROCS:
-    ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id),
-                                &id, NULL);
-    if (ctx->err != CL_SUCCESS)
-      return GA_IMPL_ERROR;
-    ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(ui),
-                               &ui, NULL);
-    if (ctx->err != CL_SUCCESS)
-      return GA_IMPL_ERROR;
+    CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES,
+                                        sizeof(id), &id, NULL));
+    CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_MAX_COMPUTE_UNITS,
+                                       sizeof(ui), &ui, NULL));
     *((unsigned int *)res) = ui;
     return GA_NO_ERROR;
 
-  case GA_CTX_PROP_MAXGSIZE:
-    ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id,
-                                NULL);
-    if (ctx->err != GA_NO_ERROR)
-      return GA_IMPL_ERROR;
-    ctx->err = clGetDeviceInfo(id, CL_DEVICE_ADDRESS_BITS, sizeof(ui), &ui,
-                               NULL);
-    if (ctx->err != GA_NO_ERROR)
-      return GA_IMPL_ERROR;
-    ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(sz),
-                               &sz, NULL);
-    if (ctx->err != GA_NO_ERROR)
-      return GA_IMPL_ERROR;
-    if (ui == 32) {
-      sz = 4294967295UL/sz;
-    } else if (ui == 64) {
-      sz = 18446744073709551615ULL/sz;
-    } else {
-      assert(0 && "This should not be reached!");
-    }
-    *((size_t *)res) = sz;
-    return GA_NO_ERROR;
-
-  case GA_CTX_PROP_BLAS_OPS:
-#ifdef WITH_OPENCL_CLBLAS
-    *((gpuarray_blas_ops **)res) = &clblas_ops;
-    return GA_NO_ERROR;
-#else
-    *((void **)res) = NULL;
-    return GA_DEVSUP_ERROR;
-#endif
-
-  case GA_CTX_PROP_COMM_OPS:
-    // TODO Complete in the future whenif a multi-gpu collectives API for
-    // opencl appears
-    *((void **)res) = NULL;
-    return GA_DEVSUP_ERROR;
-
   case GA_CTX_PROP_BIN_ID:
     *((const char **)res) = ctx->bin_id;
     return GA_NO_ERROR;
@@ -1260,29 +1175,22 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
     return GA_NO_ERROR;
 
   case GA_CTX_PROP_TOTAL_GMEM:
-    ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id,
-                                NULL);
-    if (ctx->err != GA_NO_ERROR)
-      return GA_IMPL_ERROR;
-    ctx->err = clGetDeviceInfo(id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(sz), &sz,
-                               NULL);
-    if (ctx->err != GA_NO_ERROR)
-      return GA_IMPL_ERROR;
-    *((size_t *)res) = sz;
+    CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES,
+                                        sizeof(id), &id, NULL));
+    CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_GLOBAL_MEM_SIZE,
+                                       sizeof(ul), &ul, NULL));
+    *((size_t *)res) = clipto_sizet(ul);
     return GA_NO_ERROR;
 
   case GA_CTX_PROP_FREE_GMEM:
-    ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id,
-                                NULL);
-    if (ctx->err != GA_NO_ERROR)
-      return GA_IMPL_ERROR;
-    /* XXX: This is not exaclty the amount of free memory but there is
-       no way to query that in the OpenCL API. */
-    ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(sz),
-                               &sz, NULL);
-    if (ctx->err != GA_NO_ERROR)
-      return GA_IMPL_ERROR;
-    *((size_t *)res) = sz;
+    /* There is no way to query free memory so we just return the
+        largest block size */
+  case GA_CTX_PROP_LARGEST_MEMBLOCK:
+    CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES,
+                                        sizeof(id), &id, NULL));
+    CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                                       sizeof(ul), &ul, NULL));
+    *((size_t *)res) = clipto_sizet(ul);
     return GA_NO_ERROR;
 
   case GA_CTX_PROP_NATIVE_FLOAT16:
@@ -1308,64 +1216,28 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
     return GA_NO_ERROR;
 
   case GA_CTX_PROP_MAXLSIZE0:
-    ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id),
-                                &id, NULL);
-    if (ctx->err != CL_SUCCESS)
-      return GA_IMPL_ERROR;
-    ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_ITEM_SIZES, 0, NULL,
-                               &sz);
-    if (ctx->err != CL_SUCCESS)
-      return GA_IMPL_ERROR;
-    psz = malloc(sz);
-    if (psz == NULL)
-      return GA_MEMORY_ERROR;
-    ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_ITEM_SIZES, sz, psz, NULL);
-    if (ctx->err != CL_SUCCESS) {
-      free(psz);
-      return GA_IMPL_ERROR;
-    }
+    CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES,
+                                        sizeof(id), &id, NULL));
+    CL_GET_PROP(ctx->err, clGetDeviceInfo, id, CL_DEVICE_MAX_WORK_ITEM_SIZES,
+                psz);
     *((size_t *)res) = psz[0];
     free(psz);
     return GA_NO_ERROR;
 
   case GA_CTX_PROP_MAXLSIZE1:
-    ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id),
-                                &id, NULL);
-    if (ctx->err != CL_SUCCESS)
-      return GA_IMPL_ERROR;
-    ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_ITEM_SIZES, 0, NULL,
-                               &sz);
-    if (ctx->err != CL_SUCCESS)
-      return GA_IMPL_ERROR;
-    psz = malloc(sz);
-    if (psz == NULL)
-      return GA_MEMORY_ERROR;
-    ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_ITEM_SIZES, sz, psz, NULL);
-    if (ctx->err != CL_SUCCESS) {
-      free(psz);
-      return GA_IMPL_ERROR;
-    }
+    CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES,
+                                        sizeof(id), &id, NULL));
+    CL_GET_PROP(ctx->err, clGetDeviceInfo, id, CL_DEVICE_MAX_WORK_ITEM_SIZES,
+                psz);
     *((size_t *)res) = psz[1];
     free(psz);
     return GA_NO_ERROR;
 
   case GA_CTX_PROP_MAXLSIZE2:
-    ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id),
-                                &id, NULL);
-    if (ctx->err != CL_SUCCESS)
-      return GA_IMPL_ERROR;
-    ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_ITEM_SIZES, 0, NULL,
-                               &sz);
-    if (ctx->err != CL_SUCCESS)
-      return GA_IMPL_ERROR;
-    psz = malloc(sz);
-    if (psz == NULL)
-      return GA_MEMORY_ERROR;
-    ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_ITEM_SIZES, sz, psz, NULL);
-    if (ctx->err != CL_SUCCESS) {
-      free(psz);
-      return GA_IMPL_ERROR;
-    }
+    CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES,
+                                        sizeof(id), &id, NULL));
+    CL_GET_PROP(ctx->err, clGetDeviceInfo, id, CL_DEVICE_MAX_WORK_ITEM_SIZES,
+                psz);
     *((size_t *)res) = psz[2];
     free(psz);
     return GA_NO_ERROR;
@@ -1375,10 +1247,8 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
     return GA_NO_ERROR;
 
   case GA_BUFFER_PROP_SIZE:
-    ctx->err = clGetMemObjectInfo(buf->buf, CL_MEM_SIZE, sizeof(sz), &sz,
-                                  NULL);
-    if (ctx->err != CL_SUCCESS)
-      return GA_IMPL_ERROR;
+    CL_CHECK(ctx->err, clGetMemObjectInfo(buf->buf, CL_MEM_SIZE, sizeof(sz),
+                                          &sz, NULL));
     *((size_t *)res) = sz;
     return GA_NO_ERROR;
 
@@ -1389,43 +1259,20 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
     return GA_NO_ERROR;
 
   case GA_KERNEL_PROP_MAXLSIZE:
-    ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id),
-                                &id, NULL);
-    if (ctx->err != GA_NO_ERROR)
-      return GA_IMPL_ERROR;
-    ctx->err = clGetKernelWorkGroupInfo(k->k, id, CL_KERNEL_WORK_GROUP_SIZE,
-                                        sizeof(sz), &sz, NULL);
-    if (ctx->err != GA_NO_ERROR)
-      return GA_IMPL_ERROR;
+    CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES,
+                                        sizeof(id), &id, NULL));
+    CL_CHECK(ctx->err, clGetKernelWorkGroupInfo(k->k, id,
+                                                CL_KERNEL_WORK_GROUP_SIZE,
+                                                sizeof(sz), &sz, NULL));
     *((size_t *)res) = sz;
     return GA_NO_ERROR;
 
   case GA_KERNEL_PROP_PREFLSIZE:
-    ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id),
-                                &id, NULL);
-    if (ctx->err != GA_NO_ERROR)
-      return GA_IMPL_ERROR;
-#ifdef CL_VERSION_1_1
-    ctx->err = clGetKernelWorkGroupInfo(k->k, id,
-                                CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
-                                        sizeof(sz), &sz, NULL);
-    if (ctx->err != GA_NO_ERROR)
-      return GA_IMPL_ERROR;
-#else
-    ctx->err = clGetKernelWorkGroupInfo(k->k, id, CL_KERNEL_WORK_GROUP_SIZE,
-                                        sizeof(sz), &sz, NULL);
-    if (ctx->err != GA_NO_ERROR)
-      return GA_IMPL_ERROR;
-    /*
-      This is sort of a guess, AMD generally has 64 and NVIDIA has 32.
-      Since this is a multiple, it would not hurt a lot to overestimate
-      unless we go over the maximum. However underestimating may hurt
-      performance due to the way we do the automatic allocation.
-
-      Also OpenCL 1.0 kind of sucks and this is only used for that.
-    */
-    sz = (sz < 64) ? sz : 64;
-#endif
+    CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES,
+                                        sizeof(id), &id, NULL));
+    CL_CHECK(ctx->err, clGetKernelWorkGroupInfo(k->k, id,
+                                                CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
+                                                sizeof(sz), &sz, NULL));
     *((size_t *)res) = sz;
     return GA_NO_ERROR;
 
@@ -1438,20 +1285,20 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
     return GA_NO_ERROR;
 
   default:
-    return GA_INVALID_ERROR;
+    return error_fmt(ctx->err, GA_INVALID_ERROR, "Invalid property: %d", prop_id);
   }
 }
 
 static const char *cl_error(gpucontext *c) {
   cl_ctx *ctx = (cl_ctx *)c;
-  if (ctx == NULL)
-    return get_error_string(err);
-  else
+  if (ctx == NULL){
+    return global_err->msg;  /* no context given: report the message held in global_err */
+  } else {
     ASSERT_CTX(ctx);
-    return get_error_string(ctx->err);
+    return ctx->err->msg;  /* message held in this context's error object */
+  }
 }
 
-GPUARRAY_LOCAL
 const gpuarray_buffer_ops opencl_ops = {cl_get_platform_count,
                                         cl_get_device_count,
                                         cl_init,
@@ -1469,7 +1316,6 @@ const gpuarray_buffer_ops opencl_ops = {cl_get_platform_count,
                                         cl_releasekernel,
                                         cl_setkernelarg,
                                         cl_callkernel,
-                                        cl_kernelbin,
                                         cl_sync,
                                         cl_transfer,
                                         cl_property,
diff --git a/src/gpuarray_collectives_cuda_nccl.c b/src/gpuarray_collectives_cuda_nccl.c
index 3a15156c82..de80b715b6 100644
--- a/src/gpuarray_collectives_cuda_nccl.c
+++ b/src/gpuarray_collectives_cuda_nccl.c
@@ -2,7 +2,7 @@
 #include <limits.h>
 #include <stdlib.h>
 
-#include <nccl.h>
+#include "loaders/libnccl.h"
 
 #include "gpuarray/buffer_collectives.h"
 #include "gpuarray/config.h"
@@ -12,18 +12,25 @@
 #include "private.h"
 #include "private_cuda.h"
 
+/**
+ * Store an NCCL failure in `e` as GA_COMM_ERROR with the message
+ * "<msg>: <ncclGetErrorString(err)>" and return error_fmt()'s result.
+ */
+static inline int error_nccl(error *e, const char *msg, ncclResult_t err) {
+  return error_fmt(e, GA_COMM_ERROR, "%s: %s", msg, ncclGetErrorString(err));
+}
+
 /**
  * Execute `cmd` and return appropriate code. Save a describing error message in
  * context.
  */
-#define NCCL_CHKFAIL(ctx, cmd)                         \
-  do {                                                 \
-    ncclResult_t nccl_err = (cmd);                     \
-    if (nccl_err != ncclSuccess) {                     \
-      (ctx)->error_msg = ncclGetErrorString(nccl_err); \
-      return GA_COMM_ERROR;                            \
-    }                                                  \
-    return GA_NO_ERROR;                                \
+#define NCCL_CHKFAIL(ctx, cmd)                           \
+  do {                                                   \
+    /* #cmd embeds the call's source text in the msg */  \
+    const ncclResult_t nccl_rc_ = (cmd);                 \
+    return (nccl_rc_ == ncclSuccess)                     \
+               ? GA_NO_ERROR                             \
+               : error_nccl((ctx)->err, #cmd, nccl_rc_); \
   } while (0)
 
 /**
@@ -31,14 +34,13 @@
  * context. Exit from context and return \ref GA_COMM_ERROR if nccl does not
  * succeed.
  */
-#define NCCL_EXIT_ON_ERROR(ctx, cmd)                   \
-  do {                                                 \
-    ncclResult_t nccl_err = (cmd);                     \
-    if (nccl_err != ncclSuccess) {                     \
-      cuda_exit((ctx));                                \
-      (ctx)->error_msg = ncclGetErrorString(nccl_err); \
-      return GA_COMM_ERROR;                            \
-    }                                                  \
+#define NCCL_EXIT_ON_ERROR(ctx, cmd)                \
+  do {                                              \
+    const ncclResult_t nccl_rc_ = (cmd);            \
+    if (nccl_rc_ == ncclSuccess) break;             \
+    /* failed: exit the context, then report */     \
+    cuda_exit((ctx));                               \
+    return error_nccl((ctx)->err, #cmd, nccl_rc_);  \
   } while (0)
 
 //!< Link wrapped cuda core operations
@@ -47,8 +49,6 @@ extern const gpuarray_buffer_ops cuda_ops;
 /**
  * Definition of struct _gpucomm
  *
- * Done here in order to avoid ifdefs concerning nccl's existance in core code.
- *
  * \note This must be the only "module" which manages the definition's contents.
  */
 struct _gpucomm {
@@ -59,11 +59,21 @@ struct _gpucomm {
 #endif
 };
 
+static int setup_done = 0;  /* non-zero once load_libnccl() has succeeded */
+
+static int setup_lib(error *e) {  /* one-time NCCL loader setup; errors are reported through `e` */
+  if (!setup_done) {
+    GA_CHECK(load_libnccl(e));  /* GA_CHECK presumably returns on error, leaving setup_done at 0 */
+    setup_done = 1;
+  }
+  return GA_NO_ERROR;
+}
+
 /**
  * \brief Helper function to dereference a `comm`'s context and free memory
  */
-static void comm_clear(gpucomm* comm) {
-  cuda_ops.buffer_deinit((gpucontext*)comm->ctx);
+static void comm_clear(gpucomm *comm) {
+  gpucontext_deref((gpucontext *)comm->ctx);  /* presumably releases the reference comm_new() took with refcnt++ */
   CLEAR(comm);
   free(comm);
 }
@@ -71,29 +81,31 @@ static void comm_clear(gpucomm* comm) {
 /**
  * \brief NCCL implementation of \ref gpucomm_new.
  */
-static int comm_new(gpucomm** comm_ptr, gpucontext* ctx,
+static int comm_new(gpucomm **comm_ptr, gpucontext *ctx,
                     gpucommCliqueId comm_id, int ndev, int rank) {
-  gpucomm* comm;
-  ncclResult_t nccl_err;
+  gpucomm *comm;
+  ncclResult_t err;
 
   ASSERT_CTX(ctx);
+
+  GA_CHECK(setup_lib(ctx->err));
+
   comm = calloc(1, sizeof(*comm));  // Allocate memory
   if (comm == NULL) {
     *comm_ptr = NULL;  // Set to NULL if failed
-    return GA_MEMORY_ERROR;
+    return error_sys(ctx->err, "calloc");
   }
-  comm->ctx = (cuda_context*)ctx;  // convert to underlying cuda context
+  comm->ctx = (cuda_context *)ctx;  // convert to underlying cuda context
   // So that context would not be destroyed before communicator
   comm->ctx->refcnt++;
   cuda_enter(comm->ctx);  // Use device
-  nccl_err = ncclCommInitRank(&comm->c, ndev, *((ncclUniqueId*)&comm_id), rank);
+  err = ncclCommInitRank(&comm->c, ndev, *((ncclUniqueId *)&comm_id), rank);
   cuda_exit(comm->ctx);
   TAG_COMM(comm);
-  if (nccl_err != ncclSuccess) {
+  if (err != ncclSuccess) {
     *comm_ptr = NULL;  // Set to NULL if failed
     comm_clear(comm);
-    ctx->error_msg = ncclGetErrorString(nccl_err);
-    return GA_COMM_ERROR;
+    return error_nccl(ctx->err, "ncclCommInitRank", err);
   }
   *comm_ptr = comm;
   return GA_NO_ERROR;
@@ -102,7 +114,7 @@ static int comm_new(gpucomm** comm_ptr, gpucontext* ctx,
 /**
  * \brief NCCL implementation of \ref gpucomm_free.
  */
-static void comm_free(gpucomm* comm) {
+static void comm_free(gpucomm *comm) {
   ASSERT_COMM(comm);
   cuda_enter(comm->ctx);
   ncclCommDestroy(comm->c);
@@ -113,15 +125,17 @@ static void comm_free(gpucomm* comm) {
 /**
  * \brief NCCL implementation of \ref gpucomm_gen_clique_id.
  */
-static int generate_clique_id(gpucontext* c, gpucommCliqueId* comm_id) {
+static int generate_clique_id(gpucontext *c, gpucommCliqueId *comm_id) {
   ASSERT_CTX(c);
-  NCCL_CHKFAIL(c, ncclGetUniqueId((ncclUniqueId*)comm_id));
+
+  GA_CHECK(setup_lib(c->err)); /* libnccl has to be loaded before any nccl call */
+  NCCL_CHKFAIL(c, ncclGetUniqueId((ncclUniqueId *)comm_id)); /* presumably supplies the return */
 }
 
 /**
  * \brief NCCL implementation of \ref gpucomm_get_count.
  */
-static int get_count(const gpucomm* comm, int* gpucount) {
+static int get_count(const gpucomm *comm, int *gpucount) { /* result is written to *gpucount */
   ASSERT_COMM(comm);
   NCCL_CHKFAIL(comm->ctx, ncclCommCount(comm->c, gpucount));
 }
@@ -129,17 +143,17 @@ static int get_count(const gpucomm* comm, int* gpucount) {
 /**
  * \brief NCCL implementation of \ref gpucomm_get_rank.
  */
-static int get_rank(const gpucomm* comm, int* rank) {
+static int get_rank(const gpucomm *comm, int *rank) { /* result is written to *rank */
   ASSERT_COMM(comm);
   NCCL_CHKFAIL(comm->ctx, ncclCommUserRank(comm->c, rank));
 }
 
 /**
- * \brief Helper function to try to convert \ref enum _gpucomm_reduce_ops to
+ * \brief Helper function to try to convert \ref enum gpucomm_reduce_ops to
  * \ref
  * ncclRedOp_t.
  *
- * If invalid, return `nccl_NUM_OPS`.
+ * If invalid, return `ncclNumOps`.
  */
 static inline ncclRedOp_t convert_reduce_op(int opcode) {
   switch (opcode) {
@@ -148,28 +162,26 @@ static inline ncclRedOp_t convert_reduce_op(int opcode) {
   case GA_MAX: return ncclMax;
   case GA_MIN: return ncclMin;
   }
-  return nccl_NUM_OPS;
+  return ncclNumOps;
 }
 
 /**
  * \brief Helper function to try to convert \ref enum GPUARRAY_TYPES to \ref
  * ncclDataType_t.
  *
- * If invalid, return `nccl_NUM_TYPES`.
+ * If invalid, return `ncclNumTypes`.
  */
 static inline ncclDataType_t convert_data_type(int typecode) {
   switch (typecode) {
   case GA_BYTE: return ncclChar;
   case GA_INT: return ncclInt;
-#ifdef CUDA_HAS_HALF
-  case GA_HALF: return ncclHalf;
-#endif  // CUDA_HAS_HALF
   case GA_FLOAT: return ncclFloat;
   case GA_DOUBLE: return ncclDouble;
   case GA_LONG: return ncclInt64;
   case GA_ULONG: return ncclUint64;
+  case GA_HALF: return ncclHalf; /* NOTE(review): unconditional; assumes ncclHalf is always declared */
   }
-  return nccl_NUM_TYPES;
+  return ncclNumTypes; /* sentinel: typecode matched none of the cases above */
 }
 
 /**
@@ -177,32 +189,33 @@ static inline ncclDataType_t convert_data_type(int typecode) {
  * nccl
  * collective operations.
  */
-static inline int check_restrictions(gpudata* src, size_t offsrc, gpudata* dest,
-                                     size_t offdest, size_t count, int typecode,
-                                     int opcode, gpucomm* comm,
-                                     ncclDataType_t* datatype,
-                                     ncclRedOp_t* op) {
+static inline int check_restrictions(gpudata *src, size_t offsrc,
+                                     gpudata *dest, size_t offdest,
+                                     size_t count, int typecode,
+                                     int opcode, gpucomm *comm,
+                                     ncclDataType_t *datatype,
+                                     ncclRedOp_t *op) {
   size_t op_size;
   // Check if count is larger than INT_MAX
   // TODO remove whenif nccl adapts to size_t
   if (count > INT_MAX)
-    return GA_UNSUPPORTED_ERROR;
+    return error_set(comm->ctx->err, GA_XLARGE_ERROR, "Count too large for int");
   // src, dest and comm must refer to the same context
   if (src->ctx != comm->ctx)
-    return GA_VALUE_ERROR;
+    return error_set(comm->ctx->err, GA_VALUE_ERROR, "source and comm context differ");
   if (dest != NULL && dest->ctx != comm->ctx)
-    return GA_VALUE_ERROR;
+    return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination and comm context differ");
   // typecode must correspond to a valid ncclDataType_t
   if (datatype != NULL) {
     *datatype = convert_data_type(typecode);
-    if (*datatype == nccl_NUM_TYPES)
-      return GA_INVALID_ERROR;
+    if (*datatype == ncclNumTypes)
+      return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid data type");
   }
   // opcode must correspond to a valid ncclRedOp_t
   if (op != NULL) {
     *op = convert_reduce_op(opcode);
-    if (*op == nccl_NUM_OPS)
-      return GA_INVALID_ERROR;
+    if (*op == ncclNumOps)
+      return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid reduce op");
   }
   // offsets must not be larger than gpudata's size itself
   // (else out of alloc-ed mem scope)
@@ -211,23 +224,24 @@ static inline int check_restrictions(gpudata* src, size_t offsrc, gpudata* dest,
   // size to operate upon must be able to fit inside the gpudata (incl offsets)
   op_size = count * gpuarray_get_elsize(typecode);
   if ((src->sz - offsrc) < op_size)
-    return GA_VALUE_ERROR;
+    return error_set(comm->ctx->err, GA_VALUE_ERROR, "source too small for operation");
   if (dest != NULL && (dest->sz - offdest) < op_size)
-    return GA_VALUE_ERROR;
+    return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination too small for operation");
   return GA_NO_ERROR;
 }
 
 /**
  * \brief NCCL implementation of \ref gpucomm_reduce.
  */
-static int reduce(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest,
+static int reduce(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest,
                   size_t count, int typecode, int opcode, int root,
-                  gpucomm* comm) {
-  ncclRedOp_t op;
-  ncclDataType_t datatype;
-  gpudata* dst = NULL;
+                  gpucomm *comm) {
+  // need dummy init so that compiler shuts up
+  ncclRedOp_t op = ncclNumOps;
+  ncclDataType_t datatype = ncclNumTypes;
+  gpudata *dst = NULL;
   int rank = 0;
-  cuda_context* ctx;
+  cuda_context *ctx;
 
   ASSERT_BUF(src);
   ASSERT_COMM(comm);
@@ -250,11 +264,11 @@ static int reduce(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest,
 
   // change stream of nccl ops to enable concurrency
   if (rank == root)
-    NCCL_EXIT_ON_ERROR(ctx, ncclReduce((void*)(src->ptr + offsrc),
-                                       (void*)(dest->ptr + offdest), count,
+    NCCL_EXIT_ON_ERROR(ctx, ncclReduce((void *)(src->ptr + offsrc),
+                                       (void *)(dest->ptr + offdest), count,
                                        datatype, op, root, comm->c, ctx->s));
   else
-    NCCL_EXIT_ON_ERROR(ctx, ncclReduce((void*)(src->ptr + offsrc), NULL, count,
+    NCCL_EXIT_ON_ERROR(ctx, ncclReduce((void *)(src->ptr + offsrc), NULL, count,
                                        datatype, op, root, comm->c, ctx->s));
 
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
@@ -269,12 +283,13 @@ static int reduce(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest,
 /**
  * \brief NCCL implementation of \ref gpucomm_all_reduce.
  */
-static int all_reduce(gpudata* src, size_t offsrc, gpudata* dest,
+static int all_reduce(gpudata *src, size_t offsrc, gpudata *dest,
                       size_t offdest, size_t count, int typecode, int opcode,
-                      gpucomm* comm) {
-  ncclRedOp_t op;
-  ncclDataType_t datatype;
-  cuda_context* ctx;
+                      gpucomm *comm) {
+  // need dummy init so that compiler shuts up
+  ncclRedOp_t op = ncclNumOps;
+  ncclDataType_t datatype = ncclNumTypes;
+  cuda_context *ctx;
 
   ASSERT_BUF(src);
   ASSERT_COMM(comm);
@@ -291,8 +306,8 @@ static int all_reduce(gpudata* src, size_t offsrc, gpudata* dest,
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE));
 
   // change stream of nccl ops to enable concurrency
-  NCCL_EXIT_ON_ERROR(ctx, ncclAllReduce((void*)(src->ptr + offsrc),
-                                        (void*)(dest->ptr + offdest), count,
+  NCCL_EXIT_ON_ERROR(ctx, ncclAllReduce((void *)(src->ptr + offsrc),
+                                        (void *)(dest->ptr + offdest), count,
                                         datatype, op, comm->c, ctx->s));
 
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
@@ -306,14 +321,15 @@ static int all_reduce(gpudata* src, size_t offsrc, gpudata* dest,
 /**
  * \brief NCCL implementation of \ref gpucomm_reduce_scatter.
  */
-static int reduce_scatter(gpudata* src, size_t offsrc, gpudata* dest,
+static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest,
                           size_t offdest, size_t count, int typecode,
-                          int opcode, gpucomm* comm) {
-  ncclRedOp_t op;
-  ncclDataType_t datatype;
+                          int opcode, gpucomm *comm) {
+  // need dummy init so that compiler shuts up
+  ncclRedOp_t op = ncclNumOps;
+  ncclDataType_t datatype = ncclNumTypes;
   int ndev = 0;
   size_t resc_size;
-  cuda_context* ctx;
+  cuda_context *ctx;
 
   ASSERT_BUF(src);
   ASSERT_COMM(comm);
@@ -322,10 +338,10 @@ static int reduce_scatter(gpudata* src, size_t offsrc, gpudata* dest,
   GA_CHECK(check_restrictions(src, offsrc, NULL, 0, count * ndev, typecode,
                               opcode, comm, &datatype, &op));
   if (dest->ctx != comm->ctx)
-    return GA_VALUE_ERROR;
+    return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination and comm context differ");
   resc_size = count * gpuarray_get_elsize(typecode);
   if ((dest->sz - offdest) < resc_size)
-    return GA_VALUE_ERROR;
+    return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination too small for operation");
   assert(!(offdest > dest->sz));
 
   ctx = comm->ctx;
@@ -337,8 +353,8 @@ static int reduce_scatter(gpudata* src, size_t offsrc, gpudata* dest,
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE));
 
   // change stream of nccl ops to enable concurrency
-  NCCL_EXIT_ON_ERROR(ctx, ncclReduceScatter((void*)(src->ptr + offsrc),
-                                            (void*)(dest->ptr + offdest), count,
+  NCCL_EXIT_ON_ERROR(ctx, ncclReduceScatter((void *)(src->ptr + offsrc),
+                                            (void *)(dest->ptr + offdest), count,
                                             datatype, op, comm->c, ctx->s));
 
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
@@ -352,11 +368,12 @@ static int reduce_scatter(gpudata* src, size_t offsrc, gpudata* dest,
 /**
  * \brief NCCL implementation of \ref gpucomm_broadcast.
  */
-static int broadcast(gpudata* array, size_t offset, size_t count, int typecode,
-                     int root, gpucomm* comm) {
-  ncclDataType_t datatype;
+static int broadcast(gpudata *array, size_t offset, size_t count, int typecode,
+                     int root, gpucomm *comm) {
+  // need dummy init so that compiler shuts up
+  ncclDataType_t datatype = ncclNumTypes;
   int rank = 0;
-  cuda_context* ctx;
+  cuda_context *ctx;
 
   ASSERT_BUF(array);
   ASSERT_COMM(comm);
@@ -374,7 +391,7 @@ static int broadcast(gpudata* array, size_t offset, size_t count, int typecode,
     GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(array, CUDA_WAIT_WRITE));
 
   // change stream of nccl ops to enable concurrency
-  NCCL_EXIT_ON_ERROR(ctx, ncclBcast((void*)(array->ptr + offset), count,
+  NCCL_EXIT_ON_ERROR(ctx, ncclBcast((void *)(array->ptr + offset), count,
                                     datatype, root, comm->c, ctx->s));
 
   if (rank == root)
@@ -390,13 +407,14 @@ static int broadcast(gpudata* array, size_t offset, size_t count, int typecode,
 /**
  * \brief NCCL implementation of \ref gpucomm_all_gather.
  */
-static int all_gather(gpudata* src, size_t offsrc, gpudata* dest,
+static int all_gather(gpudata *src, size_t offsrc, gpudata *dest,
                       size_t offdest, size_t count, int typecode,
-                      gpucomm* comm) {
-  ncclDataType_t datatype;
+                      gpucomm *comm) {
+  // need dummy init so that compiler shuts up
+  ncclDataType_t datatype = ncclNumTypes;
   int ndev = 0;
   size_t resc_size;
-  cuda_context* ctx;
+  cuda_context *ctx;
 
   ASSERT_BUF(src);
   ASSERT_COMM(comm);
@@ -404,11 +422,11 @@ static int all_gather(gpudata* src, size_t offsrc, gpudata* dest,
   GA_CHECK(check_restrictions(src, offsrc, NULL, 0, count, typecode, 0, comm,
                               &datatype, NULL));
   if (dest->ctx != comm->ctx)
-    return GA_VALUE_ERROR;
+    return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination and comm context differ");
   GA_CHECK(get_count(comm, &ndev));
   resc_size = ndev * count * gpuarray_get_elsize(typecode);
   if ((dest->sz - offdest) < resc_size)
-    return GA_VALUE_ERROR;
+    return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination too small for operation");
   assert(!(offdest > dest->sz));
 
   ctx = comm->ctx;
@@ -421,8 +439,8 @@ static int all_gather(gpudata* src, size_t offsrc, gpudata* dest,
 
   // change stream of nccl ops to enable concurrency
   NCCL_EXIT_ON_ERROR(
-      ctx, ncclAllGather((void*)(src->ptr + offsrc), count, datatype,
-                         (void*)(dest->ptr + offdest), comm->c, ctx->s));
+      ctx, ncclAllGather((void *)(src->ptr + offsrc),
+			 (void *)(dest->ptr + offdest), count, datatype, comm->c, ctx->s));
 
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE));
@@ -437,6 +455,6 @@ static int all_gather(gpudata* src, size_t offsrc, gpudata* dest,
  * linked in \ref gpuarray_buffer_cuda.c, in order to fill a /ref gpucontext's
  * comm_ops.
  */
-GPUARRAY_LOCAL gpuarray_comm_ops nccl_ops = {
+gpuarray_comm_ops nccl_ops = { /* positional initializer: entry order is significant */
     comm_new, comm_free,  generate_clique_id, get_count, get_rank,
     reduce,   all_reduce, reduce_scatter,     broadcast, all_gather};
diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c
index 14a1db5556..776b386fcd 100644
--- a/src/gpuarray_elemwise.c
+++ b/src/gpuarray_elemwise.c
@@ -31,6 +31,7 @@ struct _GpuElemwise {
 STATIC_ASSERT(GEN_CONVERT_F16 == GE_CONVERT_F16, same_flags_value_elem1);
 
 #define is_array(a) (ISCLR((a).flags, GE_SCALAR))
+#define is_output(a) (ISSET((a).flags, GE_WRITE)) /* true when the arg has GE_WRITE set */
 
 static inline int k_initialized(GpuKernel *k) {
   return k->k != NULL;
@@ -130,9 +131,9 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx,
   strb sb = STRB_STATIC_INIT;
   unsigned int i, _i, j;
   int *ktypes;
-  size_t p;
   char *size = "ga_size", *ssize = "ga_ssize";
-  int flags = GA_USE_CLUDA;
+  unsigned int p;
+  int flags = 0;
   int res;
 
   if (ISSET(gen_flags, GEN_ADDR32)) {
@@ -149,10 +150,11 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx,
 
   ktypes = calloc(p, sizeof(int));
   if (ktypes == NULL)
-    return GA_MEMORY_ERROR;
+    return error_sys(ctx->err, "calloc");
 
   p = 0;
 
+  strb_appends(&sb, "#include \"cluda.h\"\n");
   if (preamble)
     strb_appends(&sb, preamble);
   strb_appends(&sb, "\nKERNEL void elem(const ga_size n, ");
@@ -175,7 +177,7 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx,
         ktypes[p++] = GA_SSIZE;
       }
     } else {
-      strb_appendf(&sb, "%s %s", ctype(args[i].typecode), args[j].name);
+      strb_appendf(&sb, "%s %s", ctype(args[j].typecode), args[j].name);
       ktypes[p++] = args[j].typecode;
     }
     if (j != (n - 1)) strb_appends(&sb, ", ");
@@ -211,7 +213,7 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx,
                                         GA_FLOAT : args[j].typecode), args[j].name);
       if (ISSET(args[j].flags, GE_READ)) {
         if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) {
-          strb_appendf(&sb, "%s = load_half((GLOBAL_MEM ga_half *)(((GLOBAL_MEM char *)%s_data) + %s_p));\n",
+          strb_appendf(&sb, "%s = ga_half2float(*(GLOBAL_MEM ga_half *)(((GLOBAL_MEM char *)%s_data) + %s_p));\n",
                        args[j].name, args[j].name, args[j].name);
         } else {
           strb_appendf(&sb, "%s = *(GLOBAL_MEM %s *)(((GLOBAL_MEM char *)%s_data) + %s_p);\n",
@@ -225,7 +227,7 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx,
   for (j = 0; j < n; j++) {
     if (is_array(args[j]) && ISSET(args[j].flags, GE_WRITE)) {
       if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) {
-        strb_appendf(&sb, "store_half((GLOBAL_MEM ga_half *)(((GLOBAL_MEM char *)%s_data) + %s_p), %s);\n",
+        strb_appendf(&sb, "*(GLOBAL_MEM ga_half *)(((GLOBAL_MEM char *)%s_data) + %s_p) = ga_float2half(%s);\n",
                      args[j].name, args[j].name, args[j].name);
       } else {
         strb_appendf(&sb, "*(GLOBAL_MEM %s *)(((GLOBAL_MEM char *)%s_data) + %s_p) = %s;\n",
@@ -271,50 +273,62 @@ static int check_basic(GpuElemwise *ge, void **args, int flags,
                        size_t *_n, unsigned int *_nd, size_t **_dims,
                        ssize_t ***_strides, int *_call32) {
   size_t n;
+  gpucontext *ctx = GpuKernel_context(&ge->k_contig);
   GpuArray *a = NULL, *v;
   unsigned int i, j, p, num_arrays = 0, nd = 0, nnd;
   int call32 = 1;
+  unsigned int nd_i = 0;
+  size_t v_dim_j = 0;
 
   /* Go through the list and grab some info */
   for (i = 0; i < ge->n; i++) {
     if (is_array(ge->args[i])) {
-      num_arrays++;
-      if (a == NULL) {
-        a = (GpuArray *)args[i];
-        nd = a->nd;
+      nd_i = ((GpuArray *)args[i])->nd;
+      if (num_arrays == 0)
+        nd = nd_i;
+      else if (nd_i != nd) {
+        if (flags & GE_PADSHAPE)
+          nd = nd_i > nd ? nd_i : nd;
+        else
+          return error_fmt(ctx->err, GA_VALUE_ERROR, "Arg %u has differing nd = %u", i, nd_i);
       }
-      if (((GpuArray *)args[i])->nd != nd)
-        return GA_VALUE_ERROR;
+      ++num_arrays;
+      if (a == NULL && is_output(ge->args[i]))
+        a = (GpuArray *)args[i];
     }
   }
 
   if (a == NULL)
-    return GA_VALUE_ERROR;
+    return error_set(ctx->err, GA_VALUE_ERROR, "No output arrays");
 
   /* Check if we need to grow the internal buffers */
   if (nd > ge->nd) {
     nnd = ge->nd * 2;
     while (nd > nnd) nnd *= 2;
     if (ge_grow(ge, nnd))
-      return GA_MEMORY_ERROR;
+      return error_sys(ctx->err, "ge_grow");
   }
 
-  /* Now we know that all array arguments have the same number of
-     dimensions */
+  /* Now we know that all array arguments have at most nd
+     dimensions and that the expected output size is the size of a */
 
   /* And copy their initial values in */
   memcpy(ge->dims, a->dimensions, nd*sizeof(size_t));
   p = 0;
   for (i = 0; i < ge->n; i++) {
     if (is_array(ge->args[i])) {
-      memcpy(ge->strides[p], ((GpuArray *)args[i])->strides, nd*sizeof(ssize_t));
+      /* Left-pad strides with zero on implicitly broadcasted dimensions */
+      memset(ge->strides[p], 0, nd*sizeof(ssize_t));
+      nd_i = ((GpuArray *)args[i])->nd;
+      memcpy((char *)(ge->strides[p]) + (nd - nd_i)*sizeof(ssize_t),
+             ((GpuArray *)args[i])->strides, nd_i*sizeof(ssize_t));
       p++;
     }
   }
 
   /* Check that all arrays are the same size (or broadcast-compatible
-     if GE_BROADCAST).  Also compute the total size and adjust strides
-     of broadcastable dimensions.
+     if GE_BROADCAST), adjust strides of broadcastable dimensions and
+     check if we can use the 32 bit address version.
 
      Basically for each dimension go over all the arguments and make
      sure that the dimension size matches. */
@@ -324,22 +338,23 @@ static int check_basic(GpuElemwise *ge, void **args, int flags,
     for (i = 0; i < ge->n; i++) {
       if (is_array(ge->args[i])) {
         v = (GpuArray *)args[i];
-        if (ge->dims[j] != v->dimensions[j]) {
-          if (ISCLR(flags, GE_BROADCAST)) {
-            return GA_VALUE_ERROR;
-          }
-          /* GE_BROADCAST is set */
-          if (ge->dims[j] == 1) {
-            ge->dims[j] = v->dimensions[j];
-          } else {
-            if (v->dimensions[j] != 1) {
-              return GA_VALUE_ERROR;
-            }
+        nd_i = v->nd;
+        /* Pad shape with 1 if needed for implicitly broadcasted dimensions
+           and shift if needed */
+        if (j < nd - nd_i)
+          v_dim_j = 1;
+        else
+          v_dim_j = v->dimensions[j - (nd - nd_i)];
+        if (ge->dims[j] != v_dim_j) {
+          /* We can't broadcast outputs */
+          if (ISCLR(flags, GE_BROADCAST) || is_output(ge->args[i]) ||
+              v_dim_j != 1) {
+            return error_fmt(ctx->err, GA_VALUE_ERROR, "Mismatched dimension %u for input %u (expected %" SPREFIX "u got %" SPREFIX "u)", j, i, ge->dims[j], v_dim_j);
           }
         }
         /* If the dimension is 1 set the strides to 0 regardless since
            it won't change anything in the non-broadcast case. */
-        if (v->dimensions[j] == 1) {
+        if (v_dim_j == 1) {
           ge->strides[p][j] = 0;
         }
         call32 &= v->offset < ADDR32_MAX;
@@ -371,10 +386,10 @@ static int call_basic(GpuElemwise *ge, void **args, size_t n, unsigned int nd,
                       size_t *dims, ssize_t **strs, int call32) {
   GpuKernel *k;
   size_t ls = 0, gs = 0;
-  unsigned int p = 0, i, j;
+  unsigned int p = 0, i, j, l;
   int err;
 
-  if (nd == 0) return GA_VALUE_ERROR;
+  if (nd == 0) return error_set(GpuKernel_context(&ge->k_contig)->err, GA_VALUE_ERROR, "nd == 0");
 
   if (call32)
     k = &ge->k_basic_32[nd-1];
@@ -391,35 +406,38 @@ static int call_basic(GpuElemwise *ge, void **args, size_t n, unsigned int nd,
   }
 
   err = GpuKernel_setarg(k, p++, &n);
-  if (err != GA_NO_ERROR) goto error;
+  if (err != GA_NO_ERROR) goto error_call_basic;
 
   for (i = 0; i < nd; i++) {
     err = GpuKernel_setarg(k, p++, &dims[i]);
-    if (err != GA_NO_ERROR) goto error;
+    if (err != GA_NO_ERROR) goto error_call_basic;
   }
 
+  /* l is the number of arrays to date */
+  l = 0;
   for (j = 0; j < ge->n; j++) {
     if (is_array(ge->args[j])) {
       GpuArray *v = (GpuArray *)args[j];
       err = GpuKernel_setarg(k, p++, v->data);
-      if (err != GA_NO_ERROR) goto error;
+      if (err != GA_NO_ERROR) goto error_call_basic;
       err = GpuKernel_setarg(k, p++, &v->offset);
-      if (err != GA_NO_ERROR) goto error;
+      if (err != GA_NO_ERROR) goto error_call_basic;
       for (i = 0; i < nd; i++) {
-        err = GpuKernel_setarg(k, p++, &strs[j][i]);
-        if (err != GA_NO_ERROR) goto error;
+        err = GpuKernel_setarg(k, p++, &strs[l][i]);
+        if (err != GA_NO_ERROR) goto error_call_basic;
       }
+      l++;
     } else {
       err = GpuKernel_setarg(k, p++, args[j]);
-      if (err != GA_NO_ERROR) goto error;
+      if (err != GA_NO_ERROR) goto error_call_basic;
     }
   }
 
-  err = GpuKernel_sched(k, n, &ls, &gs);
-  if (err != GA_NO_ERROR) goto error;
+  err = GpuKernel_sched(k, n, &gs, &ls);
+  if (err != GA_NO_ERROR) goto error_call_basic;
 
-  err = GpuKernel_call(k, 1, &ls, &gs, 0, NULL);
- error:
+  err = GpuKernel_call(k, 1, &gs, &ls, 0, NULL);
+ error_call_basic:
   return err;
 }
 
@@ -434,8 +452,8 @@ static int gen_elemwise_contig_kernel(GpuKernel *k,
   int *ktypes = NULL;
   unsigned int p;
   unsigned int j;
-  int flags = GA_USE_CLUDA;
-  int res = GA_MEMORY_ERROR;
+  int flags = 0;
+  int res;
 
   flags |= gpuarray_type_flagsa(n, args);
 
@@ -444,11 +462,14 @@ static int gen_elemwise_contig_kernel(GpuKernel *k,
     p += ISSET(args[j].flags, GE_SCALAR) ? 1 : 2;
 
   ktypes = calloc(p, sizeof(int));
-  if (ktypes == NULL)
+  if (ktypes == NULL) {
+    res = error_sys(ctx->err, "calloc");
     goto bail;
+  }
 
   p = 0;
 
+  strb_appends(&sb, "#include \"cluda.h\"\n");
   if (preamble)
     strb_appends(&sb, preamble);
   strb_appends(&sb, "\nKERNEL void elem(const ga_size n, ");
@@ -487,7 +508,7 @@ static int gen_elemwise_contig_kernel(GpuKernel *k,
                                           GA_FLOAT : args[j].typecode), args[j].name);
       if (ISSET(args[j].flags, GE_READ)) {
         if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) {
-          strb_appendf(&sb, "%s = load_half(&%s_p[i]);\n", args[j].name, args[j].name);
+          strb_appendf(&sb, "%s = ga_half2float(%s_p[i]);\n", args[j].name, args[j].name);
         } else {
           strb_appendf(&sb, "%s = %s_p[i];\n", args[j].name, args[j].name);
         }
@@ -501,7 +522,7 @@ static int gen_elemwise_contig_kernel(GpuKernel *k,
     if (is_array(args[j])) {
       if (ISSET(args[j].flags, GE_WRITE)) {
         if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) {
-          strb_appendf(&sb, "store_half(&%s_p[i], %s);\n", args[j].name, args[j].name);
+          strb_appendf(&sb, "%s_p[i] = ga_float2half(%s);\n", args[j].name, args[j].name);
         } else {
           strb_appendf(&sb, "%s_p[i] = %s;\n", args[j].name, args[j].name);
         }
@@ -510,8 +531,10 @@ static int gen_elemwise_contig_kernel(GpuKernel *k,
   }
   strb_appends(&sb, "}\n}\n");
 
-  if (strb_error(&sb))
+  if (strb_error(&sb)) {
+    res = error_set(ctx->err, GA_MISC_ERROR, "Formatting error creating kernel source");
     goto bail;
+  }
 
   res = GpuKernel_init(k, ctx, 1, (const char **)&sb.s, &sb.l, "elem",
                        p, ktypes, flags, err_str);
@@ -539,10 +562,10 @@ static int check_contig(GpuElemwise *ge, void **args,
       f_contig &= GpuArray_IS_F_CONTIGUOUS(v);
       if (a != v) {
         if (a->nd != v->nd)
-          return GA_INVALID_ERROR;
+          return -1; /* We don't check the value of the error code */
         for (j = 0; j < a->nd; j++) {
           if (v->dimensions[j] != a->dimensions[j])
-            return GA_VALUE_ERROR;
+            return -1; /* We don't check the value of the error code */
         }
       }
     }
@@ -573,9 +596,9 @@ static int call_contig(GpuElemwise *ge, void **args, size_t n) {
       if (err != GA_NO_ERROR) return err;
     }
   }
-  err = GpuKernel_sched(&ge->k_contig, n, &ls, &gs);
+  err = GpuKernel_sched(&ge->k_contig, n, &gs, &ls);
   if (err != GA_NO_ERROR) return err;
-  return GpuKernel_call(&ge->k_contig, 1, &ls, &gs, 0, NULL);
+  return GpuKernel_call(&ge->k_contig, 1, &gs, &ls, 0, NULL);
 }
 
 GpuElemwise *GpuElemwise_new(gpucontext *ctx,
@@ -590,24 +613,33 @@ GpuElemwise *GpuElemwise_new(gpucontext *ctx,
   int ret;
 
   res = calloc(1, sizeof(*res));
-  if (res == NULL) return NULL;
+  if (res == NULL) {
+    error_sys(ctx->err, "calloc");
+    return NULL;
+  }
 
   res->flags = flags;
   res->nd = 8;
   res->n = n;
 
   res->expr = strdup(expr);
-  if (res->expr == NULL)
+  if (res->expr == NULL) {
+    error_sys(ctx->err, "strdup");
     goto fail;
+  }
   if (preamble != NULL) {
     res->preamble = strdup(preamble);
-    if (res->preamble == NULL)
+    if (res->preamble == NULL) {
+      error_sys(ctx->err, "strdup");
       goto fail;
+    }
   }
 
   res->args = copy_args(n, args);
-  if (res->args == NULL)
+  if (res->args == NULL) {
+    error_sys(ctx->err, "copy_args");
     goto fail;
+  }
 
   /* Count the arrays in the arguements */
   res->narray = 0;
@@ -616,18 +648,26 @@ GpuElemwise *GpuElemwise_new(gpucontext *ctx,
 
   while (res->nd < nd) res->nd *= 2;
   res->dims = calloc(res->nd, sizeof(size_t));
-  if (res->dims == NULL)
+  if (res->dims == NULL) {
+    error_sys(ctx->err, "calloc");
     goto fail;
+  }
   res->strides = strides_array(res->narray, res->nd);
-  if (res->strides == NULL)
+  if (res->strides == NULL) {
+    error_sys(ctx->err, "strides_array");
     goto fail;
+  }
   res->k_basic = calloc(res->nd, sizeof(GpuKernel));
-  if (res->k_basic == NULL)
+  if (res->k_basic == NULL) {
+    error_sys(ctx->err, "calloc");
     goto fail;
+  }
 
   res->k_basic_32 = calloc(res->nd, sizeof(GpuKernel));
-  if (res->k_basic_32 == NULL)
+  if (res->k_basic_32 == NULL) {
+    error_sys(ctx->err, "calloc");
     goto fail;
+  }
 
   ret = gen_elemwise_contig_kernel(&res->k_contig, ctx,
 #ifdef DEBUG
@@ -698,18 +738,24 @@ GpuElemwise *GpuElemwise_new(gpucontext *ctx,
 
 void GpuElemwise_free(GpuElemwise *ge) {
   unsigned int i;
-  for (i = 0; i < ge->nd; i++) {
-    if (k_initialized(&ge->k_basic_32[i]))
-      GpuKernel_clear(&ge->k_basic_32[i]);
-    if (k_initialized(&ge->k_basic[i]))
-      GpuKernel_clear(&ge->k_basic[i]);
-  }
+  if (ge->k_basic_32 != NULL)
+    for (i = 0; i < ge->nd; i++) {
+      if (k_initialized(&ge->k_basic_32[i]))
+        GpuKernel_clear(&ge->k_basic_32[i]);
+    }
+  if (ge->k_basic != NULL)
+    for (i = 0; i < ge->nd; i++) {
+      if (k_initialized(&ge->k_basic[i]))
+        GpuKernel_clear(&ge->k_basic[i]);
+    }
   if (ge->strides != NULL)
     for (i = 0; i < ge->narray; i++) {
       free(ge->strides[i]);
     }
   if (k_initialized(&ge->k_contig))
     GpuKernel_clear(&ge->k_contig);
+  free(ge->k_basic_32);
+  free(ge->k_basic);
   free_args(ge->n, ge->args);
   free((void *)ge->preamble);
   free((void *)ge->expr);
@@ -719,12 +765,12 @@ void GpuElemwise_free(GpuElemwise *ge) {
 }
 
 int GpuElemwise_call(GpuElemwise *ge, void **args, int flags) {
-  size_t n;
-  size_t *dims;
-  ssize_t **strides;
-  unsigned int nd;
-  int contig;
-  int call32;
+  size_t n = 0;
+  size_t *dims = NULL;
+  ssize_t **strides = NULL;
+  unsigned int nd = 0;
+  int contig = 0;
+  int call32 = 0;
   int err;
 
   err = check_contig(ge, args, &n, &contig);
diff --git a/src/gpuarray_error.c b/src/gpuarray_error.c
index 5194a2af03..b7d5011f5b 100644
--- a/src/gpuarray_error.c
+++ b/src/gpuarray_error.c
@@ -23,6 +23,8 @@ const char *gpuarray_error_str(int err) {
   case GA_NODEV_ERROR:       return "No devices are available";
   case GA_MISC_ERROR:        return "Undeterminate error";
   case GA_COMM_ERROR:        return "Error in collectives call";
+  case GA_XLARGE_ERROR:      return "Input size too large for operation";
+  case GA_LOAD_ERROR:        return "Error loading library";
   default: return "Unknown GA error";
   }
 }
diff --git a/src/gpuarray_extension.c b/src/gpuarray_extension.c
index 73d63ab656..e120d83b88 100644
--- a/src/gpuarray_extension.c
+++ b/src/gpuarray_extension.c
@@ -7,7 +7,6 @@ typedef struct _ext {
   void *val;
 } ext;
 
-#ifdef WITH_CUDA
 extern void cuda_enter(void);
 extern void cuda_exit(void);
 extern void *cuda_make_ctx(void);
@@ -16,16 +15,15 @@ extern void *cuda_make_buf(void);
 extern void *cuda_get_sz(void);
 extern void *cuda_wait(void);
 extern void *cuda_record(void);
-#endif
-#ifdef WITH_OPENCL
+extern void *cuda_get_ipc_handle(void);
+extern void *cuda_open_ipc_handle(void);
+
 extern void *cl_make_ctx(void);
 extern void *cl_get_stream(void);
 extern void *cl_make_buf(void);
 extern void *cl_get_buf(void);
-#endif
 
 static ext ext_list[] = {
-#ifdef WITH_CUDA
   {"cuda_enter", cuda_enter},
   {"cuda_exit", cuda_exit},
   {"cuda_make_ctx", cuda_make_ctx},
@@ -34,13 +32,13 @@ static ext ext_list[] = {
   {"cuda_get_sz", cuda_get_sz},
   {"cuda_wait", cuda_wait},
   {"cuda_record", cuda_record},
-#endif
-#ifdef WITH_OPENCL
+  {"cuda_get_ipc_handle", cuda_get_ipc_handle},
+  {"cuda_open_ipc_handle", cuda_open_ipc_handle},
+
   {"cl_make_ctx", cl_make_ctx},
   {"cl_get_stream", cl_get_stream},
   {"cl_make_buf", cl_make_buf},
   {"cl_get_buf", cl_get_buf},
-#endif
 };
 
 #define N_EXT (sizeof(ext_list)/sizeof(ext_list[0]))
diff --git a/src/gpuarray_kernel.c b/src/gpuarray_kernel.c
index 8beea94150..818187cca0 100644
--- a/src/gpuarray_kernel.c
+++ b/src/gpuarray_kernel.c
@@ -2,6 +2,9 @@
 #include "gpuarray/error.h"
 #include "gpuarray/types.h"
 
+#include "util/error.h"
+#include "private.h"
+
 #include <stdlib.h>
 
 int GpuKernel_init(GpuKernel *k, gpucontext *ctx, unsigned int count,
@@ -12,7 +15,7 @@ int GpuKernel_init(GpuKernel *k, gpucontext *ctx, unsigned int count,
 
   k->args = calloc(argcount, sizeof(void *));
   if (k->args == NULL)
-    return GA_MEMORY_ERROR;
+    return error_sys(ctx->err, "calloc");
   k->k = gpukernel_init(ctx, count, strs, lens, name, argcount, types,
                         flags, &res, err_str);
   if (res != GA_NO_ERROR)
@@ -32,7 +35,7 @@ gpucontext *GpuKernel_context(GpuKernel *k) {
   return gpukernel_context(k->k);
 }
 
-int GpuKernel_sched(GpuKernel *k, size_t n, size_t *ls, size_t *gs) {
+int GpuKernel_sched(GpuKernel *k, size_t n, size_t *gs, size_t *ls) {
   size_t min_l;
   size_t max_l;
   size_t target_l;
@@ -51,7 +54,7 @@ int GpuKernel_sched(GpuKernel *k, size_t n, size_t *ls, size_t *gs) {
   err = gpukernel_property(k->k, GA_CTX_PROP_NUMPROCS, &numprocs);
   if (err != GA_NO_ERROR)
     return err;
-  err = gpukernel_property(k->k, GA_CTX_PROP_MAXGSIZE, &max_g);
+  err = gpukernel_property(k->k, GA_CTX_PROP_MAXGSIZE0, &max_g);
   if (err != GA_NO_ERROR)
     return err;
 
@@ -90,13 +93,9 @@ int GpuKernel_setarg(GpuKernel *k, unsigned int i, void *a) {
 }
 
 int GpuKernel_call(GpuKernel *k, unsigned int n,
-                   const size_t *bs, const size_t *gs,
+                   const size_t *gs, const size_t *ls,
                    size_t shared, void **args) {
-  return gpukernel_call(k->k, n, bs, gs, shared, args);
-}
-
-int GpuKernel_binary(const GpuKernel *k, size_t *sz, void **bin) {
-  return gpukernel_binary(k->k, sz, bin);
+  return gpukernel_call(k->k, n, gs, ls, shared, args);
 }
 
 const char *GpuKernel_error(const GpuKernel *k, int err) {
diff --git a/src/gpuarray_mkstemp.c b/src/gpuarray_mkstemp.c
index ac5ea10940..5e2e8ca520 100644
--- a/src/gpuarray_mkstemp.c
+++ b/src/gpuarray_mkstemp.c
@@ -8,6 +8,8 @@
 #include <io.h>
 #define open _open
 #define mktemp _mktemp
+#else
+#define O_BINARY 0
 #endif
 
 int mkstemp(char *path) {
@@ -18,7 +20,7 @@ int mkstemp(char *path) {
     do {
         tmp = mktemp(path);
         if (tmp == NULL) return -1;
-        res = open(path, O_CREAT|O_EXCL|O_RDWR, S_IREAD|S_IWRITE);
+        res = open(path, O_CREAT|O_EXCL|O_RDWR|O_BINARY, S_IREAD|S_IWRITE);
         if (res != -1 || errno != EEXIST)
             return res;
     } while (--tries);
diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c
new file mode 100644
index 0000000000..fc4fc56975
--- /dev/null
+++ b/src/gpuarray_reduction.c
@@ -0,0 +1,847 @@
+/* Includes */
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include "gpuarray/config.h"
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "private.h"
+#include "gpuarray/array.h"
+#include "gpuarray/error.h"
+#include "gpuarray/kernel.h"
+#include "gpuarray/util.h"
+
+#include "util/strb.h"
+#include "util/integerfactoring.h"
+
+
+/* Datatypes */
+struct maxandargmax_ctx{
+	/* Function Arguments. */
+	GpuArray*       dstMax;
+	GpuArray*       dstArgmax;
+	const GpuArray* src;
+	int             reduxLen;
+	const int*      reduxList;
+
+	/* General. */
+	int             ret;
+	int*            axisList;
+	gpucontext*     gpuCtx;
+
+	/* Source code Generator. */
+	const char*     dstMaxType;
+	const char*     dstArgmaxType;
+	int             ndd;
+	int             ndr;
+	int             nds;
+	int             ndh;
+	strb            s;
+	char*           sourceCode;
+	GpuKernel       kernel;
+
+	/* Scheduler */
+	int             hwAxisList[3];
+	size_t          blockSize [3];
+	size_t          gridSize  [3];
+	size_t          chunkSize [3];
+
+	/* Invoker */
+	gpudata*        srcStepsGD;
+	gpudata*        srcSizeGD;
+	gpudata*        chunkSizeGD;
+	gpudata*        dstMaxStepsGD;
+	gpudata*        dstArgmaxStepsGD;
+};
+typedef struct maxandargmax_ctx maxandargmax_ctx;
+
+
+
+/* Function prototypes */
+static int   axisInSet                          (int                v,
+                                                 const int*         set,
+                                                 size_t             setLen,
+                                                 size_t*            where);
+static void  appendIdxes                        (strb*              s,
+                                                 const char*        prologue,
+                                                 const char*        prefix,
+                                                 int                startIdx,
+                                                 int                endIdx,
+                                                 const char*        suffix,
+                                                 const char*        epilogue);
+static int   maxandargmaxCheckargs              (maxandargmax_ctx*  ctx);
+static int   maxandargmaxSelectHwAxes           (maxandargmax_ctx*  ctx);
+static int   maxandargmaxGenSource              (maxandargmax_ctx*  ctx);
+static void  maxandargmaxAppendKernel           (maxandargmax_ctx*  ctx);
+static void  maxandargmaxAppendTypedefs         (maxandargmax_ctx*  ctx);
+static void  maxandargmaxAppendPrototype        (maxandargmax_ctx*  ctx);
+static void  maxandargmaxAppendOffsets          (maxandargmax_ctx*  ctx);
+static void  maxandargmaxAppendIndexDeclarations(maxandargmax_ctx*  ctx);
+static void  maxandargmaxAppendRangeCalculations(maxandargmax_ctx*  ctx);
+static void  maxandargmaxAppendLoops            (maxandargmax_ctx*  ctx);
+static void  maxandargmaxAppendLoopMacroDefs    (maxandargmax_ctx*  ctx);
+static void  maxandargmaxAppendLoopOuter        (maxandargmax_ctx*  ctx);
+static void  maxandargmaxAppendLoopInner        (maxandargmax_ctx*  ctx);
+static void  maxandargmaxAppendLoopMacroUndefs  (maxandargmax_ctx*  ctx);
+static void  maxandargmaxComputeAxisList        (maxandargmax_ctx*  ctx);
+static int   maxandargmaxCompile                (maxandargmax_ctx*  ctx);
+static int   maxandargmaxSchedule               (maxandargmax_ctx*  ctx);
+static int   maxandargmaxInvoke                 (maxandargmax_ctx*  ctx);
+static int   maxandargmaxCleanup                (maxandargmax_ctx*  ctx);
+
+
+/* Function implementation */
+GPUARRAY_PUBLIC int GpuArray_maxandargmax       (GpuArray*       dstMax,
+                                                 GpuArray*       dstArgmax,
+                                                 const GpuArray* src,
+                                                 unsigned        reduxLen,
+                                                 const unsigned* reduxList){
+	/* Every field starts out zeroed before the stages below fill it in. */
+	maxandargmax_ctx  ctxSTACK = {0};
+	maxandargmax_ctx  *ctx = &ctxSTACK;
+
+	ctxSTACK.dstMax = dstMax;
+	ctxSTACK.dstArgmax = dstArgmax;
+	ctxSTACK.src = src;
+	ctxSTACK.reduxLen = (int)reduxLen;
+	ctxSTACK.reduxList = (const int*)reduxList;
+
+	/* Run the stages in order; && stops at the first one that fails. Each */
+	/* stage records its status in ctx->ret, which cleanup then returns.   */
+	(void)(maxandargmaxCheckargs   (ctx) == GA_NO_ERROR &&
+	       maxandargmaxSelectHwAxes(ctx) == GA_NO_ERROR &&
+	       maxandargmaxGenSource   (ctx) == GA_NO_ERROR &&
+	       maxandargmaxCompile     (ctx) == GA_NO_ERROR &&
+	       maxandargmaxSchedule    (ctx) == GA_NO_ERROR &&
+	       maxandargmaxInvoke      (ctx) == GA_NO_ERROR);
+	return maxandargmaxCleanup(ctx);
+}
+
+/**
+ * @brief Check whether axis numbered v is already in the given set of axes.
+ *
+ * @param [in]  v       Axis number to search for.
+ * @param [in]  set     Array of axis numbers; only read when setLen > 0.
+ * @param [in]  setLen  Number of leading entries of set to examine.
+ * @param [out] where   If non-NULL, set to the first matching index (on match).
+ * @return Non-zero if the set is non-empty and v is in it; Zero otherwise.
+ */
+
+static int   axisInSet                          (int                v,
+                                                 const int*         set,
+                                                 size_t             setLen,
+                                                 size_t*            where){
+	size_t i;
+
+	for(i=0;i<setLen;i++){
+		if(set[i] == v){
+			if(where){*where = i;}
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * @brief Append a comma-separated list of indices, whose name contains an
+ *        incrementing integer, to a string buffer.
+ *
+ *
+ * @param [in]  s         The string buffer to which to append.
+ * @param [in]  prologue  Text that is prepended in front and NOT repeated.
+ * @param [in]  prefix    Text that is prepended in front of the integer and
+ *                        repeated.
+ * @param [in]  startIdx  First value of the integer (inclusive)
+ * @param [in]  endIdx    Last  value of the integer (exclusive)
+ * @param [in]  suffix    Text that is appended after the integer, followed by
+ *                        a comma if it isn't the last index, and repeated.
+ * @param [in]  epilogue  Text that is appended and NOT repeated.
+ */
+
+static void  appendIdxes                        (strb*              s,
+                                                 const char*        prologue,
+                                                 const char*        prefix,
+                                                 int                startIdx,
+                                                 int                endIdx,
+                                                 const char*        suffix,
+                                                 const char*        epilogue){
+	int i;
+	/* NULL text arguments are treated as empty strings. */
+	prologue = prologue ? prologue : "";
+	prefix   = prefix   ? prefix   : "";
+	suffix   = suffix   ? suffix   : "";
+	epilogue = epilogue ? epilogue : "";
+	/* In the loop, &","[c] is "," when c==0 and "" when c==1 (last index). */
+	strb_appends(s, prologue);
+	for(i=startIdx;i<endIdx;i++){
+		strb_appendf(s, "%s%d%s%s", prefix, i, suffix, &","[i==endIdx-1]);
+	}
+	strb_appends(s, epilogue);
+}
+
+/**
+ * @brief Check the sanity of the arguments, in agreement with the
+ *        documentation for GpuArray_maxandargmax().
+ *
+ *        Also initialize certain parts of the context.
+ *
+ * @return GA_INVALID_ERROR if arguments invalid; GA_NO_ERROR otherwise.
+ */
+
+static int   maxandargmaxCheckargs              (maxandargmax_ctx*  ctx){
+	int i;
+
+	/**
+	 * We initialize certain parts of the context.
+	 */
+
+	ctx->ret           = GA_NO_ERROR;
+	ctx->axisList      = NULL;
+	ctx->gpuCtx        = NULL;
+
+	ctx->dstMaxType    = ctx->dstArgmaxType = NULL;
+	ctx->ndh           = 0;
+	ctx->sourceCode    = NULL;
+
+	ctx->hwAxisList[0] = ctx->hwAxisList[1] = ctx->hwAxisList[2] = 0;
+	ctx->blockSize [0] = ctx->blockSize [1] = ctx->blockSize [2] = 1;
+	ctx->gridSize  [0] = ctx->gridSize  [1] = ctx->gridSize  [2] = 1;
+	ctx->chunkSize [0] = ctx->chunkSize [1] = ctx->chunkSize [2] = 1;
+
+	ctx->srcStepsGD    = ctx->srcSizeGD     = ctx->chunkSizeGD   =
+	ctx->dstMaxStepsGD = ctx->dstArgmaxStepsGD = NULL;
+
+	/* Insane src or reduxLen? reduxLen was converted from an unsigned, so a */
+	/* count too large for int may arrive negative; reject that as well.    */
+	if(!ctx->dstMax || !ctx->dstArgmax || !ctx->src || ctx->src->nd == 0 ||
+	    !ctx->reduxList || ctx->reduxLen <= 0 || ctx->reduxLen > (int)ctx->src->nd){
+		return ctx->ret=GA_INVALID_ERROR;
+	}
+
+	/* Insane or duplicate list entry? */
+	for(i=0;i<ctx->reduxLen;i++){
+		if(ctx->reduxList[i] <  0                            ||
+		   ctx->reduxList[i] >= (int)ctx->src->nd            ||
+		   axisInSet(ctx->reduxList[i], ctx->reduxList, i, 0)){
+			return ctx->ret=GA_INVALID_ERROR;
+		}
+	}
+
+	/* Unknown type? */
+	ctx->dstMaxType    = gpuarray_get_type(ctx->src->typecode)->cluda_name;
+	ctx->dstArgmaxType = gpuarray_get_type(GA_SSIZE)          ->cluda_name;
+	if(!ctx->dstMaxType || !ctx->dstArgmaxType){
+		return ctx->ret=GA_INVALID_ERROR;
+	}
+
+	/* GPU context non-existent? */
+	ctx->gpuCtx        = GpuArray_context(ctx->src);
+	if(!ctx->gpuCtx){
+		return ctx->ret=GA_INVALID_ERROR;
+	}
+
+
+	/**
+	 * We initialize some more parts of the context, using the guarantees
+	 * we now have about the sanity of the arguments.
+	 */
+
+	ctx->nds = ctx->src->nd;
+	ctx->ndr = ctx->reduxLen;
+	ctx->ndd = ctx->nds - ctx->ndr;
+
+	return ctx->ret;
+}
+
+/**
+ * @brief Select which axes (up to 3) will be assigned to hardware
+ *        dimensions.
+ */
+
+static int   maxandargmaxSelectHwAxes           (maxandargmax_ctx*  ctx){
+	int    i, j, maxI = 0;
+	size_t maxV;
+
+	ctx->ndh = ctx->ndd<3 ? ctx->ndd : 3;
+
+	/**
+	 * The ctx->ndh largest non-reduced axes are selected and assigned in
+	 * descending order of size to X, Y, Z (ties go to the higher axis number).
+	 */
+
+	for(i=0;i<ctx->ndh;i++){
+		maxV = 0;
+
+		for(j=0;j<ctx->nds;j++){
+			if(!axisInSet(j, ctx->hwAxisList, i,        0) &&
+			   !axisInSet(j, ctx->reduxList,  ctx->ndr, 0) &&
+			   ctx->src->dimensions[j] >= maxV){
+				maxV = ctx->src->dimensions[j];
+				maxI = j;
+			}
+		}
+
+		ctx->hwAxisList[i] = maxI;
+	}
+
+	return ctx->ret=GA_NO_ERROR;
+}
+
+/**
+ * @brief Generate the kernel code for MaxAndArgmax.
+ *
+ * @return GA_MEMORY_ERROR if not enough memory left; GA_NO_ERROR otherwise.
+ */
+
+static int   maxandargmaxGenSource              (maxandargmax_ctx*  ctx){
+	/* Compute internal axis remapping (sized from the element type itself). */
+	ctx->axisList = malloc(ctx->nds * sizeof(*ctx->axisList));
+	if(!ctx->axisList){
+		return ctx->ret=GA_MEMORY_ERROR;
+	}
+	maxandargmaxComputeAxisList(ctx);
+
+	/* Generate kernel proper; axisList is only needed while generating. */
+	strb_ensure(&ctx->s, 5*1024);
+	maxandargmaxAppendKernel(ctx);
+	free(ctx->axisList);
+	ctx->axisList   = NULL;
+	ctx->sourceCode = strb_cstr(&ctx->s);
+	if(!ctx->sourceCode){
+		return ctx->ret=GA_MEMORY_ERROR;
+	}
+
+	/* Return it. */
+	return ctx->ret=GA_NO_ERROR;
+}
+static void  maxandargmaxAppendKernel           (maxandargmax_ctx*  ctx){
+	strb_appends           (&ctx->s, "#include \"cluda.h\"\n");
+	maxandargmaxAppendTypedefs         (ctx);
+	maxandargmaxAppendPrototype        (ctx);
+	strb_appends           (&ctx->s, "{\n");
+	maxandargmaxAppendOffsets          (ctx);
+	maxandargmaxAppendIndexDeclarations(ctx);
+	maxandargmaxAppendRangeCalculations(ctx);
+	maxandargmaxAppendLoops            (ctx);
+	strb_appends           (&ctx->s, "}\n");
+}
+static void  maxandargmaxAppendTypedefs         (maxandargmax_ctx*  ctx){
+	strb_appends(&ctx->s, "/* Typedefs */\n");
+	strb_appendf(&ctx->s, "typedef %s     T;/* The type of the array being processed. */\n", ctx->dstMaxType);
+	strb_appendf(&ctx->s, "typedef %s     X;/* Index type: signed 32/64-bit. */\n",          ctx->dstArgmaxType);
+	strb_appends(&ctx->s, "\n");
+	strb_appends(&ctx->s, "\n");
+	strb_appends(&ctx->s, "\n");
+}
+static void  maxandargmaxAppendPrototype        (maxandargmax_ctx*  ctx){
+	strb_appends(&ctx->s, "KERNEL void maxandargmax(const GLOBAL_MEM T*        src,\n");
+	strb_appends(&ctx->s, "                         const X         srcOff,\n");
+	strb_appends(&ctx->s, "                         const GLOBAL_MEM X*        srcSteps,\n");
+	strb_appends(&ctx->s, "                         const GLOBAL_MEM X*        srcSize,\n");
+	strb_appends(&ctx->s, "                         const GLOBAL_MEM X*        chunkSize,\n");
+	strb_appends(&ctx->s, "                         GLOBAL_MEM T*              dstMax,\n");
+	strb_appends(&ctx->s, "                         const X         dstMaxOff,\n");
+	strb_appends(&ctx->s, "                         const GLOBAL_MEM X*        dstMaxSteps,\n");
+	strb_appends(&ctx->s, "                         GLOBAL_MEM X*              dstArgmax,\n");
+	strb_appends(&ctx->s, "                         const X         dstArgmaxOff,\n");
+	strb_appends(&ctx->s, "                         const GLOBAL_MEM X*        dstArgmaxSteps)");
+}
+static void  maxandargmaxAppendOffsets          (maxandargmax_ctx*  ctx){
+	strb_appends(&ctx->s, "\t/* Add offsets */\n");
+	strb_appends(&ctx->s, "\tsrc       = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src       + srcOff);\n");
+	strb_appends(&ctx->s, "\tdstMax    = (GLOBAL_MEM T*)      ((GLOBAL_MEM char*)      dstMax    + dstMaxOff);\n");
+	strb_appends(&ctx->s, "\tdstArgmax = (GLOBAL_MEM X*)      ((GLOBAL_MEM char*)      dstArgmax + dstArgmaxOff);\n");
+	strb_appends(&ctx->s, "\t\n");
+	strb_appends(&ctx->s, "\t\n");
+}
+static void  maxandargmaxAppendIndexDeclarations(maxandargmax_ctx*  ctx){
+	int i;
+	strb_appends(&ctx->s, "\t/* GPU kernel coordinates. Always 3D. */\n");
+
+	strb_appends(&ctx->s, "\tX bi0 = GID_0,        bi1 = GID_1,        bi2 = GID_2;\n");
+	strb_appends(&ctx->s, "\tX bd0 = LDIM_0,       bd1 = LDIM_1,       bd2 = LDIM_2;\n");
+	strb_appends(&ctx->s, "\tX ti0 = LID_0,        ti1 = LID_1,        ti2 = LID_2;\n");
+	strb_appends(&ctx->s, "\tX gi0 = bi0*bd0+ti0,  gi1 = bi1*bd1+ti1,  gi2 = bi2*bd2+ti2;\n");
+	if(ctx->ndh>0){
+		strb_appends(&ctx->s, "\tX ");
+		for(i=0;i<ctx->ndh;i++){
+			strb_appendf(&ctx->s, "ci%d = chunkSize[%d]%s",
+			             i, i, (i==ctx->ndh-1) ? ";\n" : ", ");
+		}
+	}
+
+	strb_appends(&ctx->s, "\t\n");
+	strb_appends(&ctx->s, "\t\n");
+	strb_appends(&ctx->s, "\t/* Free indices & Reduction indices */\n");
+	/* Declared without initializers; later generated statements assign them. */
+	if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,               ctx->nds, "",        ";\n");}
+	if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,               ctx->nds, "Dim",     ";\n");}
+	if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,               ctx->nds, "Start",   ";\n");}
+	if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,               ctx->nds, "End",     ";\n");}
+	if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,               ctx->nds, "SStep",   ";\n");}
+	if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,               ctx->ndd, "MStep",   ";\n");}
+	if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,               ctx->ndd, "AStep",   ";\n");}
+	if(ctx->nds > ctx->ndd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim",    ";\n");}
+
+	strb_appends(&ctx->s, "\t\n");
+	strb_appends(&ctx->s, "\t\n");
+}
+static void  maxandargmaxAppendRangeCalculations(maxandargmax_ctx*  ctx){
+	size_t hwDim;
+	int    i;
+
+	/* Use internal remapping when computing the ranges for this thread. */
+	strb_appends(&ctx->s, "\t/* Compute ranges for this thread. */\n");
+
+	for(i=0;i<ctx->nds;i++){
+		strb_appendf(&ctx->s, "\ti%dDim     = srcSize[%d];\n", i, ctx->axisList[i]);
+	}
+	for(i=0;i<ctx->nds;i++){
+		strb_appendf(&ctx->s, "\ti%dSStep   = srcSteps[%d];\n", i, ctx->axisList[i]);
+	}
+	for(i=0;i<ctx->ndd;i++){
+		strb_appendf(&ctx->s, "\ti%dMStep   = dstMaxSteps[%d];\n", i, i);
+	}
+	for(i=0;i<ctx->ndd;i++){
+		strb_appendf(&ctx->s, "\ti%dAStep   = dstArgmaxSteps[%d];\n", i, i);
+	}
+	for(i=ctx->nds-1;i>=ctx->ndd;i--){
+		/**
+		 * If this is the last index, it's the first cumulative dimension
+		 * product we generate, and thus we initialize to 1.
+		 */
+
+		if(i == ctx->nds-1){
+			strb_appendf(&ctx->s, "\ti%dPDim    = 1;\n", i);
+		}else{
+			strb_appendf(&ctx->s, "\ti%dPDim    = i%dPDim * i%dDim;\n", i, i+1, i+1);
+		}
+	}
+	for(i=0;i<ctx->nds;i++){
+		/**
+		 * Up to 3 dimensions get to rely on hardware loops.
+		 * The others, if any, have to use software looping beginning at 0.
+		 * hwDim is a size_t, so it is cast to int to match the %d conversions.
+		 */
+		if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){
+			strb_appendf(&ctx->s, "\ti%dStart   = gi%d * ci%d;\n", i, (int)hwDim, (int)hwDim);
+		}else{
+			strb_appendf(&ctx->s, "\ti%dStart   = 0;\n", i);
+		}
+	}
+	for(i=0;i<ctx->nds;i++){
+		/**
+		 * Hardware-mapped axes end one chunk (ci) after their start.
+		 * The others, if any, run over the whole dimension.
+		 */
+
+		if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){
+			strb_appendf(&ctx->s, "\ti%dEnd     = i%dStart + ci%d;\n", i, i, (int)hwDim);
+		}else{
+			strb_appendf(&ctx->s, "\ti%dEnd     = i%dStart + i%dDim;\n", i, i, i);
+		}
+	}
+
+	strb_appends(&ctx->s, "\t\n");
+	strb_appends(&ctx->s, "\t\n");
+}
+static void  maxandargmaxAppendLoops            (maxandargmax_ctx*  ctx){
+	strb_appends(&ctx->s, "\t/**\n");
+	strb_appends(&ctx->s, "\t * FREE LOOPS.\n");
+	strb_appends(&ctx->s, "\t */\n");
+	strb_appends(&ctx->s, "\t\n");
+
+	maxandargmaxAppendLoopMacroDefs  (ctx);
+	maxandargmaxAppendLoopOuter      (ctx);
+	maxandargmaxAppendLoopMacroUndefs(ctx);
+}
+static void  maxandargmaxAppendLoopMacroDefs    (maxandargmax_ctx*  ctx){
+	int i;
+
+	/**
+	 * FOROVER Macro
+	 */
+
+	strb_appends(&ctx->s, "#define FOROVER(idx)    for(i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n");
+
+	/**
+	 * ESCAPE Macro
+	 */
+
+	strb_appends(&ctx->s, "#define ESCAPE(idx)     if(i##idx >= i##idx##Dim){continue;}\n");
+
+	/**
+	 * SRCINDEXER Macro
+	 */
+
+	appendIdxes (&ctx->s, "#define SRCINDEXER(", "i", 0, ctx->nds, "", ")   (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)src + ");
+	for(i=0;i<ctx->nds;i++){
+		strb_appendf(&ctx->s, "i%d*i%dSStep + \\\n                                            ", i, i);
+	}
+	strb_appends(&ctx->s, "0))\n");
+
+	/**
+	 * RDXINDEXER Macro
+	 */
+
+	appendIdxes (&ctx->s, "#define RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ")              (");
+	for(i=ctx->ndd;i<ctx->nds;i++){
+		strb_appendf(&ctx->s, "i%d*i%dPDim + \\\n                                        ", i, i);
+	}
+	strb_appends(&ctx->s, "0)\n");
+
+	/**
+	 * DSTMINDEXER Macro
+	 */
+
+	appendIdxes (&ctx->s, "#define DSTMINDEXER(", "i", 0, ctx->ndd, "", ")        (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)dstMax + ");
+	for(i=0;i<ctx->ndd;i++){
+		strb_appendf(&ctx->s, "i%d*i%dMStep + \\\n                                                  ", i, i);
+	}
+	strb_appends(&ctx->s, "0))\n");
+
+	/**
+	 * DSTAINDEXER Macro
+	 */
+
+	appendIdxes (&ctx->s, "#define DSTAINDEXER(", "i", 0, ctx->ndd, "", ")        (*(GLOBAL_MEM X*)((GLOBAL_MEM char*)dstArgmax + ");
+	for(i=0;i<ctx->ndd;i++){
+		strb_appendf(&ctx->s, "i%d*i%dAStep + \\\n                                                     ", i, i);
+	}
+	strb_appends(&ctx->s, "0))\n");
+}
+static void  maxandargmaxAppendLoopOuter        (maxandargmax_ctx*  ctx){
+	int i;
+
+	/**
+	 * Outer Loop Header Generation
+	 */
+
+	for(i=0;i<ctx->ndd;i++){
+		strb_appendf(&ctx->s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i);
+	}
+
+	/**
+	 * Inner Loop Generation
+	 */
+
+	maxandargmaxAppendLoopInner(ctx);
+
+	/**
+	 * Outer Loop Trailer Generation
+	 */
+
+	for(i=0;i<ctx->ndd;i++){
+		strb_appends(&ctx->s, "\t}\n");
+	}
+}
+static void  maxandargmaxAppendLoopInner        (maxandargmax_ctx*  ctx){
+	int i;
+
+	/**
+	 * Inner Loop Prologue
+	 */
+
+	strb_appends(&ctx->s, "\t/**\n");
+	strb_appends(&ctx->s, "\t * Reduction initialization.\n");
+	strb_appends(&ctx->s, "\t */\n");
+	strb_appends(&ctx->s, "\t\n");
+
+	appendIdxes (&ctx->s, "\tT maxV = SRCINDEXER(", "i", 0, ctx->ndd, "", "");
+	if(ctx->ndd && ctx->ndr){strb_appends(&ctx->s, ",");}
+	appendIdxes (&ctx->s, "", "i", ctx->ndd, ctx->nds, "Start", ");\n");
+
+	appendIdxes (&ctx->s, "\tX maxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "Start", ");\n");
+
+	strb_appends(&ctx->s, "\t\n");
+	strb_appends(&ctx->s, "\t/**\n");
+	strb_appends(&ctx->s, "\t * REDUCTION LOOPS.\n");
+	strb_appends(&ctx->s, "\t */\n");
+	strb_appends(&ctx->s, "\t\n");
+
+	/**
+	 * Inner Loop Header Generation
+	 */
+
+	for(i=ctx->ndd;i<ctx->nds;i++){
+		strb_appendf(&ctx->s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i);
+	}
+
+	/**
+	 * Inner Loop Body Generation
+	 */
+
+	appendIdxes (&ctx->s, "\tT V = SRCINDEXER(", "i", 0, ctx->nds, "", ");\n");
+	strb_appends(&ctx->s, "\t\n");
+	strb_appends(&ctx->s, "\tif(V > maxV){\n");
+	strb_appends(&ctx->s, "\t\tmaxV = V;\n");
+	appendIdxes (&ctx->s, "\t\tmaxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n");
+	strb_appends(&ctx->s, "\t}\n");
+
+	/**
+	 * Inner Loop Trailer Generation
+	 */
+
+	for(i=ctx->ndd;i<ctx->nds;i++){
+		strb_appends(&ctx->s, "\t}\n");
+	}
+	strb_appends(&ctx->s, "\t\n");
+
+	/**
+	 * Inner Loop Epilogue Generation
+	 */
+
+	strb_appends(&ctx->s, "\t/**\n");
+	strb_appends(&ctx->s, "\t * Destination writeback.\n");
+	strb_appends(&ctx->s, "\t */\n");
+	strb_appends(&ctx->s, "\t\n");
+	appendIdxes (&ctx->s, "\tDSTMINDEXER(", "i", 0, ctx->ndd, "", ") = maxV;\n");
+	appendIdxes (&ctx->s, "\tDSTAINDEXER(", "i", 0, ctx->ndd, "", ") = maxI;\n");
+}
+static void  maxandargmaxAppendLoopMacroUndefs  (maxandargmax_ctx*  ctx){
+	strb_appends(&ctx->s, "#undef FOROVER\n");
+	strb_appends(&ctx->s, "#undef ESCAPE\n");
+	strb_appends(&ctx->s, "#undef SRCINDEXER\n");
+	strb_appends(&ctx->s, "#undef RDXINDEXER\n");
+	strb_appends(&ctx->s, "#undef DSTMINDEXER\n");
+	strb_appends(&ctx->s, "#undef DSTAINDEXER\n");
+}
+static void  maxandargmaxComputeAxisList        (maxandargmax_ctx*  ctx){
+	int axis, nFree=0;
+	/* Non-reduced axes come first, in ascending order. */
+	for(axis=0;axis<ctx->nds;axis++){
+		if(!axisInSet(axis, ctx->reduxList, ctx->ndr, 0)){
+			ctx->axisList[nFree++] = axis;
+		}
+	}
+	/* The reduced axes follow, in the order given by reduxList. */
+	memcpy(&ctx->axisList[nFree], ctx->reduxList, ctx->ndr * sizeof(*ctx->reduxList));
+}
+
+/**
+ * @brief Compile the kernel from source code.
+ *
+ * @return The status of GpuKernel_init(), which is also stored in ctx->ret.
+ */
+
+static int   maxandargmaxCompile                (maxandargmax_ctx*  ctx){
+	const int    ARG_TYPECODES[]   = {
+		GA_BUFFER, /* src */
+		GA_SIZE,   /* srcOff */
+		GA_BUFFER, /* srcSteps */
+		GA_BUFFER, /* srcSize */
+		GA_BUFFER, /* chnkSize */
+		GA_BUFFER, /* dstMax */
+		GA_SIZE,   /* dstMaxOff */
+		GA_BUFFER, /* dstMaxSteps */
+		GA_BUFFER, /* dstArgmax */
+		GA_SIZE,   /* dstArgmaxOff */
+		GA_BUFFER  /* dstArgmaxSteps */
+	};
+	const unsigned int ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES);
+	const char*  SRCS[1];
+
+	SRCS[0] = ctx->sourceCode;
+
+	ctx->ret = GpuKernel_init(&ctx->kernel,
+	                          ctx->gpuCtx,
+	                          1,
+	                          SRCS,
+	                          NULL,
+	                          "maxandargmax",
+	                          ARG_TYPECODES_LEN,
+	                          ARG_TYPECODES,
+	                          0,
+	                          (char**)0);
+	free(ctx->sourceCode);
+	ctx->sourceCode = NULL;
+
+	return ctx->ret;
+}
+
+/**
+ * Compute a good thread block size / grid size / software chunk size for Nvidia.
+ */
+
+static int   maxandargmaxSchedule               (maxandargmax_ctx*  ctx){
+	int            i;
+	size_t         warpMod;
+	size_t         bestWarpMod  = 1;
+	unsigned       bestWarpAxis = 0;
+	uint64_t       maxLg;
+	uint64_t       maxLs[3];
+	uint64_t       maxGg;
+	uint64_t       maxGs[3];
+	uint64_t       dims [3];
+	double         slack[3];
+	ga_factor_list factBS[3];
+	ga_factor_list factGS[3];
+	ga_factor_list factCS[3];
+	size_t warpSize,
+	       maxL, maxL0, maxL1, maxL2,  /* Maximum total and per-dimension thread/block sizes */
+	       maxG, maxG0, maxG1, maxG2;  /* Maximum total and per-dimension block /grid  sizes */
+
+	/**
+	 * Obtain the constraints of our problem; abort if any query fails.
+	 */
+	if((ctx->ret = gpukernel_property(ctx->kernel.k,  GA_KERNEL_PROP_PREFLSIZE, &warpSize)) != GA_NO_ERROR ||
+	   (ctx->ret = gpukernel_property(ctx->kernel.k,  GA_KERNEL_PROP_MAXLSIZE,  &maxL))     != GA_NO_ERROR ||
+	   (ctx->ret = gpudata_property  (ctx->src->data, GA_CTX_PROP_MAXLSIZE0,    &maxL0))    != GA_NO_ERROR ||
+	   (ctx->ret = gpudata_property  (ctx->src->data, GA_CTX_PROP_MAXLSIZE1,    &maxL1))    != GA_NO_ERROR ||
+	   (ctx->ret = gpudata_property  (ctx->src->data, GA_CTX_PROP_MAXLSIZE2,    &maxL2))    != GA_NO_ERROR ||
+	   (ctx->ret = gpudata_property  (ctx->src->data, GA_CTX_PROP_MAXGSIZE0,    &maxG0))    != GA_NO_ERROR ||
+	   (ctx->ret = gpudata_property  (ctx->src->data, GA_CTX_PROP_MAXGSIZE1,    &maxG1))    != GA_NO_ERROR ||
+	   (ctx->ret = gpudata_property  (ctx->src->data, GA_CTX_PROP_MAXGSIZE2,    &maxG2))    != GA_NO_ERROR){
+		return ctx->ret;
+	}
+	maxG = maxG0;
+
+	/**
+	 * Prepare inputs to the solver.
+	 *
+	 * This involves, amongst others,
+	 * - Initializing the blockSize, gridSize and chunkSize factor lists for all
+	 *   hardware dimensions.
+	 * - Finding on which hardware axis is it optimal to place the warpSize factor.
+	 */
+
+	maxLg    = maxL;
+	maxLs[0] = maxL0, maxLs[1]=maxL1, maxLs[2]=maxL2;
+	maxGg    = maxG;
+	maxGs[0] = maxG0, maxGs[1]=maxG1, maxGs[2]=maxG2;
+	dims[0]  = dims[1]  = dims[2]  = 1;
+	slack[0] = slack[1] = slack[2] = 1.1;
+
+	for(i=0;i<ctx->ndh;i++){
+		dims[i] = ctx->src->dimensions[ctx->hwAxisList[i]];
+		gaIFLInit(&factBS[i]);
+		gaIFLInit(&factGS[i]);
+		gaIFLInit(&factCS[i]);
+
+		warpMod = dims[i]%warpSize;
+		if(bestWarpMod>0 && (warpMod==0 || warpMod>=bestWarpMod)){
+			bestWarpAxis = i;
+			bestWarpMod  = warpMod;
+		}
+	}
+
+	if(ctx->ndh > 0){
+		dims[bestWarpAxis] = (dims[bestWarpAxis] + warpSize - 1)/warpSize;
+		gaIFactorize(warpSize, 0, 0, &factBS[bestWarpAxis]);
+	}
+
+	/**
+	 * Factorization job. We'll steadily increase the slack in case of failure
+	 * in order to ensure we do get a factorization, which we place into
+	 * chunkSize.
+	 */
+
+	for(i=0;i<ctx->ndh;i++){
+		while(!gaIFactorize(dims[i], (uint64_t)(dims[i]*slack[i]), maxLs[i], &factCS[i])){
+			/**
+			 * Error! Failed to factorize dimension i with given slack and
+			 * k-smoothness constraints! Increase slack. Once slack reaches
+			 * 2.0 it will factorize guaranteed.
+			 */
+
+			slack[i] += 0.1;
+		}
+	}
+
+	/**
+	 * Invoke the scheduler.
+	 *
+	 * The scheduler will move some factors from chunkSize into blockSize and
+	 * gridSize, improving performance.
+	 */
+
+	gaIFLSchedule(ctx->ndh, maxLg, maxLs, maxGg, maxGs, factBS, factGS, factCS);
+
+	/* Output. */
+	for(i=0;i<ctx->ndh;i++){
+		ctx->blockSize[i] = gaIFLGetProduct(&factBS[i]);
+		ctx->gridSize [i] = gaIFLGetProduct(&factGS[i]);
+		ctx->chunkSize[i] = gaIFLGetProduct(&factCS[i]);
+	}
+
+	/* Return. */
+	return ctx->ret=GA_NO_ERROR;
+}
+
+/**
+ * Invoke the kernel.
+ */
+
+static int   maxandargmaxInvoke                 (maxandargmax_ctx*  ctx){
+	void* args[11];
+
+	/**
+	 * Argument Marshalling. This is the grossest gross thing in here.
+	 */
+
+	const int flags       = GA_BUFFER_READ_ONLY|GA_BUFFER_INIT;
+	ctx->srcStepsGD       = gpudata_alloc(ctx->gpuCtx, ctx->nds    * sizeof(size_t),
+	                                      ctx->src->strides,       flags, 0);
+	ctx->srcSizeGD        = gpudata_alloc(ctx->gpuCtx, ctx->nds    * sizeof(size_t),
+	                                      ctx->src->dimensions,    flags, 0);
+	ctx->chunkSizeGD      = gpudata_alloc(ctx->gpuCtx, ctx->ndh * sizeof(size_t),
+	                                      ctx->chunkSize,          flags, 0);
+	ctx->dstMaxStepsGD    = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t),
+	                                      ctx->dstMax->strides,    flags, 0);
+	ctx->dstArgmaxStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t),
+	                                      ctx->dstArgmax->strides, flags, 0);
+	args[ 0] = (void*) ctx->src->data;
+	args[ 1] = (void*)&ctx->src->offset;
+	args[ 2] = (void*) ctx->srcStepsGD;
+	args[ 3] = (void*) ctx->srcSizeGD;
+	args[ 4] = (void*) ctx->chunkSizeGD;
+	args[ 5] = (void*) ctx->dstMax->data;
+	args[ 6] = (void*)&ctx->dstMax->offset;
+	args[ 7] = (void*) ctx->dstMaxStepsGD;
+	args[ 8] = (void*) ctx->dstArgmax->data;
+	args[ 9] = (void*)&ctx->dstArgmax->offset;
+	args[10] = (void*) ctx->dstArgmaxStepsGD;
+
+	if(ctx->srcStepsGD      &&
+	   ctx->srcSizeGD       &&
+	   ctx->chunkSizeGD     &&
+	   ctx->dstMaxStepsGD   &&
+	   ctx->dstArgmaxStepsGD){
+		ctx->ret = GpuKernel_call(&ctx->kernel,
+		                          ctx->ndh>0 ? ctx->ndh : 1,
+		                          ctx->gridSize,
+		                          ctx->blockSize,
+		                          0,
+		                          args);
+	}else{
+		ctx->ret = GA_MEMORY_ERROR;
+	}
+
+	gpudata_release(ctx->srcStepsGD);
+	gpudata_release(ctx->srcSizeGD);
+	gpudata_release(ctx->chunkSizeGD);
+	gpudata_release(ctx->dstMaxStepsGD);
+	gpudata_release(ctx->dstArgmaxStepsGD);
+
+	return ctx->ret;
+}
+
+/**
+ * Cleanup: free host-side scratch memory and release the compiled kernel.
+ * Returns ctx->ret unchanged, so the status of the last stage run survives.
+ */
+static int   maxandargmaxCleanup                (maxandargmax_ctx*  ctx){
+	free(ctx->axisList);
+	free(ctx->sourceCode);
+	ctx->axisList       = NULL;
+	ctx->sourceCode     = NULL;
+	if(ctx->kernel.k){GpuKernel_clear(&ctx->kernel);}
+	return ctx->ret;
+}
diff --git a/src/gpuarray_types.c b/src/gpuarray_types.c
index 719d5b1910..01477a9336 100644
--- a/src/gpuarray_types.c
+++ b/src/gpuarray_types.c
@@ -40,7 +40,7 @@ typedef struct _quad {
       int16_t exp;
       uint16_t hi;
       uint32_t lo;
-    };
+    } s;
     uint128_t raw;
   } u;
 } ga_quad;
diff --git a/src/gpuarray_util.c b/src/gpuarray_util.c
index 177c632663..5b2ccc2797 100644
--- a/src/gpuarray_util.c
+++ b/src/gpuarray_util.c
@@ -13,9 +13,6 @@
  * phase. Once we go stable, this will move to 0 and go up from
  * there.
  */
-const int gpuarray_api_major = -9997;
-const int gpuarray_api_minor = 1;
-
 static gpuarray_type **custom_types = NULL;
 static int n_types = 0;
 static gpuarray_type no_type = {NULL, 0, 0, -1};
@@ -173,7 +170,7 @@ void gpuarray_elemwise_collapse(unsigned int n, unsigned int *_nd,
     int collapse = 1;
     for (k = 0; k < n; k++) {
       collapse &= (strs[k] == NULL ||
-                   strs[k][i - 1] == dims[i] * strs[k][i]);
+                   strs[k][i - 1] == (ssize_t)dims[i] * strs[k][i]);
     }
     if (collapse) {
       dims[i-1] *= dims[i];
diff --git a/src/head.py b/src/head.py
new file mode 100644
index 0000000000..ef27d89549
--- /dev/null
+++ b/src/head.py
@@ -0,0 +1,38 @@
+# Used to generate the string tables to embed the cluda headers.
+# Usage: python head.py <file>
+# This will output <file>.c
+
+def wrt(f, n, b):
+    f.write(b',')
+    n += 1
+    if n > 10:
+        f.write(b'\n')
+        n = 0
+    else:
+        f.write(b' ')
+    f.write(b"0x%02x" % (b,))
+    return n
+
+
+def convert(src, dst):
+    src_name = src.replace('.', '_')
+    with open(src, 'rb') as f:
+        src_data = f.read()
+    with open(dst, 'wb') as f:
+        f.write(b'static const char %s[] = {\n' % (src_name.encode('utf-8'),))
+        first = True
+        n = 0
+        for b in bytearray(src_data):
+            if b == 0:
+                raise ValueError('NUL in file')
+            if first:
+                f.write(b"0x%02x" % (b,))
+                first = False
+            else:
+                n = wrt(f, n, b)
+        wrt(f, n, 0)
+        f.write(b'};\n')
+
+if __name__ == '__main__':
+    import sys
+    convert(sys.argv[1], sys.argv[1] + '.c')  # output path is the input path + '.c'
diff --git a/src/loaders/CMakeLists.txt b/src/loaders/CMakeLists.txt
new file mode 100644
index 0000000000..861349dfda
--- /dev/null
+++ b/src/loaders/CMakeLists.txt
@@ -0,0 +1,10 @@
+set_rel(LOADERS_SRC
+dyn_load.c
+libcuda.c
+libnvrtc.c
+libcublas.c
+libnccl.c
+libopencl.c
+libclblas.c
+libclblast.c
+)
diff --git a/src/loaders/dyn_load.c b/src/loaders/dyn_load.c
new file mode 100644
index 0000000000..08442ed1bf
--- /dev/null
+++ b/src/loaders/dyn_load.c
@@ -0,0 +1,57 @@
+#include "dyn_load.h"
+#include "util/error.h"
+
+#if defined(__unix__) || defined(__APPLE__)
+
+#include <dlfcn.h>
+#include <err.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+void *ga_load_library(const char *name, error *e) {
+  void *handle = dlopen(name, RTLD_LAZY|RTLD_LOCAL); /* lazy binding, local scope */
+  if (handle != NULL) return handle;
+  error_fmt(e, GA_LOAD_ERROR, "Could not load \"%s\": %s", name, dlerror());
+  return NULL; /* failure: dlerror()'s explanation was handed to error_fmt */
+}
+
+void *ga_func_ptr(void *h, const char *name, error *e) {
+  void *sym = dlsym(h, name); /* h: a handle as returned by ga_load_library */
+  if (sym != NULL) return sym;
+  error_fmt(e, GA_LOAD_ERROR, "Could not find symbol \"%s\": %s", name, dlerror());
+  return NULL; /* lookup failed: dlerror()'s text was handed to error_fmt */
+}
+
+#else
+
+/* Should be windows */
+#include <windows.h>
+
+static inline void error_win(const char* name, error *e) {
+  char msgbuf[512];
+  DWORD err = GetLastError();
+  DWORD len = FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM|
+                             FORMAT_MESSAGE_IGNORE_INSERTS,
+                             NULL, err, 0, msgbuf, 512, NULL);
+  if (len == 0)
+    error_fmt(e, GA_LOAD_ERROR, "Could not load \"%s\": error code %X", name, err);
+  else
+    error_fmt(e, GA_LOAD_ERROR, "Could not load \"%s\": %s", name, msgbuf);
+}
+
+void *ga_load_library(const char *name, error *e) {
+  void *res = LoadLibrary(name);
+  if (res == NULL)
+    error_win(name, e);
+  return res;
+}
+
+void *ga_func_ptr(void *h, const char *name, error *e) {
+  void *res = (void *)GetProcAddress(h, name);
+  if (res == NULL)
+    error_win(name, e);
+  return res;
+}
+
+#endif
diff --git a/src/loaders/dyn_load.h b/src/loaders/dyn_load.h
new file mode 100644
index 0000000000..bc62ebf2a6
--- /dev/null
+++ b/src/loaders/dyn_load.h
@@ -0,0 +1,9 @@
+#ifndef UTIL_DYN_LOAD_H
+#define UTIL_DYN_LOAD_H
+
+#include "util/error.h"
+
+void *ga_load_library(const char *name, error *e);
+void *ga_func_ptr(void *h, const char *name, error *e);
+
+#endif
diff --git a/src/loaders/libclblas.c b/src/loaders/libclblas.c
new file mode 100644
index 0000000000..4acaeb8120
--- /dev/null
+++ b/src/loaders/libclblas.c
@@ -0,0 +1,45 @@
+#include <stdlib.h>
+
+#include "libclblas.h"
+#include "dyn_load.h"
+#include "gpuarray/error.h"
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+static const char libname[] = "clBLAS.dll";
+#else /* Unix */
+#ifdef __APPLE__
+static const char libname[] = "libclBLAS.dylib";
+#else
+static const char libname[] = "libclBLAS.so";
+#endif
+#endif
+
+#define DEF_PROC(ret, name, args) t##name *name
+
+#include "libclblas.fn"
+
+#undef DEF_PROC
+
+#define DEF_PROC(ret, name, args)                 \
+  name = (t##name *)ga_func_ptr(lib, #name, e);   \
+  if (name == NULL) {                             \
+    return e->code;                               \
+  }
+
+static int loaded = 0;
+
+int load_libclblas(error *e) {
+  void *lib; /* must keep this name: the DEF_PROC expansions below use `lib` */
+
+  /* A previous call already resolved every symbol: nothing left to do. */
+  if (loaded) return GA_NO_ERROR;
+
+  if ((lib = ga_load_library(libname, e)) == NULL)
+    return e->code;
+
+  /* One lookup per listed function; a failed lookup returns e->code at once. */
+  #include "libclblas.fn"
+  loaded = 1; /* reached only when every lookup above succeeded */
+
+  return GA_NO_ERROR;
+}
diff --git a/src/loaders/libclblas.fn b/src/loaders/libclblas.fn
new file mode 100644
index 0000000000..f56a2a1393
--- /dev/null
+++ b/src/loaders/libclblas.fn
@@ -0,0 +1,12 @@
+DEF_PROC(clblasStatus, clblasSetup, (void));
+DEF_PROC(void, clblasTeardown, (void));
+
+
+DEF_PROC(clblasStatus, clblasSdot, (size_t N, cl_mem  dotProduct, size_t  offDP, const cl_mem  X, size_t  offx, int  incx, const cl_mem Y, size_t  offy, int  incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint  numEventsInWaitList, const cl_event *eventWaitList, cl_event *events));
+DEF_PROC(clblasStatus, clblasDdot, (size_t N, cl_mem  dotProduct, size_t  offDP, const cl_mem  X, size_t  offx, int  incx, const cl_mem Y, size_t  offy, int  incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint  numEventsInWaitList, const cl_event *eventWaitList, cl_event *events));
+DEF_PROC(clblasStatus, clblasSgemv, (clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events));
+DEF_PROC(clblasStatus, clblasDgemv, (clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events));
+DEF_PROC(clblasStatus, clblasSgemm, (clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events));
+DEF_PROC(clblasStatus, clblasDgemm, (clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events));
+DEF_PROC(clblasStatus, clblasSger, (clblasOrder order, size_t M, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events));
+DEF_PROC(clblasStatus, clblasDger, (clblasOrder order, size_t M, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events));
diff --git a/src/loaders/libclblas.h b/src/loaders/libclblas.h
new file mode 100644
index 0000000000..b30409bd81
--- /dev/null
+++ b/src/loaders/libclblas.h
@@ -0,0 +1,61 @@
+#ifndef LOADER_LIBCLBLAS_H
+#define LOADER_LIBCLBLAS_H
+
+#include "util/error.h"
+#include "libopencl.h"
+
+/** @cond NEVER */
+typedef enum clblasOrder_ {
+  clblasRowMajor,    /* = 0 (implicit) */
+  clblasColumnMajor  /* = 1 (implicit) */
+} clblasOrder;
+
+typedef enum clblasTranspose_ {
+  clblasNoTrans,   /* = 0 (implicit) */
+  clblasTrans,     /* = 1 (implicit) */
+  clblasConjTrans  /* = 2 (implicit) */
+} clblasTranspose;
+
+typedef enum clblasStatus_ {
+  clblasSuccess = CL_SUCCESS,
+  /* Library-specific codes; values spelled out so a logged number can be looked up. */
+  clblasNotImplemented      = -1024,
+  clblasNotInitialized      = -1023,
+  clblasInvalidMatA         = -1022,
+  clblasInvalidMatB         = -1021,
+  clblasInvalidMatC         = -1020,
+  clblasInvalidVecX         = -1019,
+  clblasInvalidVecY         = -1018,
+  clblasInvalidDim          = -1017,
+  clblasInvalidLeadDimA     = -1016,
+  clblasInvalidLeadDimB     = -1015,
+  clblasInvalidLeadDimC     = -1014,
+  clblasInvalidIncX         = -1013,
+  clblasInvalidIncY         = -1012,
+  clblasInsufficientMemMatA = -1011,
+  clblasInsufficientMemMatB = -1010,
+  clblasInsufficientMemMatC = -1009,
+  clblasInsufficientMemVecX = -1008,
+  clblasInsufficientMemVecY = -1007,
+} clblasStatus;
+
+/** @endcond */
+
+int load_libclblas(error *);
+
+/** @cond NEVER */
+#define DEF_PROC(ret, name, args) typedef ret t##name args
+
+#include "libclblas.fn"
+
+#undef DEF_PROC
+
+#define DEF_PROC(ret, name, args) extern t##name *name
+
+#include "libclblas.fn"
+
+#undef DEF_PROC
+
+/** @endcond */
+
+#endif
diff --git a/src/loaders/libclblast.c b/src/loaders/libclblast.c
new file mode 100644
index 0000000000..759b9cd476
--- /dev/null
+++ b/src/loaders/libclblast.c
@@ -0,0 +1,45 @@
+#include <stdlib.h>
+
+#include "libclblast.h"
+#include "dyn_load.h"
+#include "gpuarray/error.h"
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+static const char libname[] = "clblast.dll";
+#else /* Unix */
+#ifdef __APPLE__
+static const char libname[] = "libclblast.dylib";
+#else
+static const char libname[] = "libclblast.so";
+#endif
+#endif
+
+#define DEF_PROC(ret, name, args) t##name *name
+
+#include "libclblast.fn"
+
+#undef DEF_PROC
+
+#define DEF_PROC(ret, name, args)                 \
+  name = (t##name *)ga_func_ptr(lib, #name, e);   \
+  if (name == NULL) {                             \
+    return e->code;                               \
+  }
+
+static int loaded = 0;
+
+int load_libclblast(error *e) {
+  void *lib; /* the DEF_PROC expansions included below refer to this exact name */
+
+  /* Symbols stay resolved after the first successful call, so return early. */
+  if (loaded) return GA_NO_ERROR;
+
+  if ((lib = ga_load_library(libname, e)) == NULL)
+    return e->code;
+
+  /* Resolve each listed function; any failed lookup returns e->code right away. */
+  #include "libclblast.fn"
+  loaded = 1; /* only reached when all lookups succeeded */
+
+  return GA_NO_ERROR;
+}
diff --git a/src/loaders/libclblast.fn b/src/loaders/libclblast.fn
new file mode 100644
index 0000000000..2eb029937b
--- /dev/null
+++ b/src/loaders/libclblast.fn
@@ -0,0 +1,12 @@
+DEF_PROC(CLBlastStatusCode, CLBlastHdot, (const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event));
+DEF_PROC(CLBlastStatusCode, CLBlastSdot, (const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event));
+DEF_PROC(CLBlastStatusCode, CLBlastDdot, (const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event));
+DEF_PROC(CLBlastStatusCode, CLBlastHgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_half alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_half beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event));
+DEF_PROC(CLBlastStatusCode, CLBlastSgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event));
+DEF_PROC(CLBlastStatusCode, CLBlastDgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event));
+DEF_PROC(CLBlastStatusCode, CLBlastHgemv, (Layout order, Transpose transA, size_t M, size_t N, cl_half alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, size_t incx, cl_half beta, cl_mem y, size_t offy, size_t incy, cl_command_queue *queue, cl_event *event)); /* increments are size_t in CLBlast's C API, like the dot entries */
+DEF_PROC(CLBlastStatusCode, CLBlastSgemv, (Layout order, Transpose transA, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, size_t incx, cl_float beta, cl_mem y, size_t offy, size_t incy, cl_command_queue *queue, cl_event *event));
+DEF_PROC(CLBlastStatusCode, CLBlastDgemv, (Layout order, Transpose transA, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, size_t incx, cl_double beta, cl_mem y, size_t offy, size_t incy, cl_command_queue *queue, cl_event *event));
+DEF_PROC(CLBlastStatusCode, CLBlastHger, (Layout order, size_t M, size_t N, cl_half alpha, const cl_mem X, size_t offx, size_t incx, const cl_mem Y, size_t offy, size_t incy, cl_mem A, size_t offa, size_t lda, cl_command_queue *queue, cl_event *event)); /* size_t increments here too: the pointer type must match the library's prototype */
+DEF_PROC(CLBlastStatusCode, CLBlastSger, (Layout order, size_t M, size_t N, cl_float alpha, const cl_mem X, size_t offx, size_t incx, const cl_mem Y, size_t offy, size_t incy, cl_mem A, size_t offa, size_t lda, cl_command_queue *queue, cl_event *event));
+DEF_PROC(CLBlastStatusCode, CLBlastDger, (Layout order, size_t M, size_t N, cl_double alpha, const cl_mem X, size_t offx, size_t incx, const cl_mem Y, size_t offy, size_t incy, cl_mem A, size_t offa, size_t lda, cl_command_queue *queue, cl_event *event));
diff --git a/src/loaders/libclblast.h b/src/loaders/libclblast.h
new file mode 100644
index 0000000000..07ea817260
--- /dev/null
+++ b/src/loaders/libclblast.h
@@ -0,0 +1,71 @@
+#ifndef LOADER_LIBCLBLAST_H
+#define LOADER_LIBCLBLAST_H
+
+#include "util/error.h"
+#include "libopencl.h"
+
+/** @cond NEVER */
+
+typedef enum Layout_ {
+  kRowMajor = 101,
+  kColMajor = 102
+} Layout;
+
+typedef enum Transpose_ {
+  kNo = 111,
+  kYes = 112,
+  kConjugate = 113
+} Transpose;
+
+typedef enum CLBLastStatusCode_ {
+  kSuccess = 0,
+  /* Rest is not exposed from here */
+  CLBlastNotImplemented            = -1024,
+  CLBlastInvalidMatrixA            = -1022,
+  CLBlastInvalidMatrixB            = -1021,
+  CLBlastInvalidMatrixC            = -1020,
+  CLBlastInvalidVectorX            = -1019,
+  CLBlastInvalidVectorY            = -1018,
+  CLBlastInvalidDimension          = -1017,
+  CLBlastInvalidLeadDimA           = -1016,
+  CLBlastInvalidLeadDimB           = -1015,
+  CLBlastInvalidLeadDimC           = -1014,
+  CLBlastInvalidIncrementX         = -1013,
+  CLBlastInvalidIncrementY         = -1012,
+  CLBlastInsufficientMemoryA       = -1011,
+  CLBlastInsufficientMemoryB       = -1010,
+  CLBlastInsufficientMemoryC       = -1009,
+  CLBlastInsufficientMemoryX       = -1008,
+  CLBlastInsufficientMemoryY       = -1007,
+
+  CLBlastInvalidLocalMemUsage      = -2046,
+  CLBlastNoHalfPrecision           = -2045,
+  CLBlastNoDoublePrecision         = -2044,
+  CLBlastInvalidVectorScalar       = -2043,
+  CLBlastInsufficientMemoryScalar  = -2042,
+  CLBlastDatabaseError             = -2041,
+  CLBlastUnknownError              = -2040,
+  CLBlastUnexpectedError           = -2039,
+} CLBlastStatusCode;
+
+/** @endcond */
+
+int load_libclblast(error *);
+
+/** @cond NEVER */
+
+#define DEF_PROC(ret, name, args) typedef ret t##name args
+
+#include "libclblast.fn"
+
+#undef DEF_PROC
+
+#define DEF_PROC(ret, name, args) extern t##name *name
+
+#include "libclblast.fn"
+
+#undef DEF_PROC
+
+/** @endcond */
+
+#endif
diff --git a/src/loaders/libcublas.c b/src/loaders/libcublas.c
new file mode 100644
index 0000000000..b810f10bc2
--- /dev/null
+++ b/src/loaders/libcublas.c
@@ -0,0 +1,79 @@
+/* To be able to use snprintf with any compiler including MSVC2008. */
+#include <private_config.h>
+
+#include "libcublas.h"
+#include "dyn_load.h"
+#include "gpuarray/error.h"
+
+#define DEF_PROC(name, args) t##name *name
+#define DEF_PROC_V2(name, args) DEF_PROC(name, args)
+#define DEF_PROC_OPT(name, args) DEF_PROC(name, args)
+
+#include "libcublas.fn"
+
+#undef DEF_PROC_OPT
+#undef DEF_PROC_V2
+#undef DEF_PROC
+
+#define STRINGIFY(X) #X
+
+#define DEF_PROC(name, args)                      \
+  name = (t##name *)ga_func_ptr(lib, #name, e);   \
+  if (name == NULL) {                             \
+    return e->code;                               \
+  }
+
+#define DEF_PROC_OPT(name, args)                \
+  name = (t##name *)ga_func_ptr(lib, #name, e);
+
+#define DEF_PROC_V2(name, args)                                   \
+  name = (t##name *)ga_func_ptr(lib, STRINGIFY(name##_v2), e);    \
+  if (name == NULL) {                                             \
+    return e->code;                                               \
+  }
+
+static int loaded = 0;
+
+int load_libcublas(int major, int minor, error *e) { /* resolve the cuBLAS entry points once per process */
+  void *lib; /* referenced by name in the DEF_PROC* expansions included below */
+
+  if (loaded) /* set only at the end of an earlier fully successful call */
+    return GA_NO_ERROR;
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+  {
+    char libname[64];
+    int n;
+    #ifdef DEBUG
+    fprintf(stderr, "Loading cuBLAS %d.%d.\n", major, minor);
+    #endif
+    n = snprintf(libname, sizeof(libname), "cublas64_%d%d.dll", major, minor); /* e.g. 8, 0 -> "cublas64_80.dll" */
+    if (n < 0 || n >= sizeof(libname)) /* snprintf failed, or the name did not fit the buffer */
+      return error_set(e, GA_SYS_ERROR, "snprintf");
+    lib = ga_load_library(libname, e);
+  }
+#else /* Unix */
+#ifdef __APPLE__
+  {
+    char libname[128];
+    int n;
+    #ifdef DEBUG
+    fprintf(stderr, "Loading cuBLAS %d.%d.\n", major, minor);
+    #endif
+    n = snprintf(libname, sizeof(libname), "/Developer/NVIDIA/CUDA-%d.%d/lib/libcublas.dylib", major, minor);
+    if (n < 0 || n >= sizeof(libname)) /* snprintf failed, or the path did not fit the buffer */
+      return error_set(e, GA_SYS_ERROR, "snprintf");
+    lib = ga_load_library(libname, e);
+  }
+#else
+  lib = ga_load_library("libcublas.so", e); /* major and minor are not used on this branch */
+#endif
+#endif
+  if (lib == NULL) /* ga_load_library failed on whichever branch was compiled */
+    return e->code;
+
+#include "libcublas.fn" /* DEF_PROC/DEF_PROC_V2 return e->code on a failed lookup; DEF_PROC_OPT does not */
+
+  loaded = 1; /* reached only when no mandatory lookup returned early */
+  return GA_NO_ERROR;
+}
diff --git a/src/loaders/libcublas.fn b/src/loaders/libcublas.fn
new file mode 100644
index 0000000000..c97bc52386
--- /dev/null
+++ b/src/loaders/libcublas.fn
@@ -0,0 +1,31 @@
+DEF_PROC_V2(cublasCreate, (cublasHandle_t *handle));
+DEF_PROC_V2(cublasDestroy, (cublasHandle_t handle));
+
+DEF_PROC_V2(cublasSetStream, (cublasHandle_t handle, cudaStream_t streamId));
+DEF_PROC_V2(cublasSetPointerMode, (cublasHandle_t handle, cublasPointerMode_t mode));
+DEF_PROC_V2(cublasGetPointerMode, (cublasHandle_t handle, cublasPointerMode_t *mode));
+DEF_PROC(cublasSetAtomicsMode, (cublasHandle_t handle, cublasAtomicsMode_t mode));
+
+
+DEF_PROC_V2(cublasSdot, (cublasHandle_t handle, int n, const float *x, int incx, const float *y, int incy, float *result));
+DEF_PROC_V2(cublasDdot, (cublasHandle_t handle, int n, const double *x, int incx, const double *y, int incy, double *result));
+
+DEF_PROC_V2(cublasSgemm, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha,  const float *A, int lda, const float *B, int ldb, const float *beta, float *C, int ldc));
+DEF_PROC_V2(cublasDgemm, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double *alpha,  const double *A, int lda, const double *B, int ldb, const double *beta, double *C, int ldc));
+
+DEF_PROC_V2(cublasSgemv, (cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float *alpha, const float *A, int lda, const float *x, int incx, const float *beta, float *y, int incy));
+DEF_PROC_V2(cublasDgemv, (cublasHandle_t handle, cublasOperation_t trans, int m, int n, const double *alpha, const double *A, int lda, const double *x, int incx, const double *beta, double *y, int incy));
+
+DEF_PROC_V2(cublasSger, (cublasHandle_t handle, int m, int n, const float *alpha, const float *x, int incx, const float *y, int incy, float *A, int lda));
+DEF_PROC_V2(cublasDger, (cublasHandle_t handle, int m, int n, const double *alpha, const double *x, int incx, const double *y, int incy, double *A, int lda));
+
+DEF_PROC_OPT(cublasSgemmEx, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const void *A, cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb, const float *beta, void *C, cudaDataType Ctype, int ldc));
+
+DEF_PROC_OPT(cublasGemmEx, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const void *alpha, const void *A, cudaDataType_t Atype, int lda, const void *B, cudaDataType_t Btype, int ldb, const void *beta, void *C, cudaDataType_t Ctype, int ldc, cudaDataType_t computeType, cublasGemmAlgo_t algo));
+
+DEF_PROC(cublasSgemmBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *Aarray[], int lda, const float *Barray[], int ldb, const float *beta, float *Carray[], int ldc, int batchCount));
+DEF_PROC(cublasDgemmBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double *alpha, const double *Aarray[], int lda, const double *Barray[], int ldb, const double *beta, double *Carray[], int ldc, int batchCount));
+
+DEF_PROC_OPT(cublasHgemmStridedBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half *alpha, const __half *A, int lda, long long int strideA, const __half *B, int ldb, long long int strideB, const __half *beta, __half *C, int ldc, long long int strideC, int batchCount));
+DEF_PROC_OPT(cublasSgemmStridedBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, long long int strideA, const float *B, int ldb, long long int strideB, const float *beta, float *C, int ldc, long long int strideC, int batchCount));
+DEF_PROC_OPT(cublasDgemmStridedBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double *alpha, const double *A, int lda, long long int strideA, const double *B, int ldb, long long int strideB, const double *beta, double *C, int ldc, long long int strideC, int batchCount));
diff --git a/src/loaders/libcublas.h b/src/loaders/libcublas.h
new file mode 100644
index 0000000000..4ebf4e44f8
--- /dev/null
+++ b/src/loaders/libcublas.h
@@ -0,0 +1,126 @@
+#ifndef LOADER_LIBCUBLAS_H
+#define LOADER_LIBCUBLAS_H
+
+#include "util/error.h"
+//TODO: how to have it work with align?
+typedef struct {//__align__(2) {
+  unsigned short x;
+} __half;
+
+
+/** @cond NEVER */
+
+#ifdef _WIN32
+#define CUBLASWINAPI __stdcall
+#else
+#define CUBLASWINAPI
+#endif
+
+typedef enum cudaDataType_t
+{
+  CUDA_R_16F= 2, // real as a half
+  CUDA_C_16F= 6, // complex as a pair of half numbers
+  CUDA_R_32F= 0, // real as a float
+  CUDA_C_32F= 4, // complex as a pair of float numbers
+  CUDA_R_64F= 1, // real as a double
+  CUDA_C_64F= 5, // complex as a pair of double numbers
+  CUDA_R_8I= 3,  // real as a signed char
+  CUDA_C_8I= 7,   // complex as a pair of signed char numbers
+  CUDA_R_8U= 8,  // real as a unsigned char
+  CUDA_C_8U= 9,  // complex as a pair of unsigned char numbers
+  CUDA_R_32I= 10,  // real as a signed int
+  CUDA_C_32I= 11,  // complex as a pair of signed int numbers
+  CUDA_R_32U= 12,  // real as a unsigned int
+  CUDA_C_32U= 13   // complex as a pair of unsigned int numbers
+} cudaDataType;
+
+typedef cudaDataType cudaDataType_t;
+
+typedef enum {
+    CUBLAS_GEMM_DFALT               = -1,
+    CUBLAS_GEMM_ALGO0               =  0,
+    CUBLAS_GEMM_ALGO1               =  1,
+    CUBLAS_GEMM_ALGO2               =  2,
+    CUBLAS_GEMM_ALGO3               =  3,
+    CUBLAS_GEMM_ALGO4               =  4,
+    CUBLAS_GEMM_ALGO5               =  5,
+    CUBLAS_GEMM_ALGO6               =  6,
+    CUBLAS_GEMM_ALGO7               =  7,
+    CUBLAS_GEMM_ALGO8               =  8,
+    CUBLAS_GEMM_ALGO9               =  9,
+    CUBLAS_GEMM_ALGO10              =  10,
+    CUBLAS_GEMM_ALGO11              =  11,
+    CUBLAS_GEMM_ALGO12              =  12,
+    CUBLAS_GEMM_ALGO13              =  13,
+    CUBLAS_GEMM_ALGO14              =  14,
+    CUBLAS_GEMM_ALGO15              =  15,
+    CUBLAS_GEMM_ALGO16              =  16,
+    CUBLAS_GEMM_ALGO17              =  17,
+    CUBLAS_GEMM_DFALT_TENSOR_OP     =  99,
+    CUBLAS_GEMM_ALGO0_TENSOR_OP     =  100,
+    CUBLAS_GEMM_ALGO1_TENSOR_OP     =  101,
+    CUBLAS_GEMM_ALGO2_TENSOR_OP     =  102
+} cublasGemmAlgo_t;
+
+typedef struct CUstream_st *cudaStream_t;
+
+typedef enum {
+  CUBLAS_STATUS_SUCCESS         =0,
+  CUBLAS_STATUS_NOT_INITIALIZED =1,
+  CUBLAS_STATUS_ALLOC_FAILED    =3,
+  CUBLAS_STATUS_INVALID_VALUE   =7,
+  CUBLAS_STATUS_ARCH_MISMATCH   =8,
+  CUBLAS_STATUS_MAPPING_ERROR   =11,
+  CUBLAS_STATUS_EXECUTION_FAILED=13,
+  CUBLAS_STATUS_INTERNAL_ERROR  =14,
+  CUBLAS_STATUS_NOT_SUPPORTED   =15,
+  CUBLAS_STATUS_LICENSE_ERROR   =16
+} cublasStatus_t;
+
+typedef enum {
+  CUBLAS_OP_N=0,
+  CUBLAS_OP_T=1,
+  CUBLAS_OP_C=2
+} cublasOperation_t;
+
+typedef enum {
+  CUBLAS_POINTER_MODE_HOST   = 0,
+  CUBLAS_POINTER_MODE_DEVICE = 1
+} cublasPointerMode_t;
+
+typedef enum {
+  CUBLAS_ATOMICS_NOT_ALLOWED   = 0,
+  CUBLAS_ATOMICS_ALLOWED       = 1
+} cublasAtomicsMode_t;
+
+typedef struct cublasContext *cublasHandle_t;
+
+/** @endcond */
+
+int load_libcublas(int major, int minor, error *e);
+
+/** @cond NEVER */
+
+#define DEF_PROC(name, args) typedef cublasStatus_t CUBLASWINAPI t##name args
+#define DEF_PROC_V2(name, args) DEF_PROC(name, args)
+#define DEF_PROC_OPT(name, args) DEF_PROC(name, args)
+
+#include "libcublas.fn"
+
+#undef DEF_PROC_OPT
+#undef DEF_PROC_V2
+#undef DEF_PROC
+
+#define DEF_PROC(name, args) extern t##name *name
+#define DEF_PROC_V2(name, args) DEF_PROC(name, args)
+#define DEF_PROC_OPT(name, args) DEF_PROC(name, args)
+
+#include "libcublas.fn"
+
+#undef DEF_PROC_OPT
+#undef DEF_PROC_V2
+#undef DEF_PROC
+
+/** @endcond */
+
+#endif
diff --git a/src/loaders/libcuda.c b/src/loaders/libcuda.c
new file mode 100644
index 0000000000..729c832a62
--- /dev/null
+++ b/src/loaders/libcuda.c
@@ -0,0 +1,58 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "libcuda.h"
+#include "dyn_load.h"
+#include "gpuarray/error.h"
+#include "util/error.h"
+
+/* This code is inspired from the dynamic loading code in the samples */
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+static char libname[] = "nvcuda.dll";
+#else /* Unix */
+#ifdef __APPLE__
+static char libname[] = "/Library/Frameworks/CUDA.framework/CUDA";
+#else
+static char libname[] = "libcuda.so";
+#endif
+#endif
+
+#define DEF_PROC(name, args) t##name *name
+#define DEF_PROC_V2(name, args) DEF_PROC(name, args)
+
+#include "libcuda.fn"
+
+#undef DEF_PROC_V2
+#undef DEF_PROC
+
+#define STRINGIFY(X) #X
+
+#define DEF_PROC(name, args)                    \
+  name = (t##name *)ga_func_ptr(lib, #name, e); \
+  if (name == NULL) {                           \
+    return e->code;                             \
+  }
+
+#define DEF_PROC_V2(name, args)                                \
+  name = (t##name *)ga_func_ptr(lib, STRINGIFY(name##_v2), e); \
+  if (name == NULL) {                                          \
+    return e->code;                                            \
+  }
+
+static int loaded = 0;
+
+int load_libcuda(error *e) {
+  void *lib; /* name is fixed: DEF_PROC and DEF_PROC_V2 expand to code using `lib` */
+
+  /* Entry points remain bound after one successful load; skip the work. */
+  if (loaded) return GA_NO_ERROR;
+
+  if ((lib = ga_load_library(libname, e)) == NULL)
+    return e->code;
+
+  /* DEF_PROC looks up `name`, DEF_PROC_V2 `name_v2`; a miss returns e->code. */
+  #include "libcuda.fn"
+  loaded = 1; /* every lookup above succeeded */
+
+  return GA_NO_ERROR;
+}
diff --git a/src/loaders/libcuda.fn b/src/loaders/libcuda.fn
new file mode 100644
index 0000000000..b65cde22b5
--- /dev/null
+++ b/src/loaders/libcuda.fn
@@ -0,0 +1,59 @@
+DEF_PROC(cuInit, (int flags));
+DEF_PROC(cuDriverGetVersion, (int *driverVersion));
+DEF_PROC(cuGetErrorName, (CUresult error, const char **pStr));
+DEF_PROC(cuGetErrorString, (CUresult error, const char **pStr));
+
+DEF_PROC(cuDeviceGet, (CUdevice *device, int ordinal));
+DEF_PROC(cuDeviceGetCount, (int *count));
+DEF_PROC(cuDeviceGetName, (char *name, int len, CUdevice dev));
+DEF_PROC(cuDeviceGetAttribute, (int *pi, CUdevice_attribute attrib, CUdevice dev));
+DEF_PROC(cuDeviceGetPCIBusId, (char *pciBusId, int len, CUdevice dev));
+
+DEF_PROC(cuDevicePrimaryCtxGetState, (CUdevice dev, unsigned int *flags, int *active));
+DEF_PROC(cuDevicePrimaryCtxSetFlags, (CUdevice dev, unsigned int flags));
+DEF_PROC(cuDevicePrimaryCtxRelease, (CUdevice dev));
+DEF_PROC(cuDevicePrimaryCtxRetain, (CUcontext *pctx, CUdevice dev));
+
+DEF_PROC(cuCtxGetDevice, (CUdevice *device));
+DEF_PROC_V2(cuCtxPushCurrent, (CUcontext ctx));
+DEF_PROC_V2(cuCtxPopCurrent, (CUcontext *pctx));
+
+DEF_PROC(cuLinkCreate, (unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut));
+DEF_PROC(cuLinkAddData, (CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues));
+DEF_PROC(cuLinkComplete, (CUlinkState state, void **cubinOut, size_t *sizeOut));
+DEF_PROC(cuLinkDestroy, (CUlinkState state));
+DEF_PROC(cuModuleLoadData, (CUmodule *module, const void *image));
+DEF_PROC(cuModuleLoadDataEx, (CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues));
+DEF_PROC(cuModuleUnload, (CUmodule hmod));
+DEF_PROC(cuModuleGetFunction, (CUfunction *hfunc, CUmodule hmod, const char *name));
+
+DEF_PROC_V2(cuMemGetInfo, (size_t *free, size_t *total));
+DEF_PROC_V2(cuMemAlloc, (CUdeviceptr *dptr, size_t bytesize));
+DEF_PROC_V2(cuMemFree, (CUdeviceptr dptr));
+DEF_PROC_V2(cuMemAllocHost, (void **pp, size_t bytesize));
+DEF_PROC(cuMemFreeHost, (void *p));
+
+DEF_PROC_V2(cuMemcpyHtoDAsync, (CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream));
+DEF_PROC_V2(cuMemcpyHtoD, (CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount));
+DEF_PROC_V2(cuMemcpyDtoHAsync, (void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream));
+DEF_PROC_V2(cuMemcpyDtoDAsync, (CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream));
+DEF_PROC(cuMemcpyPeerAsync, (CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream));
+DEF_PROC(cuMemsetD8Async, (CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream));
+
+DEF_PROC(cuLaunchKernel, (CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra));
+
+DEF_PROC(cuFuncGetAttribute, (int *pi, CUfunction_attribute attrib, CUfunction hfunc));
+
+DEF_PROC(cuEventCreate, (CUevent *phEvent, unsigned int Flags));
+DEF_PROC(cuEventRecord, (CUevent hEvent, CUstream hStream));
+DEF_PROC(cuEventSynchronize, (CUevent hEvent));
+DEF_PROC_V2(cuEventDestroy, (CUevent hEvent));
+
+DEF_PROC(cuStreamCreate, (CUstream *phStream, unsigned int Flags));
+DEF_PROC(cuStreamWaitEvent, (CUstream hStream, CUevent hEvent, unsigned int Flags));
+DEF_PROC(cuStreamSynchronize, (CUstream hStream));
+DEF_PROC_V2(cuStreamDestroy, (CUstream hStream));
+
+DEF_PROC(cuIpcGetMemHandle, (CUipcMemHandle *pHandle, CUdeviceptr dptr));
+DEF_PROC(cuIpcOpenMemHandle, (CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags));
+DEF_PROC(cuIpcCloseMemHandle, (CUdeviceptr dptr));
diff --git a/src/loaders/libcuda.h b/src/loaders/libcuda.h
new file mode 100644
index 0000000000..c8fc7ca968
--- /dev/null
+++ b/src/loaders/libcuda.h
@@ -0,0 +1,230 @@
+#ifndef LOADER_LIBCUDA_H
+#define LOADER_LIBCUDA_H
+
+#include "util/error.h"
+
+/** @cond NEVER */
+
+#ifdef _WIN32
+#define CUDAAPI __stdcall
+#else
+#define CUDAAPI
+#endif
+
+typedef enum {  /* return type of every function declared via DEF_PROC; only success is listed */
+  CUDA_SUCCESS = 0
+} CUresult;
+
+#if defined(_WIN64) || defined(__LP64__)
+typedef unsigned long long CUdeviceptr;
+#else
+typedef unsigned int CUdeviceptr;
+#endif
+
+typedef int CUdevice;
+typedef struct CUctx_st *CUcontext;
+typedef struct CUmod_st *CUmodule;
+typedef struct CUfunc_st *CUfunction;
+typedef struct CUevent_st *CUevent;
+typedef struct CUstream_st *CUstream;
+typedef struct CUlinkState_st *CUlinkState;
+
+typedef enum CUdevice_attribute_enum CUdevice_attribute;
+typedef enum CUfunction_attribute_enum CUfunction_attribute;
+typedef enum CUevent_flags_enum CUevent_flags;
+typedef enum CUctx_flags_enum CUctx_flags;
+typedef enum CUipcMem_flags_enum CUipcMem_flags;
+typedef enum CUjit_option_enum CUjit_option;
+typedef enum CUjitInputType_enum CUjitInputType;
+
+#define CU_IPC_HANDLE_SIZE 64
+
+typedef struct CUipcMemHandle_st {  /* fixed-size blob of CU_IPC_HANDLE_SIZE bytes */
+  char reserved[CU_IPC_HANDLE_SIZE];
+} CUipcMemHandle;
+
+/** @endcond */
+
+int load_libcuda(error *);
+
+/** @cond NEVER */
+
+#define DEF_PROC(name, args) typedef CUresult CUDAAPI t##name args
+#define DEF_PROC_V2(name, args) DEF_PROC(name, args)
+
+#include "libcuda.fn"
+
+#undef DEF_PROC_V2
+#undef DEF_PROC
+
+#define DEF_PROC(name, args) extern t##name *name
+#define DEF_PROC_V2(name, args) DEF_PROC(name, args)
+
+#include "libcuda.fn"
+
+#undef DEF_PROC_V2
+#undef DEF_PROC
+
+enum CUdevice_attribute_enum {  /* values of the CUdevice_attribute typedef declared earlier */
+  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,
+  CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,
+  CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,
+  CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,
+  CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,
+  CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,
+  CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,
+  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,
+  CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,  /* same value as MAX_SHARED_MEMORY_PER_BLOCK */
+  CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,
+  CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
+  CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,
+  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,
+  CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,  /* same value as MAX_REGISTERS_PER_BLOCK */
+  CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,
+  CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,
+  CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,
+  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,
+  CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,
+  CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,
+  CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,
+  CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,  /* same value as TEXTURE2D_LAYERED_WIDTH */
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,  /* same value as TEXTURE2D_LAYERED_HEIGHT */
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29,  /* same value as TEXTURE2D_LAYERED_LAYERS */
+  CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,
+  CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,
+  CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,
+  CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,
+  CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,
+  CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35,
+  CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36,
+  CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37,
+  CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38,
+  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
+  CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
+  CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43,
+  CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49,
+  CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50,
+  CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,
+  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
+  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
+  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77,
+  CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78,
+  CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79,
+  CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80,
+  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81,
+  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82,
+  CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83,
+  CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84,
+  CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85,
+  CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86,
+  CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87,
+  CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88,
+  CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89,
+  CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90,
+  CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91
+};
+
+enum CUfunction_attribute_enum {  /* values for the attrib argument of cuFuncGetAttribute */
+  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
+  CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
+  CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,
+  CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,
+  CU_FUNC_ATTRIBUTE_NUM_REGS = 4,
+  CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,
+  CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,
+  CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7
+};
+
+enum CUevent_flags_enum {  /* values of CUevent_flags: 0 plus three distinct single-bit flags */
+  CU_EVENT_DEFAULT        = 0x0,
+  CU_EVENT_BLOCKING_SYNC  = 0x1,
+  CU_EVENT_DISABLE_TIMING = 0x2,
+  CU_EVENT_INTERPROCESS   = 0x4
+};
+
+enum CUctx_flags_enum {  /* values of the CUctx_flags typedef declared earlier */
+  CU_CTX_SCHED_AUTO          = 0x00,
+  CU_CTX_SCHED_SPIN          = 0x01,
+  CU_CTX_SCHED_YIELD         = 0x02,
+  CU_CTX_SCHED_BLOCKING_SYNC = 0x04,
+  CU_CTX_BLOCKING_SYNC       = 0x04,  /* same value as CU_CTX_SCHED_BLOCKING_SYNC */
+  CU_CTX_MAP_HOST            = 0x08   /* last enumerator: no trailing comma (not valid C89) */
+};
+
+enum CUipcMem_flags_enum {  /* values of CUipcMem_flags; a single flag is declared */
+  CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1
+};
+
+enum CUjit_option_enum {  /* values of CUjit_option; numbering starts at 0 and auto-increments */
+    CU_JIT_MAX_REGISTERS = 0,
+    CU_JIT_THREADS_PER_BLOCK,
+    CU_JIT_WALL_TIME,
+    CU_JIT_INFO_LOG_BUFFER,
+    CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
+    CU_JIT_ERROR_LOG_BUFFER,
+    CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
+    CU_JIT_OPTIMIZATION_LEVEL,
+    CU_JIT_TARGET_FROM_CUCONTEXT,
+    CU_JIT_TARGET,
+    CU_JIT_FALLBACK_STRATEGY,
+    CU_JIT_GENERATE_DEBUG_INFO,
+    CU_JIT_LOG_VERBOSE,
+    CU_JIT_GENERATE_LINE_INFO,
+    CU_JIT_CACHE_MODE,
+    CU_JIT_NEW_SM3X_OPT,
+    CU_JIT_FAST_COMPILE,
+    CU_JIT_NUM_OPTIONS  /* evaluates to 17, the number of options listed above */
+};
+
+enum CUjitInputType_enum {  /* values of CUjitInputType; numbering starts at 0 and auto-increments */
+    CU_JIT_INPUT_CUBIN = 0,
+    CU_JIT_INPUT_PTX,
+    CU_JIT_INPUT_FATBINARY,
+    CU_JIT_INPUT_OBJECT,
+    CU_JIT_INPUT_LIBRARY,
+    CU_JIT_NUM_INPUT_TYPES  /* evaluates to 5, the number of input types listed above */
+};
+
+/** @endcond */
+
+#endif
diff --git a/src/loaders/libnccl.c b/src/loaders/libnccl.c
new file mode 100644
index 0000000000..08d5643330
--- /dev/null
+++ b/src/loaders/libnccl.c
@@ -0,0 +1,49 @@
+#include <stdlib.h>
+
+#include "libnccl.h"
+#include "dyn_load.h"
+#include "gpuarray/error.h"
+
+#define DEF_PROC(ret, name, args) t##name *name
+
+#include "libnccl.fn"
+
+#undef DEF_PROC
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) || defined(__APPLE__)
+/* As far as we know, nccl is not available or buildable on platforms
+   other than linux */
+int load_libnccl(error *e) {
+  return error_set(e, GA_UNSUPPORTED_ERROR,
+                   "NCCL is not available on plaforms other than linux.");
+}
+#else /* Unix */
+static const char libname[] = "libnccl.so";
+
+#define DEF_PROC(ret, name, args)                 \
+  name = (t##name *)ga_func_ptr(lib, #name, e);   \
+  if (name == NULL) {                             \
+    return e->code;                               \
+  }
+
+static int loaded = 0;
+
+int load_libnccl(error *e) {  /* resolve the NCCL entry points once; returns GA_NO_ERROR or an error code */
+  void *lib;
+  /* A previous call already resolved everything: nothing to do. */
+  if (loaded)
+    return GA_NO_ERROR;
+  /* On failure e->code is returned (presumably filled in by ga_load_library; confirm in dyn_load). */
+  lib = ga_load_library(libname, e);
+  if (lib == NULL)
+    return e->code;
+  /* Each DEF_PROC line expands to a ga_func_ptr lookup that returns e->code when it yields NULL. */
+  #include "libnccl.fn"
+  /* ncclGroupStart is only probed, never stored; its absence is reported as an NCCL 1.0 install. */
+  if (ga_func_ptr(lib, "ncclGroupStart", e) == NULL)
+    return error_set(e, GA_LOAD_ERROR, "Found NCCL 1.0 but NCCL 2.0 required");
+  /* Set only after every lookup succeeded, so a failed attempt is retried on the next call. */
+  loaded = 1;
+  return GA_NO_ERROR;
+}
+#endif
diff --git a/src/loaders/libnccl.fn b/src/loaders/libnccl.fn
new file mode 100644
index 0000000000..caf365b849
--- /dev/null
+++ b/src/loaders/libnccl.fn
@@ -0,0 +1,11 @@
+DEF_PROC(ncclResult_t, ncclGetUniqueId, (ncclUniqueId* uniqueId));
+DEF_PROC(ncclResult_t, ncclCommInitRank, (ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank));
+DEF_PROC(void, ncclCommDestroy, (ncclComm_t comm));
+DEF_PROC(ncclResult_t, ncclCommCount, (const ncclComm_t comm, int* count));
+DEF_PROC(ncclResult_t, ncclCommUserRank, (const ncclComm_t comm, int* rank));
+DEF_PROC(const char*, ncclGetErrorString, (ncclResult_t result));
+DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream));
+DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream ));
+DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream));
+DEF_PROC(ncclResult_t, ncclBcast, (void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream ));
+DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream));
diff --git a/src/loaders/libnccl.h b/src/loaders/libnccl.h
new file mode 100644
index 0000000000..7d70138e03
--- /dev/null
+++ b/src/loaders/libnccl.h
@@ -0,0 +1,54 @@
+#ifndef LOADER_LIBNCCL_H
+#define LOADER_LIBNCCL_H
+
+#include "util/error.h"
+
+/** @cond NEVER */
+
+typedef struct CUstream_st *cudaStream_t;
+typedef struct ncclComm* ncclComm_t;
+
+#define NCCL_UNIQUE_ID_BYTES 128
+typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
+
+typedef enum { ncclSuccess = 0 } ncclResult_t;
+
+/* Reduction operation selector */
+typedef enum { ncclSum        = 0,
+               ncclProd       = 1,
+               ncclMax        = 2,
+               ncclMin        = 3,
+               ncclNumOps     = 4 } ncclRedOp_t;  /* ncclNumOps equals the number of operations listed */
+/* Data types */
+typedef enum { ncclInt8       = 0, ncclChar       = 0,  /* two names on one line share one value */
+               ncclUint8      = 1,
+               ncclInt32      = 2, ncclInt        = 2,
+               ncclUint32     = 3,
+               ncclInt64      = 4,
+               ncclUint64     = 5,
+               ncclFloat16    = 6, ncclHalf       = 6,
+               ncclFloat32    = 7, ncclFloat      = 7,
+               ncclFloat64    = 8, ncclDouble     = 8,
+               ncclNumTypes   = 9 } ncclDataType_t;  /* ncclNumTypes equals the number of distinct values */
+
+/** @endcond */
+
+int load_libnccl(error *e);
+
+/** @cond NEVER */
+
+#define DEF_PROC(ret, name, args) typedef ret t##name args
+
+#include "libnccl.fn"
+
+#undef DEF_PROC
+
+#define DEF_PROC(ret, name, args) extern t##name *name
+
+#include "libnccl.fn"
+
+#undef DEF_PROC
+
+/** @endcond */
+
+#endif
diff --git a/src/loaders/libnvrtc.c b/src/loaders/libnvrtc.c
new file mode 100644
index 0000000000..ef052b79c3
--- /dev/null
+++ b/src/loaders/libnvrtc.c
@@ -0,0 +1,66 @@
+/* To be able to use snprintf with any compiler including MSVC2008. */
+#include <private_config.h>
+
+#include "libcuda.h"
+#include "libnvrtc.h"
+#include "dyn_load.h"
+#include "gpuarray/error.h"
+
+#define DEF_PROC(rt, name, args) t##name *name
+
+#include "libnvrtc.fn"
+
+#undef DEF_PROC
+
+#define DEF_PROC(rt, name, args)                  \
+  name = (t##name *)ga_func_ptr(lib, #name, e);   \
+  if (name == NULL) {                             \
+    return e->code;                               \
+  }
+
+static int loaded = 0;
+
+int load_libnvrtc(int major, int minor, error *e) {  /* resolve the NVRTC entry points once */
+  void *lib;
+  /* A previous call already succeeded; major/minor of this call are not re-checked. */
+  if (loaded)
+    return GA_NO_ERROR;
+  /* Pick the library name per platform; only Windows and macOS embed major/minor in it. */
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+  {
+    char libname[64];
+    int n;
+    #ifdef DEBUG
+    fprintf(stderr, "Loading nvrtc %d.%d.\n", major, minor);
+    #endif
+    n = snprintf(libname, sizeof(libname), "nvrtc64_%d%d.dll", major, minor);
+    if (n < 0 || n >= sizeof(libname))  /* encoding error, or name truncated (C99 snprintf semantics) */
+      return error_set(e, GA_SYS_ERROR, "snprintf");
+    /* lib is NULL-checked once, after the platform branches. */
+    lib = ga_load_library(libname, e);
+  }
+#else /* Unix */
+#ifdef __APPLE__
+  {
+    char libname[128];
+    int n;
+    #ifdef DEBUG
+    fprintf(stderr, "Loading nvrtc %d.%d.\n", major, minor);
+    #endif
+    n = snprintf(libname, sizeof(libname), "/Developer/NVIDIA/CUDA-%d.%d/lib/libnvrtc.dylib", major, minor);
+    if (n < 0 || n >= sizeof(libname))  /* encoding error, or path truncated (C99 snprintf semantics) */
+      return error_set(e, GA_SYS_ERROR, "snprintf");
+    lib = ga_load_library(libname, e);
+  }
+#else
+  lib = ga_load_library("libnvrtc.so", e);  /* major and minor are not used on this path */
+#endif
+#endif
+  if (lib == NULL)
+    return e->code;
+  /* Each DEF_PROC line expands to a ga_func_ptr lookup that returns e->code when it yields NULL. */
+  #include "libnvrtc.fn"
+  /* Set only after every lookup succeeded, so a failed attempt is retried on the next call. */
+  loaded = 1;
+  return GA_NO_ERROR;
+}
diff --git a/src/loaders/libnvrtc.fn b/src/loaders/libnvrtc.fn
new file mode 100644
index 0000000000..3f32036310
--- /dev/null
+++ b/src/loaders/libnvrtc.fn
@@ -0,0 +1,8 @@
+DEF_PROC(nvrtcResult, nvrtcCreateProgram, (nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames));
+DEF_PROC(nvrtcResult, nvrtcCompileProgram, (nvrtcProgram prog, int numOptions, const char **options));
+DEF_PROC(nvrtcResult, nvrtcDestroyProgram, (nvrtcProgram *prog));
+DEF_PROC(nvrtcResult, nvrtcGetProgramLog, (nvrtcProgram prog, char *log));
+DEF_PROC(nvrtcResult, nvrtcGetProgramLogSize, (nvrtcProgram prog, size_t *logSizeRet));
+DEF_PROC(nvrtcResult, nvrtcGetPTX, (nvrtcProgram prog, char *ptx));
+DEF_PROC(nvrtcResult, nvrtcGetPTXSize, (nvrtcProgram prog, size_t *ptxSizeRet));
+DEF_PROC(const char *, nvrtcGetErrorString, (nvrtcResult result));
diff --git a/src/loaders/libnvrtc.h b/src/loaders/libnvrtc.h
new file mode 100644
index 0000000000..4dc2f3ec67
--- /dev/null
+++ b/src/loaders/libnvrtc.h
@@ -0,0 +1,34 @@
+#ifndef LOADER_LIBNVRTC_H
+#define LOADER_LIBNVRTC_H
+
+#include "util/error.h"
+
+/** @cond NEVER */
+
+typedef enum {  /* NVRTC status type; only the success value is declared here */
+  NVRTC_SUCCESS = 0
+} nvrtcResult;
+
+typedef struct _nvrtcProgram *nvrtcProgram;
+
+/** @endcond */
+
+int load_libnvrtc(int major, int minor, error *e);
+
+/** @cond NEVER */
+
+#define DEF_PROC(rt, name, args) typedef rt t##name args
+
+#include "libnvrtc.fn"
+
+#undef DEF_PROC
+
+#define DEF_PROC(rt, name, args) extern t##name *name
+
+#include "libnvrtc.fn"
+
+#undef DEF_PROC
+
+/** @endcond */
+
+#endif
diff --git a/src/loaders/libopencl.c b/src/loaders/libopencl.c
new file mode 100644
index 0000000000..7c38abee32
--- /dev/null
+++ b/src/loaders/libopencl.c
@@ -0,0 +1,45 @@
+#include <stdlib.h>
+
+#include "libopencl.h"
+#include "dyn_load.h"
+#include "gpuarray/error.h"
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+static const char libname[] = "OpenCL.dll";  /* const: never modified, same as libname in libnccl.c */
+#else /* Unix */
+#ifdef __APPLE__
+static const char libname[] = "/System/Library/Frameworks/OpenCL.framework/OpenCL";
+#else
+static const char libname[] = "libOpenCL.so";
+#endif /* __APPLE__ */
+#endif /* Windows */
+
+#define DEF_PROC(ret, name, args) t##name *name
+
+#include "libopencl.fn"
+
+#undef DEF_PROC
+
+#define DEF_PROC(ret, name, args)                 \
+  name = (t##name *)ga_func_ptr(lib, #name, e);   \
+  if (name == NULL) {                             \
+    return e->code;                               \
+  }
+
+static int loaded = 0;
+
+int load_libopencl(error *e) {  /* resolve the OpenCL entry points once; returns GA_NO_ERROR or an error code */
+  void *lib;
+  /* A previous call already resolved everything: nothing to do. */
+  if (loaded)
+    return GA_NO_ERROR;
+  /* libname is the platform-specific library name selected at compile time. */
+  lib = ga_load_library(libname, e);
+  if (lib == NULL)
+    return e->code;
+  /* Each DEF_PROC line expands to a ga_func_ptr lookup that returns e->code when it yields NULL. */
+  #include "libopencl.fn"
+  /* Set only after every lookup succeeded, so a failed attempt is retried on the next call. */
+  loaded = 1;
+  return GA_NO_ERROR;
+}
diff --git a/src/loaders/libopencl.fn b/src/loaders/libopencl.fn
new file mode 100644
index 0000000000..c86d3b02d6
--- /dev/null
+++ b/src/loaders/libopencl.fn
@@ -0,0 +1,32 @@
+DEF_PROC(cl_context, clCreateContext, (const cl_context_properties *, cl_uint, const cl_device_id *, void (CL_CALLBACK *)(const char *, const void *, size_t, void *), void *, cl_int *));
+DEF_PROC(cl_int, clCompileProgram, (cl_program, cl_uint, const cl_device_id *, const char *, cl_uint, cl_program *, const char **,  void (CL_CALLBACK *)(cl_program, void *), void *));
+DEF_PROC(cl_program, clLinkProgram, (cl_context, cl_uint, const cl_device_id *, const char *, cl_uint, const cl_program *, void (CL_CALLBACK *)(cl_program, void *), void *, cl_int *));
+DEF_PROC(cl_mem, clCreateBuffer, (cl_context, cl_mem_flags, size_t, void *, cl_int *));
+DEF_PROC(cl_command_queue, clCreateCommandQueue, (cl_context, cl_device_id, cl_command_queue_properties, cl_int *));
+DEF_PROC(cl_kernel, clCreateKernel, (cl_program, const char *, cl_int *));
+DEF_PROC(cl_program, clCreateProgramWithBinary, (cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *));
+DEF_PROC(cl_program, clCreateProgramWithSource, (cl_context, cl_uint, const char **, const size_t *, cl_int *));
+DEF_PROC(cl_int, clEnqueueReadBuffer, (cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *));
+DEF_PROC(cl_int, clEnqueueWriteBuffer, (cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *));
+DEF_PROC(cl_int, clEnqueueCopyBuffer, (cl_command_queue, cl_mem, cl_mem, size_t, size_t, size_t, cl_uint, const cl_event *, cl_event *));
+DEF_PROC(cl_int, clEnqueueNDRangeKernel, (cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *));
+DEF_PROC(cl_int, clGetContextInfo, (cl_context, cl_context_info, size_t, void *, size_t *));
+DEF_PROC(cl_int, clGetDeviceIDs, (cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *));
+DEF_PROC(cl_int, clGetDeviceInfo, (cl_device_id, cl_device_info, size_t, void *, size_t *));
+DEF_PROC(cl_int, clGetKernelInfo, (cl_kernel, cl_kernel_info, size_t, void *, size_t *));
+DEF_PROC(cl_int, clGetKernelWorkGroupInfo, (cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *));
+DEF_PROC(cl_int, clGetMemObjectInfo, (cl_mem, cl_mem_info, size_t, void *, size_t *));
+DEF_PROC(cl_int, clGetPlatformIDs, (cl_uint, cl_platform_id *, cl_uint *));
+DEF_PROC(cl_int, clGetProgramBuildInfo, (cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *));
+DEF_PROC(cl_int, clGetProgramInfo, (cl_program, cl_program_info, size_t, void *, size_t *));
+DEF_PROC(cl_int, clReleaseCommandQueue, (cl_command_queue));
+DEF_PROC(cl_int, clReleaseContext, (cl_context));
+DEF_PROC(cl_int, clReleaseEvent, (cl_event));
+DEF_PROC(cl_int, clReleaseKernel, (cl_kernel));
+DEF_PROC(cl_int, clReleaseMemObject, (cl_mem));
+DEF_PROC(cl_int, clReleaseProgram, (cl_program));
+DEF_PROC(cl_int, clRetainContext, (cl_context));
+DEF_PROC(cl_int, clRetainEvent, (cl_event));
+DEF_PROC(cl_int, clRetainMemObject, (cl_mem));
+DEF_PROC(cl_int, clSetKernelArg, (cl_kernel, cl_uint, size_t, const void *));
+DEF_PROC(cl_int, clWaitForEvents, (cl_uint, const cl_event *));
\ No newline at end of file
diff --git a/src/loaders/libopencl.h b/src/loaders/libopencl.h
new file mode 100644
index 0000000000..9ed6f513d8
--- /dev/null
+++ b/src/loaders/libopencl.h
@@ -0,0 +1,337 @@
+#ifndef LOADER_LIBOPENCL_H
+#define LOADER_LIBOPENCL_H
+
+#include "util/error.h"
+
+/** @cond NEVER */
+
+#if defined(_WIN32)
+    #define CL_API_CALL     __stdcall
+    #define CL_CALLBACK     __stdcall
+#else
+    #define CL_API_CALL
+    #define CL_CALLBACK
+#endif
+
+typedef struct _cl_platform_id *    cl_platform_id;
+typedef struct _cl_device_id *      cl_device_id;
+typedef struct _cl_context *        cl_context;
+typedef struct _cl_command_queue *  cl_command_queue;
+typedef struct _cl_mem *            cl_mem;
+typedef struct _cl_program *        cl_program;
+typedef struct _cl_kernel *         cl_kernel;
+typedef struct _cl_event *          cl_event;
+
+#if (defined (_WIN32) && defined(_MSC_VER))
+typedef signed __int32 cl_int;
+typedef unsigned __int32 cl_uint;
+typedef signed __int64 cl_long;
+typedef unsigned __int64 cl_ulong;
+
+typedef unsigned __int16 cl_half;
+typedef float cl_float;
+typedef double cl_double;
+#else
+#include <stdint.h>
+typedef int32_t cl_int __attribute__((aligned(4)));
+typedef uint32_t cl_uint __attribute__((aligned(4)));
+typedef int64_t cl_long __attribute__((aligned(8)));
+typedef uint64_t cl_ulong __attribute__((aligned(8)));
+
+typedef uint16_t cl_half __attribute__((aligned(2)));
+typedef float cl_float __attribute__((aligned(4)));
+typedef double cl_double __attribute__((aligned(8)));
+#endif
+
+typedef cl_uint cl_bool;
+typedef cl_ulong cl_bitfield;
+typedef cl_uint cl_device_info;
+typedef cl_bitfield cl_device_type;
+typedef cl_bitfield cl_command_queue_properties;
+
+typedef intptr_t cl_context_properties;
+typedef cl_uint cl_context_info;
+typedef cl_uint cl_mem_info;
+typedef cl_bitfield cl_mem_flags;
+typedef cl_uint cl_program_info;
+typedef cl_uint cl_program_build_info;
+typedef cl_uint cl_kernel_info;
+typedef cl_uint cl_kernel_work_group_info;
+
+/** @endcond */
+
+int load_libopencl(error *);
+
+/** @cond NEVER */
+
+#define DEF_PROC(ret, name, args) typedef ret CL_API_CALL t##name args
+
+#include "libopencl.fn"
+
+#undef DEF_PROC
+
+#define DEF_PROC(ret, name, args) extern t##name *name
+
+#include "libopencl.fn"
+
+#undef DEF_PROC
+
+/* What follows is a bunch of defines from the official OpenCL spec.
+ * This allows us to build even if there is no OpenCL implementation
+ * present. */
+
+/* Error codes */
+#define CL_SUCCESS                                   0
+#define CL_DEVICE_NOT_FOUND                         -1
+#define CL_DEVICE_NOT_AVAILABLE                     -2
+#define CL_COMPILER_NOT_AVAILABLE                   -3
+#define CL_MEM_OBJECT_ALLOCATION_FAILURE            -4
+#define CL_OUT_OF_RESOURCES                         -5
+#define CL_OUT_OF_HOST_MEMORY                       -6
+#define CL_PROFILING_INFO_NOT_AVAILABLE             -7
+#define CL_MEM_COPY_OVERLAP                         -8
+#define CL_IMAGE_FORMAT_MISMATCH                    -9
+#define CL_IMAGE_FORMAT_NOT_SUPPORTED               -10
+#define CL_BUILD_PROGRAM_FAILURE                    -11
+#define CL_MAP_FAILURE                              -12
+#define CL_MISALIGNED_SUB_BUFFER_OFFSET             -13
+#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
+#define CL_COMPILE_PROGRAM_FAILURE                  -15
+#define CL_LINKER_NOT_AVAILABLE                     -16
+#define CL_LINK_PROGRAM_FAILURE                     -17
+#define CL_DEVICE_PARTITION_FAILED                  -18
+#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE            -19
+
+#define CL_INVALID_VALUE                            -30
+#define CL_INVALID_DEVICE_TYPE                      -31
+#define CL_INVALID_PLATFORM                         -32
+#define CL_INVALID_DEVICE                           -33
+#define CL_INVALID_CONTEXT                          -34
+#define CL_INVALID_QUEUE_PROPERTIES                 -35
+#define CL_INVALID_COMMAND_QUEUE                    -36
+#define CL_INVALID_HOST_PTR                         -37
+#define CL_INVALID_MEM_OBJECT                       -38
+#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR          -39
+#define CL_INVALID_IMAGE_SIZE                       -40
+#define CL_INVALID_SAMPLER                          -41
+#define CL_INVALID_BINARY                           -42
+#define CL_INVALID_BUILD_OPTIONS                    -43
+#define CL_INVALID_PROGRAM                          -44
+#define CL_INVALID_PROGRAM_EXECUTABLE               -45
+#define CL_INVALID_KERNEL_NAME                      -46
+#define CL_INVALID_KERNEL_DEFINITION                -47
+#define CL_INVALID_KERNEL                           -48
+#define CL_INVALID_ARG_INDEX                        -49
+#define CL_INVALID_ARG_VALUE                        -50
+#define CL_INVALID_ARG_SIZE                         -51
+#define CL_INVALID_KERNEL_ARGS                      -52
+#define CL_INVALID_WORK_DIMENSION                   -53
+#define CL_INVALID_WORK_GROUP_SIZE                  -54
+#define CL_INVALID_WORK_ITEM_SIZE                   -55
+#define CL_INVALID_GLOBAL_OFFSET                    -56
+#define CL_INVALID_EVENT_WAIT_LIST                  -57
+#define CL_INVALID_EVENT                            -58
+#define CL_INVALID_OPERATION                        -59
+#define CL_INVALID_GL_OBJECT                        -60
+#define CL_INVALID_BUFFER_SIZE                      -61
+#define CL_INVALID_MIP_LEVEL                        -62
+#define CL_INVALID_GLOBAL_WORK_SIZE                 -63
+#define CL_INVALID_PROPERTY                         -64
+#define CL_INVALID_IMAGE_DESCRIPTOR                 -65
+#define CL_INVALID_COMPILER_OPTIONS                 -66
+#define CL_INVALID_LINKER_OPTIONS                   -67
+#define CL_INVALID_DEVICE_PARTITION_COUNT           -68
+#define CL_INVALID_PIPE_SIZE                        -69
+#define CL_INVALID_DEVICE_QUEUE                     -70
+
+#define CL_FALSE                                    0
+#define CL_TRUE                                     1
+
+/* cl_device_type - bitfield */
+#define CL_DEVICE_TYPE_DEFAULT                      (1 << 0)
+#define CL_DEVICE_TYPE_CPU                          (1 << 1)
+#define CL_DEVICE_TYPE_GPU                          (1 << 2)
+#define CL_DEVICE_TYPE_ACCELERATOR                  (1 << 3)
+#define CL_DEVICE_TYPE_CUSTOM                       (1 << 4)
+#define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF
+
+/* cl_context_info  */
+#define CL_CONTEXT_REFERENCE_COUNT                  0x1080
+#define CL_CONTEXT_DEVICES                          0x1081
+#define CL_CONTEXT_PROPERTIES                       0x1082
+#define CL_CONTEXT_NUM_DEVICES                      0x1083
+
+/* cl_device_info */
+#define CL_DEVICE_TYPE                                   0x1000
+#define CL_DEVICE_VENDOR_ID                              0x1001
+#define CL_DEVICE_MAX_COMPUTE_UNITS                      0x1002
+#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS               0x1003
+#define CL_DEVICE_MAX_WORK_GROUP_SIZE                    0x1004
+#define CL_DEVICE_MAX_WORK_ITEM_SIZES                    0x1005
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR            0x1006
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT           0x1007
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT             0x1008
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG            0x1009
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT           0x100A
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE          0x100B
+#define CL_DEVICE_MAX_CLOCK_FREQUENCY                    0x100C
+#define CL_DEVICE_ADDRESS_BITS                           0x100D
+#define CL_DEVICE_MAX_READ_IMAGE_ARGS                    0x100E
+#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS                   0x100F
+#define CL_DEVICE_MAX_MEM_ALLOC_SIZE                     0x1010
+#define CL_DEVICE_IMAGE2D_MAX_WIDTH                      0x1011
+#define CL_DEVICE_IMAGE2D_MAX_HEIGHT                     0x1012
+#define CL_DEVICE_IMAGE3D_MAX_WIDTH                      0x1013
+#define CL_DEVICE_IMAGE3D_MAX_HEIGHT                     0x1014
+#define CL_DEVICE_IMAGE3D_MAX_DEPTH                      0x1015
+#define CL_DEVICE_IMAGE_SUPPORT                          0x1016
+#define CL_DEVICE_MAX_PARAMETER_SIZE                     0x1017
+#define CL_DEVICE_MAX_SAMPLERS                           0x1018
+#define CL_DEVICE_MEM_BASE_ADDR_ALIGN                    0x1019
+#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE               0x101A
+#define CL_DEVICE_SINGLE_FP_CONFIG                       0x101B
+#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE                  0x101C
+#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE              0x101D
+#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE                  0x101E
+#define CL_DEVICE_GLOBAL_MEM_SIZE                        0x101F
+#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE               0x1020
+#define CL_DEVICE_MAX_CONSTANT_ARGS                      0x1021
+#define CL_DEVICE_LOCAL_MEM_TYPE                         0x1022
+#define CL_DEVICE_LOCAL_MEM_SIZE                         0x1023
+#define CL_DEVICE_ERROR_CORRECTION_SUPPORT               0x1024
+#define CL_DEVICE_PROFILING_TIMER_RESOLUTION             0x1025
+#define CL_DEVICE_ENDIAN_LITTLE                          0x1026
+#define CL_DEVICE_AVAILABLE                              0x1027
+#define CL_DEVICE_COMPILER_AVAILABLE                     0x1028
+#define CL_DEVICE_EXECUTION_CAPABILITIES                 0x1029
+#define CL_DEVICE_QUEUE_PROPERTIES                       0x102A    /* deprecated */
+#define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES               0x102A
+#define CL_DEVICE_NAME                                   0x102B
+#define CL_DEVICE_VENDOR                                 0x102C
+#define CL_DRIVER_VERSION                                0x102D
+#define CL_DEVICE_PROFILE                                0x102E
+#define CL_DEVICE_VERSION                                0x102F
+#define CL_DEVICE_EXTENSIONS                             0x1030
+#define CL_DEVICE_PLATFORM                               0x1031
+#define CL_DEVICE_DOUBLE_FP_CONFIG                       0x1032
+#define CL_DEVICE_HALF_FP_CONFIG                         0x1033
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF            0x1034
+#define CL_DEVICE_HOST_UNIFIED_MEMORY                    0x1035   /* deprecated */
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR               0x1036
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT              0x1037
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT                0x1038
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG               0x1039
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT              0x103A
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE             0x103B
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF               0x103C
+#define CL_DEVICE_OPENCL_C_VERSION                       0x103D
+#define CL_DEVICE_LINKER_AVAILABLE                       0x103E
+#define CL_DEVICE_BUILT_IN_KERNELS                       0x103F
+#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE                  0x1040
+#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE                   0x1041
+#define CL_DEVICE_PARENT_DEVICE                          0x1042
+#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES              0x1043
+#define CL_DEVICE_PARTITION_PROPERTIES                   0x1044
+#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN              0x1045
+#define CL_DEVICE_PARTITION_TYPE                         0x1046
+#define CL_DEVICE_REFERENCE_COUNT                        0x1047
+#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC            0x1048
+#define CL_DEVICE_PRINTF_BUFFER_SIZE                     0x1049
+#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT                  0x104A
+#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT           0x104B
+#define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS              0x104C
+#define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE               0x104D
+#define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES             0x104E
+#define CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE         0x104F
+#define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE               0x1050
+#define CL_DEVICE_MAX_ON_DEVICE_QUEUES                   0x1051
+#define CL_DEVICE_MAX_ON_DEVICE_EVENTS                   0x1052
+#define CL_DEVICE_SVM_CAPABILITIES                       0x1053
+#define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE   0x1054
+#define CL_DEVICE_MAX_PIPE_ARGS                          0x1055
+#define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS           0x1056
+#define CL_DEVICE_PIPE_MAX_PACKET_SIZE                   0x1057
+#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT    0x1058
+#define CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT      0x1059
+#define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT       0x105A
+#define CL_DEVICE_IL_VERSION                             0x105B
+#define CL_DEVICE_MAX_NUM_SUB_GROUPS                     0x105C
+#define CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS 0x105D
+
+/* cl_command_queue_properties - bitfield */
+#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE      (1 << 0)
+#define CL_QUEUE_PROFILING_ENABLE                   (1 << 1)
+#define CL_QUEUE_ON_DEVICE                          (1 << 2)
+#define CL_QUEUE_ON_DEVICE_DEFAULT                  (1 << 3)
+
+/* cl_mem_info */
+#define CL_MEM_TYPE                                 0x1100
+#define CL_MEM_FLAGS                                0x1101
+#define CL_MEM_SIZE                                 0x1102
+#define CL_MEM_HOST_PTR                             0x1103
+#define CL_MEM_MAP_COUNT                            0x1104
+#define CL_MEM_REFERENCE_COUNT                      0x1105
+#define CL_MEM_CONTEXT                              0x1106
+#define CL_MEM_ASSOCIATED_MEMOBJECT                 0x1107
+#define CL_MEM_OFFSET                               0x1108
+#define CL_MEM_USES_SVM_POINTER                     0x1109
+
+/* cl_context_properties */
+#define CL_CONTEXT_PLATFORM                         0x1084
+#define CL_CONTEXT_INTEROP_USER_SYNC                0x1085
+
+/* cl_mem_flags and cl_svm_mem_flags - bitfield */
+#define CL_MEM_READ_WRITE                           (1 << 0)
+#define CL_MEM_WRITE_ONLY                           (1 << 1)
+#define CL_MEM_READ_ONLY                            (1 << 2)
+#define CL_MEM_USE_HOST_PTR                         (1 << 3)
+#define CL_MEM_ALLOC_HOST_PTR                       (1 << 4)
+#define CL_MEM_COPY_HOST_PTR                        (1 << 5)
+/* reserved                                         (1 << 6)    */
+#define CL_MEM_HOST_WRITE_ONLY                      (1 << 7)
+#define CL_MEM_HOST_READ_ONLY                       (1 << 8)
+#define CL_MEM_HOST_NO_ACCESS                       (1 << 9)
+#define CL_MEM_SVM_FINE_GRAIN_BUFFER                (1 << 10)   /* used by cl_svm_mem_flags only */
+#define CL_MEM_SVM_ATOMICS                          (1 << 11)   /* used by cl_svm_mem_flags only */
+#define CL_MEM_KERNEL_READ_AND_WRITE                (1 << 12)
+
+/* cl_program_build_info */
+#define CL_PROGRAM_BUILD_STATUS                     0x1181
+#define CL_PROGRAM_BUILD_OPTIONS                    0x1182
+#define CL_PROGRAM_BUILD_LOG                        0x1183
+#define CL_PROGRAM_BINARY_TYPE                      0x1184
+#define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185
+
+/* cl_kernel_info */
+#define CL_KERNEL_FUNCTION_NAME                     0x1190
+#define CL_KERNEL_NUM_ARGS                          0x1191
+#define CL_KERNEL_REFERENCE_COUNT                   0x1192
+#define CL_KERNEL_CONTEXT                           0x1193
+#define CL_KERNEL_PROGRAM                           0x1194
+#define CL_KERNEL_ATTRIBUTES                        0x1195
+#define CL_KERNEL_MAX_NUM_SUB_GROUPS                0x11B9
+#define CL_KERNEL_COMPILE_NUM_SUB_GROUPS            0x11BA
+
+/* cl_program_info */
+#define CL_PROGRAM_REFERENCE_COUNT                  0x1160
+#define CL_PROGRAM_CONTEXT                          0x1161
+#define CL_PROGRAM_NUM_DEVICES                      0x1162
+#define CL_PROGRAM_DEVICES                          0x1163
+#define CL_PROGRAM_SOURCE                           0x1164
+#define CL_PROGRAM_BINARY_SIZES                     0x1165
+#define CL_PROGRAM_BINARIES                         0x1166
+#define CL_PROGRAM_NUM_KERNELS                      0x1167
+#define CL_PROGRAM_KERNEL_NAMES                     0x1168
+#define CL_PROGRAM_IL                              0x1169
+
+/* cl_kernel_work_group_info */
+#define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0
+#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE           0x11B1
+#define CL_KERNEL_LOCAL_MEM_SIZE                    0x11B2
+#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
+#define CL_KERNEL_PRIVATE_MEM_SIZE                  0x11B4
+#define CL_KERNEL_GLOBAL_WORK_SIZE                  0x11B5
+
+/** @endcond */
+
+#endif
diff --git a/src/private.h b/src/private.h
index 064dc7a04e..caf87d8c65 100644
--- a/src/private.h
+++ b/src/private.h
@@ -17,6 +17,7 @@
 #include <gpuarray/buffer_collectives.h>
 
 #include "util/strb.h"
+#include "util/error.h"
 #include "cache.h"
 
 #ifdef __cplusplus
@@ -26,9 +27,9 @@ extern "C" {
 }
 #endif
 
-#define ADDR32_MAX   4294967295
-#define SADDR32_MIN -2147483648
-#define SADDR32_MAX  2147483647
+#define ADDR32_MAX   4294967295L        /* 2^32 - 1: largest unsigned 32-bit address */
+#define SADDR32_MIN (-2147483647L - 1)  /* -2^31; "-2147483648L" negates a literal that overflows a 32-bit long */
+#define SADDR32_MAX  2147483647L        /* 2^31 - 1: largest signed 32-bit offset */
 
 struct _gpuarray_buffer_ops;
 typedef struct _gpuarray_buffer_ops gpuarray_buffer_ops;
@@ -44,7 +45,7 @@ typedef struct _gpuarray_comm_ops gpuarray_comm_ops;
   const gpuarray_blas_ops *blas_ops;            \
   const gpuarray_comm_ops *comm_ops;            \
   void *blas_handle;                            \
-  const char* error_msg;                        \
+  error *err;                                   \
   unsigned int refcnt;                          \
   int flags;                                    \
   struct _gpudata *errbuf;                      \
@@ -52,10 +53,23 @@ typedef struct _gpuarray_comm_ops gpuarray_comm_ops;
   char bin_id[64];                              \
   char tag[8]
 
+/* These will go away eventually but are kept to ease the transition for now */
+#define GA_CTX_SINGLE_STREAM 0x01
+#define GA_CTX_MULTI_THREAD  0x02
+
+struct _gpucontext_props {
+  int dev;
+  int sched;
+  int flags;
+  const char *kernel_cache_path;
+  size_t max_cache_size;
+  size_t initial_cache_size;
+};
+
 struct _gpucontext {
   GPUCONTEXT_HEAD;
   void *ctx_ptr;
-  void *private[7];
+  void *private[11];
 };
 
 /* The real gpudata struct is likely bigger but we only care about the
@@ -76,31 +90,28 @@ typedef struct _partial_gpucomm {
 struct _gpuarray_buffer_ops {
   int (*get_platform_count)(unsigned int* platcount);
   int (*get_device_count)(unsigned int platform, unsigned int* devcount);
-  gpucontext *(*buffer_init)(int dev, int flags, int *ret);
+  gpucontext *(*buffer_init)(gpucontext_props *props);
   void (*buffer_deinit)(gpucontext *ctx);
-  gpudata *(*buffer_alloc)(gpucontext *ctx, size_t sz, void *data, int flags,
-                           int *ret);
+  gpudata *(*buffer_alloc)(gpucontext *ctx, size_t sz, void *data, int flags);
   void (*buffer_retain)(gpudata *b);
   void (*buffer_release)(gpudata *b);
-  int (*buffer_share)(gpudata *a, gpudata *b, int *ret);
+  int (*buffer_share)(gpudata *a, gpudata *b);
   int (*buffer_move)(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff,
                      size_t sz);
   int (*buffer_read)(void *dst, gpudata *src, size_t srcoff, size_t sz);
   int (*buffer_write)(gpudata *dst, size_t dstoff, const void *src, size_t sz);
   int (*buffer_memset)(gpudata *dst, size_t dstoff, int data);
-  gpukernel *(*kernel_alloc)(gpucontext *ctx, unsigned int count,
-                             const char **strings, const size_t *lengths,
-                             const char *fname, unsigned int numargs,
-                             const int *typecodes, int flags, int *ret,
-                             char **err_str);
+  int (*kernel_alloc)(gpukernel **k, gpucontext *ctx, unsigned int count,
+                      const char **strings, const size_t *lengths,
+                      const char *fname, unsigned int numargs,
+                      const int *typecodes, int flags, char **err_str);
   void (*kernel_retain)(gpukernel *k);
   void (*kernel_release)(gpukernel *k);
   int (*kernel_setarg)(gpukernel *k, unsigned int i, void *a);
   int (*kernel_call)(gpukernel *k, unsigned int n,
-                     const size_t *bs, const size_t *gs,
+                     const size_t *gs, const size_t *ls,
                      size_t shared, void **args);
 
-  int (*kernel_binary)(gpukernel *k, size_t *sz, void **obj);
   int (*buffer_sync)(gpudata *b);
   int (*buffer_transfer)(gpudata *dst, size_t dstoff,
                          gpudata *src, size_t srcoff, size_t sz);
@@ -112,7 +123,19 @@ struct _gpuarray_buffer_ops {
 struct _gpuarray_blas_ops {
   int (*setup)(gpucontext *ctx);
   void (*teardown)(gpucontext *ctx);
-  const char *(*error)(gpucontext *ctx);
+
+  int (*hdot)( size_t N,
+    gpudata *X, size_t offX, size_t incX,
+    gpudata *Y, size_t offY, size_t incY,
+    gpudata *Z, size_t offZ);
+  int (*sdot)( size_t N,
+    gpudata *X, size_t offX, size_t incX,
+    gpudata *Y, size_t offY, size_t incY,
+    gpudata *Z, size_t offZ);
+  int (*ddot)( size_t N,
+    gpudata *X, size_t offX, size_t incX,
+    gpudata *Y, size_t offY, size_t incY,
+    gpudata *Z, size_t offZ);
   int (*hgemv)(cb_order order, cb_transpose transA, size_t M, size_t N,
                float alpha, gpudata *A, size_t offA, size_t lda,
                gpudata *X, size_t offX, int incX, float beta,
@@ -203,6 +226,24 @@ struct _gpuarray_blas_ops {
                    gpudata **y, size_t *offY, size_t incY,
                    gpudata **A, size_t *offA, size_t lda,
                    size_t batchCount, int flags);
+  int (*hgemm3D)(cb_order order, cb_transpose transA, cb_transpose transB,
+                 size_t M, size_t N, size_t K, float alpha,
+                 gpudata *A, size_t offA, size_t lda, ssize_t strideA,
+                 gpudata *B, size_t offB, size_t ldb, ssize_t strideB,
+                 float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC,
+                 size_t batchCount);
+  int (*sgemm3D)(cb_order order, cb_transpose transA, cb_transpose transB,
+                 size_t M, size_t N, size_t K, float alpha,
+                 gpudata *A, size_t offA, size_t lda, ssize_t strideA,
+                 gpudata *B, size_t offB, size_t ldb, ssize_t strideB,
+                 float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC,
+                 size_t batchCount);
+  int (*dgemm3D)(cb_order order, cb_transpose transA, cb_transpose transB,
+                 size_t M, size_t N, size_t K, double alpha,
+                 gpudata *A, size_t offA, size_t lda, ssize_t strideA,
+                 gpudata *B, size_t offB, size_t ldb, ssize_t strideB,
+                 double beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC,
+                 size_t batchCount);
 };
 
 struct _gpuarray_comm_ops {
@@ -243,26 +284,90 @@ static inline void *memdup(const void *p, size_t s) {
   return res;
 }
 
-GPUARRAY_LOCAL int GpuArray_is_c_contiguous(const GpuArray *a);
-GPUARRAY_LOCAL int GpuArray_is_f_contiguous(const GpuArray *a);
-GPUARRAY_LOCAL int GpuArray_is_aligned(const GpuArray *a);
+int GpuArray_is_c_contiguous(const GpuArray *a);
+int GpuArray_is_f_contiguous(const GpuArray *a);
+int GpuArray_is_aligned(const GpuArray *a);
 
-GPUARRAY_LOCAL extern const gpuarray_type scalar_types[];
-GPUARRAY_LOCAL extern const gpuarray_type vector_types[];
+extern const gpuarray_type scalar_types[];
+extern const gpuarray_type vector_types[];
 
 /*
  * This function generates the kernel code to perform indexing on var id
  * from planar index 'i' using the dimensions and strides provided.
  */
-GPUARRAY_LOCAL void gpuarray_elem_perdim(strb *sb, unsigned int nd,
-                                         const size_t *dims,
-                                         const ssize_t *str,
-                                         const char *id);
-
-GPUARRAY_LOCAL void gpukernel_source_with_line_numbers(unsigned int count,
-                                                       const char **news,
-                                                       size_t *newl,
-                                                       strb *src);
+void gpuarray_elem_perdim(strb *sb, unsigned int nd,
+                          const size_t *dims,
+                          const ssize_t *str,
+                          const char *id);
+
+void gpukernel_source_with_line_numbers(unsigned int count,
+                                        const char **news,
+                                        size_t *newl,
+                                        strb *src);
+/* Convert a float to a 16-bit half-precision bit pattern; selection is done with all-ones/zero masks instead of branches. */
+static inline uint16_t float_to_half(float value) {
+#define ga__shift 13
+#define ga__shiftSign 16
+
+#define ga__infN 0x7F800000  // flt32 infinity
+#define ga__maxN 0x477FE000 // max flt16 normal as a flt32
+#define ga__minN 0x38800000 // min flt16 normal as a flt32
+#define ga__signN 0x80000000 // flt32 sign bit
+
+#define ga__infC (ga__infN >> ga__shift)
+#define ga__nanN ((ga__infC + 1) << ga__shift) // minimum flt16 nan as a flt32
+#define ga__maxC (ga__maxN >> ga__shift)
+#define ga__minC (ga__minN >> ga__shift)
+
+#define ga__mulN 0x52000000 // (1 << 23) / minN
+
+#define ga__subC 0x003FF // max flt32 subnormal down shifted
+
+#define ga__maxD (ga__infC - ga__maxC - 1)
+#define ga__minD (ga__minC - ga__subC - 1)
+
+  union {
+    float f;
+    int32_t si;
+    uint32_t ui;
+  } v, s; // the same 32 bits viewed as float, signed or unsigned integer
+
+  uint32_t sign;
+
+  v.f = value;
+  sign = v.si & ga__signN; // isolate the sign bit ...
+  v.si ^= sign; // ... and clear it so the rest works on the magnitude
+  sign >>= ga__shiftSign; // logical shift
+  s.si = ga__mulN;
+  s.si = (int32_t)(s.f * v.f); // correct subnormals
+  v.si ^= (s.si ^ v.si) & -(ga__minN > v.si); // below ga__minN: replace v by the rescaled value in s
+  v.si ^= (ga__infN ^ v.si) & -((ga__infN > v.si) & (v.si > ga__maxN)); // between ga__maxN and ga__infN: becomes ga__infN
+  v.si ^= (ga__nanN ^ v.si) & -((ga__nanN > v.si) & (v.si > ga__infN)); // between ga__infN and ga__nanN: becomes ga__nanN
+  v.ui >>= ga__shift; // logical shift
+  v.si ^= ((v.si - ga__maxD) ^ v.si) & -(v.si > ga__maxC); // above ga__maxC: subtract ga__maxD
+  v.si ^= ((v.si - ga__minD) ^ v.si) & -(v.si > ga__subC); // above ga__subC: subtract ga__minD
+  return (uint16_t)(v.ui | sign); // re-attach the sign in bit 15
+
+#undef ga__shift
+#undef ga__shiftSign
+
+#undef ga__infN
+#undef ga__maxN
+#undef ga__minN
+#undef ga__signN
+
+#undef ga__infC
+#undef ga__nanN
+#undef ga__maxC
+#undef ga__minC
+
+#undef ga__mulN
+
+#undef ga__subC
+
+#undef ga__maxD
+#undef ga__minD
+}
 
 #define ISSET(v, fl) ((v) & (fl))
 #define ISCLR(v, fl) (!((v) & (fl)))
diff --git a/src/private_config.h.in b/src/private_config.h.in
index 23db862c4f..ff03831203 100644
--- a/src/private_config.h.in
+++ b/src/private_config.h.in
@@ -22,11 +22,7 @@ extern "C" {
 #ifdef _MSC_VER
 /* God damn Microsoft ... */
 #define snprintf _snprintf
-#endif
-
-#ifdef _MSC_VER
-/* MS VC++ 2008 does not support inline */
-#define inline __inline
+#define strdup _strdup
 #define alloca _alloca
 #endif
 
@@ -39,12 +35,12 @@ extern "C" {
 #define nelems(a) (sizeof(a)/sizeof(a[0]))
 
 #ifndef HAVE_MKSTEMP
-GPUARRAY_LOCAL int mkstemp(char *path);
+int mkstemp(char *path);
 #endif
 
 #ifndef HAVE_STRL
-GPUARRAY_LOCAL size_t strlcpy(char *dst, const char *src, size_t size);
-GPUARRAY_LOCAL size_t strlcat(char *dst, const char *src, size_t size);
+size_t strlcpy(char *dst, const char *src, size_t size);
+size_t strlcat(char *dst, const char *src, size_t size);
 #endif
 
 #ifdef __cplusplus
diff --git a/src/private_cuda.h b/src/private_cuda.h
index 642a9991a4..3e428e79f2 100644
--- a/src/private_cuda.h
+++ b/src/private_cuda.h
@@ -1,11 +1,7 @@
 #ifndef _PRIVATE_CUDA_H
 #define _PRIVATE_CUDA_H
 
-#ifdef __APPLE__
-#include <CUDA/cuda.h>
-#else
-#include <cuda.h>
-#endif
+#include "loaders/libcuda.h"
 
 #include <cache.h>
 
@@ -46,6 +42,13 @@
 /* Keep in sync with the copy in gpuarray/extension.h */
 #define DONTFREE 0x10000000
 
+/* Record a CUDA driver failure in e as "<msg>: <name>: <description>" with code GA_IMPL_ERROR; returns error_fmt's result. */
+static inline int error_cuda(error *e, const char *msg, CUresult err) {
+  const char *name = NULL, *descr = NULL; /* the driver stores NULL for codes it does not recognise */
+  if (cuGetErrorName(err, &name) != CUDA_SUCCESS || name == NULL) name = "unknown error";
+  if (cuGetErrorString(err, &descr) != CUDA_SUCCESS || descr == NULL) descr = "no description available";
+  return error_fmt(e, GA_IMPL_ERROR, "%s: %s: %s", msg, name, descr); /* never hands NULL to %s */
+}
 #define GA_CUDA_EXIT_ON_ERROR(ctx, cmd) \
   do {                                  \
     int err = (cmd);                    \
@@ -55,28 +58,34 @@
     }                                   \
   } while (0)
 
-#define CUDA_EXIT_ON_ERROR(ctx, cmd)  \
-  do {                                \
-    (ctx)->err = (cmd);               \
-    if ((ctx)->err != CUDA_SUCCESS) { \
-      cuda_exit((ctx));               \
-      return GA_IMPL_ERROR;           \
-    }                                 \
+#define CUDA_EXIT_ON_ERROR(ctx, cmd)            \
+  do {                                          \
+    CUresult err = (cmd);                       \
+    if (err != CUDA_SUCCESS) {                  \
+      cuda_exit((ctx));                         \
+      return error_cuda((ctx)->err, #cmd, err); \
+    }                                           \
   } while (0)
 
 typedef struct _cuda_context {
   GPUCONTEXT_HEAD;
   CUcontext ctx;
-  CUresult err;
   CUstream s;
   CUstream mem_s;
   gpudata *freeblocks;
+  size_t cache_size;
+  size_t max_cache_size;
   cache *kernel_cache;
+  cache *disk_cache; // This is per-context to avoid lock contention
   unsigned int enter;
+  unsigned char major;
+  unsigned char minor;
 } cuda_context;
 
+/** @cond NEVER */
 STATIC_ASSERT(sizeof(cuda_context) <= sizeof(gpucontext),
               sizeof_struct_gpucontext_cuda);
+/** @endcond */
 
 /*
  * About freeblocks.
@@ -93,16 +102,12 @@ STATIC_ASSERT(sizeof(cuda_context) <= sizeof(gpucontext),
  * flag.
  */
 
-#ifdef WITH_NVRTC
 #define ARCH_PREFIX "compute_"
-#else
-#define ARCH_PREFIX "sm_"
-#endif
 
-GPUARRAY_LOCAL cuda_context *cuda_make_ctx(CUcontext ctx, int flags);
-GPUARRAY_LOCAL CUstream cuda_get_stream(cuda_context *ctx);
-GPUARRAY_LOCAL void cuda_enter(cuda_context *ctx);
-GPUARRAY_LOCAL void cuda_exit(cuda_context *ctx);
+cuda_context *cuda_make_ctx(CUcontext ctx, gpucontext_props *p);
+CUstream cuda_get_stream(cuda_context *ctx);
+void cuda_enter(cuda_context *ctx);
+void cuda_exit(cuda_context *ctx);
 
 struct _gpudata {
   CUdeviceptr ptr;
@@ -121,12 +126,10 @@ struct _gpudata {
 #endif
 };
 
-GPUARRAY_LOCAL gpudata *cuda_make_buf(cuda_context *c, CUdeviceptr p,
-                                      size_t sz);
-GPUARRAY_LOCAL CUdeviceptr cuda_get_ptr(gpudata *g);
-GPUARRAY_LOCAL size_t cuda_get_sz(gpudata *g);
-GPUARRAY_LOCAL int cuda_wait(gpudata *, int);
-GPUARRAY_LOCAL int cuda_record(gpudata *, int);
+gpudata *cuda_make_buf(cuda_context *c, CUdeviceptr p, size_t sz);
+size_t cuda_get_sz(gpudata *g);
+int cuda_wait(gpudata *, int);
+int cuda_record(gpudata *, int);
 
 /* private flags are in the upper 16 bits */
 #define CUDA_WAIT_READ  0x10000
@@ -135,8 +138,9 @@ GPUARRAY_LOCAL int cuda_record(gpudata *, int);
 
 #define CUDA_WAIT_ALL   (CUDA_WAIT_READ|CUDA_WAIT_WRITE)
 
-#define CUDA_HEAD_ALLOC 0x40000
-#define CUDA_MAPPED_PTR 0x80000
+#define CUDA_IPC_MEMORY 0x100000
+#define CUDA_HEAD_ALLOC 0x200000
+#define CUDA_MAPPED_PTR 0x400000
 
 struct _gpukernel {
   cuda_context *ctx; /* Keep the context first */
@@ -153,4 +157,6 @@ struct _gpukernel {
 #endif
 };
 
+int get_cc(CUdevice dev, int *maj, int *min, error *e);
+
 #endif
diff --git a/src/private_opencl.h b/src/private_opencl.h
index 34ae92906d..53888dc001 100644
--- a/src/private_opencl.h
+++ b/src/private_opencl.h
@@ -3,12 +3,9 @@
 
 #include "private.h"
 
-#ifdef __APPLE__
-#include <OpenCL/opencl.h>
-#else
-#include <CL/opencl.h>
-#endif
+#include "loaders/libopencl.h"
 
+/** @cond NEVER */
 #ifdef DEBUG
 #include <assert.h>
 
@@ -33,17 +30,53 @@
 #define ASSERT_KER(k)
 #define CLEAR(o)
 #endif
+/** @endcond */
+
+const char *cl_error_string(cl_int);
+/* Record an OpenCL failure in e as "<msg>: <cl_error_string(err)>" with code GA_IMPL_ERROR; returns error_fmt's result. */
+static inline int error_cl(error *e, const char *msg, cl_int err) {
+  return error_fmt(e, GA_IMPL_ERROR, "%s: %s", msg, cl_error_string(err));
+}
+
+#define CL_CHECK(e, cmd) do {                   \
+    cl_int err = (cmd);                         \
+    if (err != CL_SUCCESS)                      \
+      return error_cl(e, #cmd, err);            \
+  } while(0)
+
+#define CL_CHECKN(e, cmd) do {                  \
+    cl_int err = (cmd);                         \
+    if (err != CL_SUCCESS) {                    \
+      error_cl(e, #cmd, err);                   \
+      return NULL;                              \
+    }                                           \
+  } while(0)
+
+#define CL_GET_PROP(e, fn, obj, prop, val) do {     \
+    size_t sz;                                      \
+    cl_int err;                                     \
+    CL_CHECK(e, fn (obj, prop, 0, NULL, &sz));      \
+    val = malloc(sz);                               \
+    if (val == NULL) return error_sys(e, "malloc"); \
+    err = fn (obj, prop, sz, val, NULL);            \
+    if (err != CL_SUCCESS) {                        \
+      free(val);                                    \
+      val = NULL;                                   \
+      return error_cl(e, #fn, err);                 \
+    }                                               \
+  } while(0)
 
 typedef struct _cl_ctx {
   GPUCONTEXT_HEAD;
   cl_context ctx;
   cl_command_queue q;
   char *exts;
-  char *preamble;
-  cl_int err;
+  char *options;
 } cl_ctx;
 
+/** @cond NEVER */
 STATIC_ASSERT(sizeof(cl_ctx) <= sizeof(gpucontext), sizeof_struct_gpucontext_cl);
+/** @endcond */
 
 struct _gpudata {
   cl_mem buf;
@@ -71,9 +104,9 @@ struct _gpukernel {
 #endif
 };
 
-GPUARRAY_LOCAL cl_ctx *cl_make_ctx(cl_context ctx, int flags);
-GPUARRAY_LOCAL cl_command_queue cl_get_stream(gpucontext *ctx);
-GPUARRAY_LOCAL gpudata *cl_make_buf(gpucontext *c, cl_mem buf);
-GPUARRAY_LOCAL cl_mem cl_get_buf(gpudata *g);
+cl_ctx *cl_make_ctx(cl_context ctx, gpucontext_props *p);
+cl_command_queue cl_get_stream(gpucontext *ctx);
+gpudata *cl_make_buf(gpucontext *c, cl_mem buf);
+cl_mem cl_get_buf(gpudata *g);
 
 #endif
diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt
index 7ae772cb67..de4987538f 100644
--- a/src/util/CMakeLists.txt
+++ b/src/util/CMakeLists.txt
@@ -1,4 +1,7 @@
 set_rel(UTIL_SRC
 strb.c
+error.c
 xxhash.c
+integerfactoring.c
+skein.c
 )
diff --git a/src/util/error.c b/src/util/error.c
new file mode 100644
index 0000000000..b523eccf6d
--- /dev/null
+++ b/src/util/error.c
@@ -0,0 +1,43 @@
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "private_config.h"
+#include "util/error.h"
+
+static error _global_err = {{0}, 0};
+error *global_err = &_global_err;
+
+int error_alloc(error **_e) {
+  error *e;
+  e = calloc(sizeof(error), 1);
+  if (e == NULL) return -1;
+  *_e = e;
+  return 0;
+}
+/* Release an error record; e is handed straight to free(). */
+void error_free(error *e) {
+  free(e);
+}
+/* Store code and a copy of msg (bounded by ERROR_MSGBUF_LEN) in e, echo both to stderr under DEBUG, and return code. */
+int error_set(error *e, int code, const char *msg) {
+  e->code = code;
+  strlcpy(e->msg, msg, ERROR_MSGBUF_LEN);
+#ifdef DEBUG
+  fprintf(stderr, "(Debug) ERROR %d: %s\n", e->code, e->msg);
+#endif
+  return code;
+}
+/* printf-style counterpart of error_set: formats fmt/... into e->msg, stores code in e and returns code. */
+int error_fmt(error *e, int code, const char *fmt, ...) {
+  va_list ap;
+  e->code = code;
+  va_start(ap, fmt);
+  vsnprintf(e->msg, ERROR_MSGBUF_LEN, fmt, ap);
+  va_end(ap);
+  e->msg[ERROR_MSGBUF_LEN - 1] = '\0'; /* pre-C99 vsnprintf (older MSVC) leaves a truncated message unterminated */
+#ifdef DEBUG
+  fprintf(stderr, "(Debug) ERROR %d: %s\n", e->code, e->msg);
+#endif
+  return code;
+}
diff --git a/src/util/error.h b/src/util/error.h
new file mode 100644
index 0000000000..fc1ecb1663
--- /dev/null
+++ b/src/util/error.h
@@ -0,0 +1,28 @@
+#ifndef UTIL_ERROR_H
+#define UTIL_ERROR_H
+
+#include <errno.h>
+#include <string.h>
+
+#include <gpuarray/error.h>
+
+/* 1024 - 4 for the int that goes after */
+#define ERROR_MSGBUF_LEN 1020
+
+typedef struct _error {
+  char msg[ERROR_MSGBUF_LEN];
+  int code;
+} error;
+
+int error_alloc(error **e);
+void error_free(error *e);
+int error_set(error *e, int code, const char *msg);
+int error_fmt(error *e, int code, const char *fmt, ...);
+
+extern error *global_err;
+/* Record GA_SYS_ERROR in e as "<msg>: <strerror(errno)>"; errno is read at call time. Returns error_fmt's result. */
+static inline int error_sys(error *e, const char *msg) {
+  return error_fmt(e, GA_SYS_ERROR, "%s: %s", msg, strerror(errno));
+}
+
+#endif
diff --git a/src/util/integerfactoring.c b/src/util/integerfactoring.c
new file mode 100644
index 0000000000..c398cdc211
--- /dev/null
+++ b/src/util/integerfactoring.c
@@ -0,0 +1,1599 @@
+/* Includes */
+#include <stdio.h>
+#include <stdlib.h>
+#include <gpuarray/config.h>
+#include <string.h>
+#include "integerfactoring.h"
+
+
+/* Detect when to avoid VLAs. */
+#if defined(_MSC_VER) || defined(__STDC_NO_VLA__)
+#define GA_USING_MALLOC_FOR_VLA 1
+#endif
+
+
+/* Defines */
+#define GA_IS_COMPOSITE      0
+#define GA_IS_PRIME          1
+#define GA_IS_PROBABLY_PRIME 2
+
+
+/**
+ * Static Function Prototypes
+ */
+
+/**
+ * @brief Count trailing zeros of a 64-bit integer.
+ *
+ * @param [in] n  The integer whose trailing zero count is to be computed.
+ * @return     If n != 0, returns trailing zero count; Else returns 64.
+ */
+
+static int      gaICtz(uint64_t n);
+
+/**
+ * @brief Count leading zeros of a 64-bit integer.
+ *
+ * @param [in] n  The integer whose leading zero count is to be computed.
+ * @return     If n != 0, returns leading zero count; Else returns 64.
+ */
+
+static int      gaIClz(uint64_t n);
+
+/**
+ * @brief Integer Modular Addition.
+ *
+ * Computes
+ *
+ *     $$a+b \pmod m$$
+ *
+ * efficiently for 64-bit unsigned integers a, b, m.
+ */
+
+static uint64_t gaIAddMod    (uint64_t a, uint64_t b, uint64_t m);
+
+/**
+ * @brief Integer Modular Subtraction.
+ *
+ * Computes
+ *
+ *     $$a-b \pmod m$$
+ *
+ * efficiently for 64-bit unsigned integers a, b, m.
+ */
+
+static uint64_t gaISubMod    (uint64_t a, uint64_t b, uint64_t m);
+
+/**
+ * @brief Integer Modular Average.
+ *
+ * Computes
+ *
+ *     $$\frac{a+b}{2} \pmod m$$
+ *
+ * efficiently for 64-bit unsigned integers a, b, m.
+ */
+
+static uint64_t gaIAvgMod    (uint64_t a, uint64_t b, uint64_t m);
+
+/**
+ * @brief Integer Modular Multiplication.
+ *
+ * Computes
+ *
+ *     $$a*b \pmod m$$
+ *
+ * efficiently for 64-bit unsigned integers a, b, m.
+ */
+
+static uint64_t gaIMulMod    (uint64_t a, uint64_t b, uint64_t m);
+
+/**
+ * @brief Integer Modular Exponentiation.
+ *
+ * Computes
+ *
+ *     $$x^a \pmod m$$
+ *
+ * efficiently for 64-bit unsigned integers x, a, m.
+ */
+
+static uint64_t gaIPowMod    (uint64_t x, uint64_t a, uint64_t m);
+
+/**
+ * @brief Jacobi Symbol
+ *
+ * Computes the Jacobi symbol, notated
+ *
+ *     $$(a/n)$$
+ *
+ * efficiently for 64-bit unsigned integers a, n.
+ */
+
+static int      gaIJacobiSymbol(uint64_t a, uint64_t n);
+
+/**
+ * @brief Strong Fermat base-a probable prime test.
+ *
+ * @param [in] n  An odd integer >= 3.
+ * @param [in] a  A witness integer > 0.
+ * @return Non-zero if n is a strong probable prime to base a and zero if n is
+ *         composite.
+ */
+
+static int      gaIIsPrimeStrongFermat(uint64_t n, uint64_t a);
+
+/**
+ * @brief Strong Lucas probable prime test.
+ *
+ * The function uses Selfridge's Method A for selecting D,P,Q.
+ *
+ * @param [in] n  An odd integer >= 3.
+ * @return Non-zero if n is a strong probable prime and zero if n is composite.
+ */
+
+static int      gaIIsPrimeStrongLucas(uint64_t n);
+
+/**
+ * @brief Round up positive n to next 2-, 3- or 5-smooth number and report its
+ *        factorization.
+ */
+
+static int      gaIFactorize2Smooth(uint64_t n, ga_factor_list* fl);
+static int      gaIFactorize3Smooth(uint64_t n, ga_factor_list* fl);
+static int      gaIFactorize5Smooth(uint64_t n, ga_factor_list* fl);
+
+/**
+ * @brief Satisfy individual product limits on "from" by moving factors to
+ *        corresponding "to" list.
+ */
+
+static void     gaIFLScheduleSatisfyInd(const int       n,
+                                        ga_factor_list* from,
+                                        ga_factor_list* to,
+                                        const uint64_t* maxInd);
+
+/**
+ * @brief Satisfy global product limit on "from" by moving factors to
+ *        corresponding "to" list.
+ */
+
+static void     gaIFLScheduleSatisfyTot(const int       n,
+                                        ga_factor_list* from,
+                                        ga_factor_list* to,
+                                        const uint64_t  maxTot);
+
+/**
+ * @brief Optimize "to" by moving factors from "from", under both individual
+ *        and global limits.
+ */
+
+static void     gaIFLScheduleOpt(const int       n,
+                                 ga_factor_list* from,
+                                 ga_factor_list* to,
+                                 const uint64_t  maxTot,
+                                 const uint64_t* maxInd);
+
+/**
+ * @brief Schedule block/grid/chunk size, integer version, n checked >= 0.
+ */
+
+static void     gaIScheduleChecked(const int       n,
+                                   const uint64_t  maxBtot,
+                                   const uint64_t* maxBind,
+                                   const uint64_t  maxGtot,
+                                   const uint64_t* maxGind,
+                                   uint64_t*       bs,
+                                   uint64_t*       gs,
+                                   uint64_t*       cs);
+
+
+
+/**
+ * Function Definitions
+ */
+
+static int      gaICtz       (uint64_t n){
+#if __GNUC__ >= 4
+	/* Zero is answered by hand rather than handed to the builtin. */
+	if(n == 0){return 64;}
+	return __builtin_ctzll(n);
+#else
+	int count = 0;
+	/* Walk up from bit 0 until a set bit is found or all 64 bits are spent. */
+	while(count < 64 && !((n>>count) & 1)){count++;}
+
+	return count;
+#endif
+}
+
+static int      gaIClz       (uint64_t n){
+#if __GNUC__ >= 4
+	/* Zero is answered by hand rather than handed to the builtin. */
+	if(n == 0){return 64;}
+	return __builtin_clzll(n);
+#else
+	int count = 0;
+	/* Walk down from bit 63 until a set bit is found or all 64 bits are spent. */
+	while(count < 64 && !((n>>(63-count)) & 1)){count++;}
+
+	return count;
+#endif
+}
+
+static uint64_t gaIAddMod    (uint64_t a, uint64_t b, uint64_t m){
+	uint64_t sum;
+
+	a %= m;
+	b %= m;
+	sum = a+b;               /* may wrap past 2^64; subtracting m below wraps it back */
+
+	/* The true sum stays below m exactly when b < m-a. */
+	return (b < m-a) ? sum : sum-m;
+}
+
+static uint64_t gaISubMod    (uint64_t a, uint64_t b, uint64_t m){
+	uint64_t diff;
+
+	a %= m;
+	b %= m;
+	diff = a-b;              /* wraps modulo 2^64 when a < b */
+
+	/* Adding m undoes the wrap and lands the result in [0, m). */
+	return (a < b) ? diff+m : diff;
+}
+
+static uint64_t gaIAvgMod    (uint64_t a, uint64_t b, uint64_t m){
+	const uint64_t sum  = gaIAddMod(a,b,m);
+	const uint64_t half = sum>>1;
+
+	/* Even sum: halving is exact. */
+	if((sum&1) == 0){return half;}
+	/* Odd sum: add floor(m/2), plus one more when m is odd as well. */
+	return half+(m>>1)+(sum&m&1);
+}
+/* Returns (a*b) mod m, forming the product at 128-bit width so it cannot overflow; m must be non-zero. */
+static uint64_t gaIMulMod    (uint64_t a, uint64_t b, uint64_t m){
+#if (__GNUC__ >= 4) && defined(__x86_64__) && !defined(__STRICT_ANSI__)
+	uint64_t r;
+	a %= m;               /* Reduce first: `div` raises #DE when the quotient */
+	b %= m;               /* (a*b)/m does not fit in 64 bits.                 */
+	asm(
+	    "mul %2\n\t"
+	    "div %3\n\t"
+	    : "=&d"(r), "+a"(a)   /* Outputs */
+	    : "r"(b),  "r"(m)     /* Inputs */
+	    : "cc"
+	);
+	return r;
+#elif ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) && defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ >= 16
+	/* GCC 4.6+ with a native 128-bit integer: let the compiler do the wide multiply. */
+	return ((unsigned __int128)a * (unsigned __int128)b) % m;
+#else
+	const uint64_t TWOPOW32 = (uint64_t)1<<32;
+	int i;
+
+	a %= m;
+	b %= m;
+
+	if(m <= TWOPOW32){
+		/**
+		 * Fast path: When performing modulo arithmetic on values <= 2^32,
+		 * (a*b) % m gives the correct answer.
+		 */
+
+		return (a*b) % m;
+	}else{
+		/**
+		 * Slow path: Have to simulate 128-bit arithmetic long division.
+		 */
+
+		uint64_t ah   = a>>32;
+		uint64_t al   = (uint32_t)a;
+		uint64_t bh   = b>>32;
+		uint64_t bl   = (uint32_t)b;
+
+		uint64_t ahbh = ah*bh;
+		uint64_t ahbl = ah*bl;
+		uint64_t albh = al*bh;
+		uint64_t albl = al*bl;
+
+		uint64_t md   = ahbl+albh;
+
+		uint64_t lo   = albl + (md<<32);
+		uint64_t hi   = ahbh + (md>>32);
+
+		/* Propagate carry-outs from `md` and `lo` into `hi` */
+		if(lo < albl){hi++;}
+		if(md < ahbl){hi+=TWOPOW32;}
+
+		/**
+		 * Begin 128-bit-by-64-bit remainder.
+		 *
+		 * 1) Cut down `hi` mod `m`. This implements the first few iterations
+		 *    of a shift-and-subtract loop, leaving only 64 iterations to go.
+		 * 2) Iterate 64 times:
+		 *     2.1) Shift left [hi:lo] by 1 bit, into [newHi:newLo].
+		 *     2.2) If:
+		 *         2.2.1) newHi < hi, then there was an overflow into bit 128.
+		 *                The value [1:newHi:newLo] is definitely larger than
+		 *                m, so we subtract. This situation can only occur if
+		 *                m > 2^63.
+		 *         2.2.2) newHi >= m, then we must subtract m out of newHi in
+		 *                order to bring back newHi within the range [0, m).
+		 * 3) The modulo is in hi.
+		 */
+
+		hi %= m;
+		for(i=0;i<64;i++){
+			uint64_t newLo = (lo<<1);
+			uint64_t newHi = (hi<<1) + (newLo<lo);
+
+			if(newHi < hi || newHi >= m){newHi -= m;}
+
+			hi = newHi;
+			lo = newLo;
+		}
+
+		return hi;
+	}
+#endif
+}
+
+/**
+ * Modular exponentiation: x^a mod m.
+ *
+ * Returns 0 for m <= 1. Otherwise x is reduced modulo m and the result is
+ * computed by square-and-multiply, with shortcuts for trivial exponents and
+ * bases.
+ */
+
+static uint64_t gaIPowMod    (uint64_t x, uint64_t a, uint64_t m){
+	uint64_t base, acc;
+
+	/* A modulus of 0 is meaningless; modulo 1 everything is 0. */
+	if(m <= 1){
+		return 0;
+	}
+
+	base = x % m;
+
+	/* Anything to the power 0 is 1 (checked before the base shortcuts). */
+	if(a == 0){
+		return 1;
+	}
+
+	/* Bases 0 and 1 are fixed points; exponent 1 is the identity. */
+	if(base <= 1 || a == 1){
+		return base;
+	}
+
+	/* A square needs exactly one modular multiplication. */
+	if(a == 2){
+		return gaIMulMod(base, base, m);
+	}
+
+	/* General case: right-to-left binary exponentiation. */
+	for(acc = 1; a != 0; a >>= 1){
+		if(a & 1){
+			acc = gaIMulMod(acc, base, m);
+		}
+
+		base = gaIMulMod(base, base, m);
+	}
+
+	return acc;
+}
+
+/**
+ * Jacobi symbol (a/n).
+ *
+ * Iterative formulation: at every step the power of two is stripped from a,
+ * the sign contribution of that step is folded into a running product, and
+ * the pair (a, n) is replaced by (n mod a', a') where a' is the odd part of a.
+ *
+ * n must be non-zero. Returns the accumulated sign (-1, 0 or +1).
+ */
+
+static int      gaIJacobiSymbol(uint64_t a, uint64_t n){
+	int      sign = 1, step;
+	uint64_t twos, odd, nMod8;
+
+	for(;;){
+		a %= n;
+
+		/* Terminal cases: (1/n) = (a/1) = 1, and (0/n) = 0. */
+		if(a == 1 || n == 1){
+			return sign;
+		}
+		if(a == 0){
+			return 0;
+		}
+
+		twos  = gaICtz(a);
+		odd   = a >> twos;
+		nMod8 = n % 8;
+
+		/* Contribution of the factors of two stripped from a. */
+		if(twos%2 == 0 || nMod8 == 1 || nMod8 == 7){
+			step =  1;
+		}else if(nMod8 == 3 || nMod8 == 5){
+			step = -1;
+		}else{
+			step =  0;
+		}
+
+		/* Reciprocity flips the sign when both are 3 mod 4. */
+		if(n%4 == 3 && odd%4 == 3){
+			step = -step;
+		}
+
+		sign *= step;
+
+		/* Flip the symbol: continue with (n mod odd / odd). */
+		a = n % odd;
+		n = odd;
+	}
+}
+
+/**
+ * Strong Fermat probable-prime test of n to the base (witness) a.
+ *
+ * Returns GA_IS_COMPOSITE when the witness proves n composite, and
+ * GA_IS_PROBABLY_PRIME when the test is inconclusive. n must be non-zero.
+ */
+
+static int      gaIIsPrimeStrongFermat(uint64_t n, uint64_t a){
+	/**
+	 * The Fermat strong probable prime test the Miller-Rabin test relies upon
+	 * uses integer "witnesses" in an attempt at proving the number composite.
+	 * Should it fail to prove an integer composite, it reports the number as
+	 * "probably prime". However, if the witnesses are chosen carefully, the
+	 * Miller-Rabin test can be made deterministic below a chosen threshold.
+	 *
+	 * One can use the primes 2 to 37 in order to ensure the correctness of the
+	 * identifications for integers under 2^64.
+	 *
+	 * Jim Sinclair has found that the seven witnesses
+	 *     2, 325, 9375, 28178, 450775, 9780504, 1795265022
+	 * also deterministically classify all integers <2^64.
+	 *
+	 *
+	 * The Fermat strong probable prime test states that, for integers
+	 *             n = d*2^s+1,  d odd, s integer >= 0
+	 *             a             integer (chosen witness)
+	 * n is a Fermat strong probable prime if
+	 *     a^(d    ) =  1 mod n       or
+	 *     a^(d*2^r) = -1 mod n       for any integer r, 0 <= r < s.
+	 *
+	 *
+	 * The justification for this comes from Fermat's Little Theorem: If n is
+	 * prime and a is any integer, then the following always holds:
+	 *           a^n =  a mod n
+	 * If n is prime and a is coprime to n, then the following always holds:
+	 *       a^(n-1) =  1 mod n
+	 *
+	 *
+	 * In effect, the logic goes
+	 *
+	 *   A:   The number  n  is prime.                               (Statement)
+	 *   B:   The number  n  does not divide a.                      (Statement)
+	 *   C:   a^(  n-1)       =  1 mod n                             (Statement)
+	 *   D:   The commutative ring Z/nZ is a finite field.           (Statement)
+	 *   E:   Finite fields are unique factorization domains.        (Statement)
+	 *   F:   x^2 = 1 mod n factorizes as (x+1)(x-1) = 0 mod n.      (Statement)
+	 *   G:   x^2 mod n only has the trivial square roots 1 and -1   (Statement)
+	 *   H:   The number  n  is odd and >= 3.                        (Statement)
+	 *   I:   The number n-1 equals d*2^s, with d,s int > 0, d odd.  (Statement)
+	 *   J:   a^(    d)       =   1 mod n                            (Statement)
+	 *   K:   a^(d*2^r)       =  -1 mod n   for some 0 <= r < s.     (Statement)
+	 *   L:   a^(d*2^(r+1))   =   1 mod n   for some 0 <= r < s.     (Statement)
+	 *   M:   a^(d*2^r)      != +-1 mod n   AND                      (Statement)
+	 *        a^(d*2^(r+1))   =   1 mod n   for some 0 <= r < s.
+	 *
+	 *   A&B           -->  C                 (Proposition:     Fermat's Little Theorem)
+	 *   !C            -->  !(A&B) = !A|!B    (Contrapositive:  Fermat's Little Theorem)
+	 *   A             <->  D                 (Proposition)
+	 *   E                                    (Proposition:     By definition)
+	 *   F                                    (Proposition:     x^2-x+x-1 = x^2-1 mod n)
+	 *   D&E&F         -->  G                 (Proposition:     (x+1)(x-1) is the only
+	 *                                                           factorization)
+	 *   !G            -->  !D|!E|!F          (Contrapositive:  See above)
+	 *   H&I&J         -->  C                 (Proposition:     Squaring  1 gives 1)
+	 *   H&I&K         -->  L                 (Proposition:     Squaring -1 gives 1)
+	 *   H&I&L         -->  C                 (Proposition:     1, squared or not, gives 1)
+	 *   H&I&K         -->  C                 (Hypothetical Syllogism)
+	 *   H&I&(J|K)     -->  C                 (Union)
+	 *   H&I&!(J|K)    -->  M|!C              (Proposition:     Either squaring
+	 *                                                            a^(d*2^(s-1)) != +-1 mod n
+	 *                                                          gives a 1, in which case
+	 *                                                          M holds, or it does not
+	 *                                                          give 1 and therefore
+	 *                                                            a^(n-1) != 1 mod n)
+	 *                                                          and thus !C holds.
+	 *   H&I&!(J|K)    -->  H&I&M | !A | !B   (Absorption, Hypothetical Syllogism)
+	 *   H&I&M         -->  !G                (Proposition:     x^2 = 1 mod n but x!=+1,
+	 *                                                          so x^2 - 1 has roots
+	 *                                                          other than +-1)
+	 *   H&I&M         -->  !D|!E|!F          (Modus Tollens)
+	 *   H&I&M         -->  !D                (Disjunctive Syllogism)
+	 *   H&I&M         -->  !A                (Biconditional)
+	 *   H&I&!(J|K)    -->  !A | !A | !B      (Hypothetical Syllogism)
+	 *   H&I&!(J|K)&B  -->  !A | !A           (Absorption)
+	 *   H&I&!(J|K)&B  -->  !A | !A           (Disjunctive Syllogism)
+	 *   H&I&!(J|K)&B  -->  !A                (Disjunctive Simplification)
+	 *                           ***** Conclusions: *****
+	 *                            H&I&M         -->  !A
+	 *                            H&I&!(J|K)&B  -->  !A
+	 *
+	 * Broadly speaking, what the above tells us is:
+	 *   - We can't prove n prime (A), but we can prove it composite (!A).
+	 *   - Either H&I&M or H&I&!(J|K)&B prove compositeness.
+	 *   - If H&I&(J|K) for any r, then we've proven C true. If we prove C true,
+	 *     we can't use the contrapositive of Fermat's Little Theorem, so no
+	 *     conclusions about the truth-value of A can be made. The test is
+	 *     inconclusive. Thus this function returns "probably prime".
+	 */
+
+	uint64_t d, x;
+	int64_t  s, r;
+
+	/* A witness that is a multiple of n (statement B false) tells us nothing. */
+	a %= n;
+	if(a==0){
+		return GA_IS_PROBABLY_PRIME;
+	}
+
+	/* Decompose n-1 = d*2^s with d odd, then compute x = a^d mod n. */
+	s  = gaICtz(n-1);
+	d  = (n-1) >> s;
+	x  = gaIPowMod(a,d,n);
+
+	/* Statement J (x == 1), or statement K with r == 0 (x == -1 mod n). */
+	if(x==1 || x==n-1){
+		return GA_IS_PROBABLY_PRIME;
+	}
+
+	/* Square repeatedly, checking statement K for r = 1 .. s-1. */
+	for(r=0;r<s-1;r++){
+		x = gaIMulMod(x,x,n);
+		if(x==1){
+			/* Reached 1 from a value that was not +-1: statement M holds. */
+			return GA_IS_COMPOSITE;
+		}else if(x == n-1){
+			return GA_IS_PROBABLY_PRIME;
+		}
+	}
+
+	/* Neither J nor K held for any r. */
+	return GA_IS_COMPOSITE;
+}
+
+/**
+ * Lucas probable-prime test of n, following the numbered steps of
+ * FIPS 186-4 C.3.3 quoted in the body.
+ *
+ * Returns GA_IS_COMPOSITE or GA_IS_PROBABLY_PRIME. The step comments below
+ * assume n has already passed the base-2 strong Fermat test (the perfect-
+ * square shortcut in step 1 and the no-overflow note in step 3 rely on it).
+ */
+
+static int      gaIIsPrimeStrongLucas(uint64_t n){
+	uint64_t Dp, Dm, D, K, U, Ut, V, Vt;
+	int      J, r, i;
+
+	/**
+	 * FIPS 186-4 C.3.3 (General) Lucas Probabilistic Primality Test
+	 *
+	 * 1. Test if n is perfect square. If so, return "composite".
+	 *
+	 *     NOTE: The only strong base-2 Fermat pseudoprime squares are
+	 *           1194649 and 12327121;
+	 */
+
+	if(n==1194649 || n==12327121){
+		return GA_IS_COMPOSITE;
+	}
+
+	/**
+	 * 2. Find first D in sequence 5,-7,9,-11,... s.t. Jacobi symbol (D/n) < 1.
+	 *     Iff Jacobi symbol is 0, return "composite".
+	 *
+	 *     Dp walks the positive candidates (5, 9, 13, ...) and Dm the negative
+	 *     ones (-7, -11, -15, ...), both kept reduced modulo n.
+	 */
+
+	Dp = gaIAddMod(0, 5, n);
+	Dm = gaISubMod(0, 7, n);
+	while(1){
+		J = gaIJacobiSymbol(Dp, n);
+		if     (J ==  0){return GA_IS_COMPOSITE;}
+		else if(J == -1){D = Dp;break;}
+
+		J = gaIJacobiSymbol(Dm, n);
+		if     (J ==  0){return GA_IS_COMPOSITE;}
+		else if(J == -1){D = Dm;break;}
+
+		Dp = gaIAddMod(Dp, 4, n);
+		Dm = gaISubMod(Dm, 4, n);
+	}
+
+	/**
+	 * 3. K = n+1
+	 *
+	 *     NOTE: Cannot overflow, since 2^64-1 is eliminated by strong Fermat
+	 *           base-2 test.
+	 */
+
+	K = n+1;
+
+	/**
+	 * 4. Let Kr, Kr-1, ..., K0 be the binary expansion of K, with Kr = 1.
+	 */
+
+	r = 63-gaIClz(K);
+
+	/**
+	 * 5. Set Ur = 1 and Vr = 1.
+	 */
+
+	U = V = 1;
+
+	/**
+	 * 6. For i=r-1 to 0, do
+	 *
+	 *     Each iteration doubles the index (Ut, Vt), and then advances it by
+	 *     one more when bit i of K is set. All arithmetic is modulo n.
+	 */
+
+	for(i=r-1;i>=0;i--){
+		Ut = gaIMulMod(U,V,n);
+		Vt = gaIAvgMod(gaIMulMod(V,V,n), gaIMulMod(D,gaIMulMod(U,U,n),n), n);
+		if((K>>i)&1){
+			U = gaIAvgMod(Ut,Vt,n);
+			V = gaIAvgMod(Vt,gaIMulMod(D,Ut,n),n);
+		}else{
+			U = Ut;
+			V = Vt;
+		}
+	}
+
+	/**
+	 * 7. If U0==0, then return "probably prime". Otherwise, return "composite".
+	 */
+
+	return U==0 ? GA_IS_PROBABLY_PRIME : GA_IS_COMPOSITE;
+}
+
+/**
+ * Primality check for a 64-bit unsigned integer.
+ *
+ * Stages, cheapest first: the special case 2, rejection of even numbers, a
+ * lookup table for n < 256, trial division by the odd primes up to 79, and
+ * finally the Baillie-Pomerance-Selfridge-Wagstaff combination of a base-2
+ * strong Fermat test followed by a Lucas test.
+ *
+ * Returns non-zero if n is prime and zero otherwise.
+ */
+
+int      gaIIsPrime   (uint64_t n){
+	/* Primality of every integer in [0, 256): 'y' if prime, 'n' if not. */
+	static const char          below256[] =
+	    "nnyynynynnnynynnnynynnnynnnnnyny"
+	    "nnnnnynnnynynnnynnnnnynnnnnynynn"
+	    "nnnynnnynynnnnnynnnynnnnnynnnnnn"
+	    "nynnnynynnnynynnnynnnnnnnnnnnnny"
+	    "nnnynnnnnynynnnnnnnnnynynnnnnynn"
+	    "nnnynnnynnnnnynnnnnynynnnnnnnnny"
+	    "nynnnynynnnnnnnnnnnynnnnnnnnnnny"
+	    "nnnynynnnynnnnnynynnnnnnnnnynnnn";
+	/* Odd primes used for trial division once n >= 256. */
+	static const unsigned char smallPrimes[] = {
+	     3,  5,  7, 11, 13, 17, 19, 23, 29, 31, 37,
+	    41, 43, 47, 53, 59, 61, 67, 71, 73, 79
+	};
+	unsigned i;
+
+	/* 2 is the only even prime; every other even number is composite. */
+	if(n == 2){
+		return GA_IS_PRIME;
+	}
+	if(!(n & 1)){
+		return GA_IS_COMPOSITE;
+	}
+
+	/* Small integers are answered straight from the table. */
+	if(n < 256){
+		return below256[n] == 'y';
+	}
+
+	/* Cheap rejection of anything with a small prime factor. */
+	for(i=0;i<sizeof(smallPrimes)/sizeof(smallPrimes[0]);i++){
+		if(n % smallPrimes[i] == 0){
+			return GA_IS_COMPOSITE;
+		}
+	}
+
+	/**
+	 * BPSW: a base-2 Fermat strong probable prime that is also a Lucas
+	 * probable prime (Selfridge's Method A for selecting D) is declared
+	 * prime. The Lucas test only runs if the Fermat test did not already
+	 * prove n composite.
+	 */
+
+	if(!gaIIsPrimeStrongFermat(n, 2)){
+		return 0;
+	}
+
+	return gaIIsPrimeStrongLucas(n) != 0;
+}
+
+/**
+ * Factorize n, or the nearest acceptable integer at or above n, into fl.
+ *
+ * n     Integer to factorize.
+ * maxN  Largest product accepted. 0 requests an exact factorization of n;
+ *       (uint64_t)-1 means "no upper limit"; any other value below n is
+ *       rejected.
+ * k     Smoothness bound on the factors. 0 or any k >= n disables the
+ *       constraint; k == 1 is rejected.
+ * fl    Output factor list; must be non-NULL. It is NOT written when the
+ *       arguments are rejected.
+ *
+ * Returns 1 if a factorization was stored in fl, 0 otherwise.
+ */
+
+int      gaIFactorize (uint64_t n, uint64_t maxN, uint64_t k, ga_factor_list* fl){
+	int      infiniteSlack,  finiteSlack,   greaterThanMaxN,
+	         exactFactoring, noKSmoothness, kSmoothness;
+	uint64_t i, x, newX, p, f, c;
+
+
+	/**
+	 * Insane argument handling.
+	 */
+
+	if(!fl || (k == 1) || (maxN > 0 && maxN < n)){
+		return 0;
+	}
+
+
+	/**
+	 * Handle special cases of n = 0,1,2.
+	 */
+
+	if(n<=2){
+		gaIFLInit(fl);
+		gaIFLAddFactors(fl, n, 1);
+		return 1;
+	}
+
+
+	/**
+	 * Magic-value arguments interpreted and canonicalized.
+	 */
+
+	exactFactoring  = (maxN == (uint64_t) 0);
+	infiniteSlack   = (maxN == (uint64_t)-1);
+	noKSmoothness   = (k    == 0) || (k >= n);
+	finiteSlack     = !infiniteSlack;
+	kSmoothness     = !noKSmoothness;
+	maxN            = exactFactoring ? n : maxN;
+	k               = noKSmoothness  ? n :    k;
+
+
+	/**
+	 * Try optimal k-smooth optimizers.
+	 */
+
+	if     (k <= 2){gaIFactorize2Smooth(n, fl);}
+	else if(k <= 4){gaIFactorize3Smooth(n, fl);}
+	else           {gaIFactorize5Smooth(n, fl);}
+	greaterThanMaxN = finiteSlack && (gaIFLIsOverflowed(fl)       ||
+	                                  gaIFLGetProduct  (fl) > maxN);
+	if(greaterThanMaxN){
+		if(kSmoothness && k<=6){
+			/**
+			 * We've *proven* there exists no k-smooth n <= maxN, k <= 6.
+			 * No use wasting more time here.
+			 */
+
+			return 0;
+		}
+
+		/* Otherwise fall-through to factorizer. */
+	}else{
+		/**
+		 * Either the slack was infinite, or the product did not overflow and
+		 * was <= maxN. The k-smoothness criterion is guaranteed by the
+		 * factorizer we chose earlier.
+		 *
+		 * Therefore we have a satisfactory, optimal 2-, 3- or 5-smooth
+		 * factorization (although not necessarily an exact one unless it is
+		 * the case that maxN == n). We return it.
+		 */
+
+		return 1;
+	}
+
+
+	/**
+	 * Master loop.
+	 *
+	 * We arrive here with finite slack and all optimal 2-, 3- and 5-smooth
+	 * factorizers unable to produce a factorization whose product is less
+	 * than or equal to maxN.
+	 */
+
+	for(i=n; i <= maxN; i++){
+		/**
+		 * Do not manipulate the loop index!
+		 * Initial subfactor to cut down is x=i.
+		 */
+
+		x = i;
+		gaIFLInit(fl);
+
+		/**
+		 * Subfactorization always begins with an attempt at an initial
+		 * cut-down by factors of 2. Should this result in a 1 (which isn't
+		 * technically prime, but indicates a complete factorization), we
+		 * report success.
+		 *
+		 * Note that trial division restarts from f=3 every time we come
+		 * back to this label.
+		 */
+
+		subfactorize:
+		gaIFLAddFactors(fl, 2, gaICtz(x));
+		x >>= gaICtz(x);
+		f = 3;
+
+		/**
+		 * Primality test.
+		 *
+		 * If the remaining factor x is a prime number, it's decision time. One
+		 * of two things is true:
+		 *
+		 *  1) We have a smoothness constraint k and x is <= than it, or we
+		 *     don't have a smoothness constraint at all (k==n). Both cases are
+		 *     covered by checking x<=k.
+		 *
+		 *     In this case we add x as the last factor to the factor list and
+		 *     return affirmatively.
+		 *
+		 *  2) We have a smoothness constraint and x>k.
+		 *
+		 *     In this case we have to inc/decrement x and begin anew the
+		 *     sub-factorization. This may cause us to fail out of factorizing
+		 *     the current i, by exceeding our slack limit. If this happens we
+		 *     abort the factorization rooted at i and move to the next i.
+		 */
+
+		primetest:
+		if(x==1 || gaIIsPrime(x)){
+			if(x <= k){
+				gaIFLAddFactors(fl, x, 1);
+				return 1;
+			}else{
+				/* p is the product of the factors gathered so far; newX is
+				   n/p rounded up, the least cofactor with p*newX >= n. */
+				p     = gaIFLGetProduct(fl);
+				newX  = n/p;
+				newX += newX*p < n;
+				if(newX < x){
+					x = newX;
+					goto subfactorize;
+				}else if((maxN - p*x) < p){/* Overflow-free check maxN >= p*(x+1) */
+					goto nextI;
+				}else{
+					x++;
+					goto subfactorize;
+				}
+			}
+		}
+
+		/**
+		 * Composite number handler.
+		 *
+		 * We continue by trying to cut down x by factors of 3+. Should a trial
+		 * division by a factor f succeed, all powers of f are factored out of
+		 * x and once f no longer divides x evenly, a new primality test is
+		 * run. The primality test will be invoked at most 15 times from this loop.
+		 */
+
+		for(;f<=k && f*f<=x && f<=0xFFFFFFFFU;f+=2){/* Overflow-safe f*f */
+			if(x%f == 0){
+				c = 0;
+				do{
+					x /= f;
+					c++;
+				}while(x%f == 0);
+
+				gaIFLAddFactors(fl, f, c);
+
+				goto primetest;
+			}
+		}
+
+		/* Check before next iteration for 64-bit integer overflow. */
+		nextI: if(i == 0xFFFFFFFFFFFFFFFF){break;}
+	}
+
+	/* Failed to factorize. */
+	return 0;
+}
+
+/**
+ * Store in fl the power of two obtained by rounding n up.
+ *
+ * The rounding smears the highest set bit of n-1 into every lower bit and
+ * then adds one. If the result wraps around to 0, gaICtz() reports 64 and the
+ * list holds 2^64. Always returns 1.
+ */
+
+static int      gaIFactorize2Smooth(uint64_t n, ga_factor_list* fl){
+	uint64_t rounded = n - 1;
+	int      shift;
+
+	/* Shifts of 1, 2, 4, 8, 16 and 32 bits fill all bits below the top one. */
+	for(shift=1;shift<64;shift<<=1){
+		rounded |= rounded >> shift;
+	}
+	rounded++;
+
+	gaIFLInit(fl);
+	gaIFLAddFactors(fl, 2, gaICtz(rounded));
+
+	return 1;
+}
+
+/**
+ * Store in fl a 3-smooth number (2^a * 3^b) that is >= n, chosen as the
+ * smallest candidate met while scanning the powers of 3.
+ *
+ * If no candidate representable in 64 bits was found, the list is set to
+ * 2^64 (factor 2 with power 64). Always returns 1.
+ */
+
+static int      gaIFactorize3Smooth(uint64_t n, ga_factor_list* fl){
+	uint64_t nBest=-1, i3Best=0, i3, p3, nCurr;
+	int nlz = gaIClz(n), isBest2to64 = 1;
+
+	/**
+	 * Iterate over all powers of 3, scaling them by the least power-of-2 such
+	 * that the result is greater than or equal to n. Report the smallest nBest
+	 * so obtained.
+	 */
+
+	for(i3=0, p3=1;i3<=40;i3++, p3*=3){
+		nCurr = p3;
+
+		/**
+		 * If the current power of 3 is >= n, then this must be the last
+		 * iteration, but perhaps a pure power of 3 is the best choice, so
+		 * check for this.
+		 */
+
+		if(nCurr >= n){
+			if(isBest2to64 || nBest >= nCurr){
+				isBest2to64 = 0;
+				nBest       = nCurr;
+				i3Best      = i3;
+			}
+			break;
+		}
+
+		/**
+		 * Otherwise we have a pure power of 3, p3, less than n, and must
+		 * derive the least power of 2 such that p3 multiplied by that power of
+		 * 2 is greater than or equal to n. We then compute the product of
+		 * both.
+		 *
+		 * Since nCurr < n here, nCurr has at least as many leading zeros as
+		 * n, so the shift amount below is never negative.
+		 */
+
+		nCurr <<= gaIClz(nCurr) - nlz;
+		if(nCurr<n){
+			/**
+			 * The line above only guarantees we get a value within a factor of
+			 * 2 from n. We may have to boost nCurr by another factor of 2, if
+			 * this is still possible without overflow.
+			 */
+
+			nCurr<<=1;
+			if(nCurr<n){
+				/**
+				 * If we enter this branch, overflow occurred. Moreover, we know
+				 * that (before overflow) it was the case that 2^63 <= nCurr < n,
+				 * and thus 2**64 is a superior factorization to this one. Skip.
+				 */
+
+				continue;
+			}
+		}
+
+		/**
+		 * By here we know that nCurr is >= n. But is it the best factorization
+		 * so far?
+		 */
+
+		if(isBest2to64 || nBest >= nCurr){
+			isBest2to64 = 0;
+			nBest       = nCurr;
+			i3Best      = i3;
+
+			if(nCurr == n){
+				break;
+			}
+		}
+	}
+
+
+	/**
+	 * Return the smallest n found above.
+	 *
+	 * nBest and i3Best must be set, unless isBest2to64 is still true, in
+	 * which case 2^64 is reported instead.
+	 */
+
+	gaIFLInit(fl);
+	if(isBest2to64){
+		gaIFLAddFactors(fl, 2, 64);
+	}else{
+		gaIFLAddFactors(fl, 2, gaICtz(nBest));
+		gaIFLAddFactors(fl, 3, i3Best);
+	}
+	return 1;
+}
+
+/**
+ * Store in fl a 5-smooth number (2^a * 3^b * 5^c) that is >= n, chosen as
+ * the smallest candidate met while scanning products of powers of 3 and 5.
+ *
+ * If no candidate representable in 64 bits was found, the list is set to
+ * 2^64 (factor 2 with power 64). Always returns 1.
+ */
+
+static int      gaIFactorize5Smooth(uint64_t n, ga_factor_list* fl){
+	uint64_t nBest=-1, i3Best=0, i3, p3, i5Best=0, i5, p5, nCurr;
+	int nlz = gaIClz(n), isBest2to64 = 1;
+
+	/**
+	 * Iterate over all products of powers of 5 and 3, scaling them by the
+	 * least power-of-2 such that the result is greater than or equal to n.
+	 * Report the smallest nBest so obtained.
+	 */
+
+	for(i5=0, p5=1;i5<=27;i5++, p5*=5){
+		nCurr = p5;
+
+		/**
+		 * If the current power of 5 is >= n, then this must be the last
+		 * iteration, but perhaps a pure power of 5 is the best choice, so
+		 * check for this.
+		 */
+
+		if(nCurr >= n){
+			if(isBest2to64 || nBest >= nCurr){
+				isBest2to64 = 0;
+				nBest       = nCurr;
+				i3Best      = 0;
+				i5Best      = i5;
+			}
+			break;
+		}
+
+		for(i3=0, p3=1;i3<=40;i3++, p3*=3){
+			/**
+			 * Detect when the product p3*p5 would overflow 2^64.
+			 *
+			 * The previous product (p3/3)*p5 is known not to have overflowed;
+			 * tripling it by repeated addition exposes any wrap-around.
+			 */
+
+			if(i3){
+				nCurr = (p3/3)*p5;
+				if(nCurr+nCurr < nCurr || nCurr+nCurr+nCurr < nCurr+nCurr){
+					break;
+				}
+			}
+			nCurr = p3*p5;
+
+			/**
+			 * If the current product of powers of 3 and 5 is >= n, then this
+			 * must be the last iteration, but perhaps a pure product of powers
+			 * of 3 and 5 is the best choice, so check for this.
+			 */
+
+			if(nCurr >= n){
+				if(isBest2to64 || nBest >= nCurr){
+					isBest2to64 = 0;
+					nBest       = nCurr;
+					i3Best      = i3;
+					i5Best      = i5;
+				}
+				break;
+			}
+
+			/**
+			 * Otherwise we have a number nCurr, composed purely of factors 3
+			 * and 5, that is less than n. We must derive the least power of 2
+			 * such that nCurr multiplied by that power of 2 is greater than or
+			 * equal to n. We then compute the product of both.
+			 */
+
+			nCurr <<= gaIClz(nCurr) - nlz;
+			if(nCurr<n){
+				/**
+				 * The line above only guarantees we get a value within a factor of
+				 * 2 from n. We may have to boost nCurr by another factor of 2, if
+				 * this is still possible without overflow.
+				 */
+
+				nCurr<<=1;
+				if(nCurr<n){
+					/**
+					 * If we enter this branch, overflow occurred. Moreover, we know
+					 * that (before overflow) it was the case that 2^63 <= nCurr < n,
+					 * and thus 2**64 is a superior factorization to this one. Skip.
+					 */
+
+					continue;
+				}
+			}
+
+			/**
+			 * By here we know that nCurr is >= n. But is it the best factorization
+			 * so far?
+			 */
+
+			if(isBest2to64 || nBest >= nCurr){
+				isBest2to64 = 0;
+				nBest       = nCurr;
+				i3Best      = i3;
+				i5Best      = i5;
+
+				/* An exact hit cannot be improved upon; stop searching. */
+				if(nCurr == n){
+					goto exit;
+				}
+			}
+		}
+	}
+
+
+	/**
+	 * Return the smallest n found above.
+	 *
+	 * nBest, i3Best and i5Best must be set, unless isBest2to64 is still true,
+	 * in which case 2^64 is reported instead.
+	 */
+
+    exit:
+	gaIFLInit(fl);
+	if(isBest2to64){
+		gaIFLAddFactors(fl, 2, 64);
+	}else{
+		gaIFLAddFactors(fl, 2, gaICtz(nBest));
+		gaIFLAddFactors(fl, 3, i3Best);
+		gaIFLAddFactors(fl, 5, i5Best);
+	}
+	return 1;
+}
+
+/**
+ * Reset a factor list to the empty list by zero-filling the whole structure
+ * (all factors, all powers and the distinct-factor count).
+ */
+
+void     gaIFLInit(ga_factor_list* fl){
+	memset(fl, 0, sizeof(*fl));
+}
+
+/**
+ * Report whether the factor list has reached its capacity of 15 distinct
+ * factors. Returns 1 if full, 0 otherwise.
+ */
+
+int      gaIFLFull(const ga_factor_list* fl){
+	/* Strictly speaking, fl->d never exceeds 15. */
+	if(fl->d < 15){
+		return 0;
+	}
+
+	return 1;
+}
+
+/**
+ * Add p powers of the factor f to the factor list (p may be negative to
+ * remove powers).
+ *
+ * The list is kept sorted by ascending factor. Powers are stored in 8 bits,
+ * so the resulting power of f must lie in [0, 255]; a request that would
+ * leave it outside that range (including removing powers of a factor that
+ * is not in the list) is rejected and the list is left untouched.
+ *
+ * Returns 1 on success (including the no-op cases p == 0 and f == 1), and 0
+ * if the request was rejected or a new factor did not fit in a full list.
+ */
+
+int      gaIFLAddFactors(ga_factor_list* fl, uint64_t f, int p){
+	int i, newP;
+
+	/**
+	 * Fast case: We're adding 0 powers of f, or any powers of 1. The
+	 * value of the factor list (and the integer it represents) is thus
+	 * unchanged.
+	 */
+
+	if(p == 0 || f == 1){
+		return 1;
+	}
+
+	/**
+	 * No stored power can absorb a change of more than 255 in either
+	 * direction. Rejecting early also keeps the additions below free of
+	 * signed overflow.
+	 */
+
+	if(p < -255 || p > 255){
+		return 0;
+	}
+
+	/**
+	 * Otherwise, the factor list has to change. We scan linearly the factor
+	 * list for either a pre-existing spot or an insertion spot. Scanning
+	 * linearly over a 15-element array is faster and less complex than binary
+	 * search.
+	 */
+
+	for(i=0;i<fl->d;i++){
+		if(fl->f[i] == f){
+			/**
+			 * Factor is already in list. Refuse updates that would wrap
+			 * the 8-bit power instead of silently corrupting it.
+			 */
+
+			newP = (int)fl->p[i] + p;
+			if(newP < 0 || newP > 255){
+				return 0;
+			}
+
+			fl->p[i] = (uint8_t)newP;
+			if(newP == 0){
+				/**
+				 * We removed all factors f. Bump leftwards the remainder to
+				 * maintain sorted order.
+				 *
+				 * (fl->d-i) elements are moved, one more than the live
+				 * entries to the right of i: this pulls in slot fl->d,
+				 * which is still zero, and so clears the vacated last
+				 * slot. The arrays have 16 slots and fl->d <= 15, so the
+				 * read stays in bounds.
+				 */
+
+				memmove(&fl->f[i], &fl->f[i+1], sizeof(fl->f[i])*(fl->d-i));
+				memmove(&fl->p[i], &fl->p[i+1], sizeof(fl->p[i])*(fl->d-i));
+				fl->d--;
+			}
+			return 1;
+		}else if(fl->f[i] > f){
+			/* Inject the factor at this place in order to keep list sorted,
+			   if we have the capacity. A factor that is not present cannot
+			   have powers removed from it. */
+
+			if(p < 0 || gaIFLFull(fl)){
+				/* Nothing to remove, or we can't bump the list rightwards
+				   because it's full already! */
+				return 0;
+			}
+
+			memmove(&fl->f[i+1], &fl->f[i], sizeof(fl->f[i])*(fl->d-i));
+			memmove(&fl->p[i+1], &fl->p[i], sizeof(fl->p[i])*(fl->d-i));
+			fl->f[i] = f;
+			fl->p[i] = (uint8_t)p;
+			fl->d++;
+			return 1;
+		}
+	}
+
+	/**
+	 * We looked at every factor in the list and f is strictly greater than
+	 * all of them.
+	 *
+	 * If the list is full, we cannot insert f, but if it isn't, we can simply
+	 * tack it at the end. As above, powers cannot be removed from a factor
+	 * that is absent.
+	 */
+
+	if(p < 0 || gaIFLFull(fl)){
+		return 0;
+	}else{
+		fl->f[fl->d] = f;
+		fl->p[fl->d] = (uint8_t)p;
+		fl->d++;
+		return 1;
+	}
+}
+
+/**
+ * Look up the power of factor f in the factor list.
+ *
+ * Returns the stored power, or 0 if f is not in the list.
+ */
+
+int      gaIFLGetFactorPower(const ga_factor_list* fl, uint64_t f){
+	int slot = 0;
+
+	while(slot < fl->d){
+		if(fl->f[slot] == f){
+			return fl->p[slot];
+		}
+		slot++;
+	}
+
+	/* f does not appear in the list. */
+	return 0;
+}
+
+/**
+ * Multiply out the factor list: the product of f[i]^p[i] over all entries.
+ *
+ * The product is computed in 64-bit unsigned arithmetic and wraps silently
+ * on overflow. An empty list yields 1.
+ */
+
+uint64_t gaIFLGetProduct(const ga_factor_list* fl){
+	uint64_t product = 1;
+	int      slot, remaining;
+
+	for(slot=0;slot<fl->d;slot++){
+		/* Multiply in the factor once per unit of its power. */
+		for(remaining=fl->p[slot];remaining>0;remaining--){
+			product *= fl->f[slot];
+		}
+	}
+
+	return product;
+}
+
+/**
+ * Report whether the product of the factor list exceeds 2^64-1.
+ *
+ * Returns 1 if multiplying out the list would overflow a uint64_t, and 0
+ * otherwise. A list containing the factor 0 never overflows.
+ */
+
+int      gaIFLIsOverflowed(const ga_factor_list* fl){
+	const uint64_t limit = ~(uint64_t)0;
+	uint64_t       acc   = 1;
+	int            slot, remaining;
+
+	/* A zero factor pins the product at 0. */
+	if(gaIFLGetFactorPower(fl, 0) >=  1){
+		return 0;
+	}
+
+	/* 2^64 or more overflows regardless of the other factors. */
+	if(gaIFLGetFactorPower(fl, 2) >= 64){
+		return 1;
+	}
+
+	for(slot=0;slot<fl->d;slot++){
+		for(remaining=fl->p[slot];remaining>0;remaining--){
+			/* Division-based check: would acc * f exceed the limit? */
+			if(fl->f[slot] > limit/acc){
+				return 1;
+			}
+			acc *= fl->f[slot];
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * Return the greatest factor in the list (its last entry, since the list is
+ * kept sorted), or 1 if the list is empty.
+ */
+
+uint64_t gaIFLGetGreatestFactor(const ga_factor_list* fl){
+	if(!fl->d){
+		return 1;
+	}
+
+	return fl->f[fl->d-1];
+}
+
+/**
+ * Return the smallest factor in the list (its first entry, since the list is
+ * kept sorted), or 1 if the list is empty.
+ */
+
+uint64_t gaIFLGetSmallestFactor(const ga_factor_list* fl){
+	if(!fl->d){
+		return 1;
+	}
+
+	return fl->f[0];
+}
+
+/**
+ * Product of the products of n factor lists, in wrapping 64-bit arithmetic.
+ * Returns 1 when n <= 0.
+ */
+
+static uint64_t gaIFLGetProductv(int n, const ga_factor_list* fl){
+	uint64_t total = 1;
+	int      which = 0;
+
+	while(which < n){
+		total *= gaIFLGetProduct(&fl[which]);
+		which++;
+	}
+
+	return total;
+}
+
+/**
+ * Find the greatest factor across n factor lists.
+ *
+ * If idx is non-NULL it receives the index of the list holding that factor
+ * (the last such list on ties), or 0 if every list is empty.
+ * Returns the factor, or 1 if every list is empty.
+ */
+
+static uint64_t gaIFLGetGreatestFactorv(int n, const ga_factor_list* fl, int* idx){
+	uint64_t best = 0, candidate;
+	int      where = 0, anyFactors = 0, which;
+
+	for(which=0;which<n;which++){
+		/* Empty lists contribute nothing. */
+		if(fl[which].d <= 0){
+			continue;
+		}
+
+		anyFactors = 1;
+		candidate  = gaIFLGetGreatestFactor(fl+which);
+		if(candidate >= best){
+			best  = candidate;
+			where = which;
+		}
+	}
+
+	if(idx){*idx = where;}
+
+	return anyFactors ? best : 1;
+}
+
+/**
+ * Find the smallest factor across n factor lists.
+ *
+ * If idx is non-NULL it receives the index of the list holding that factor
+ * (the last such list on ties), or 0 if every list is empty.
+ * Returns the factor, or 1 if every list is empty.
+ */
+
+static uint64_t gaIFLGetSmallestFactorv(int n, const ga_factor_list* fl, int* idx){
+	uint64_t best = ~(uint64_t)0, candidate;
+	int      where = 0, anyFactors = 0, which;
+
+	for(which=0;which<n;which++){
+		/* Empty lists contribute nothing. */
+		if(fl[which].d <= 0){
+			continue;
+		}
+
+		anyFactors = 1;
+		candidate  = gaIFLGetSmallestFactor(fl+which);
+		if(candidate <= best){
+			best  = candidate;
+			where = which;
+		}
+	}
+
+	if(idx){*idx = where;}
+
+	return anyFactors ? best : 1;
+}
+
+/**
+ * Print the factor list as a '*'-separated product, e.g. "2*2*3", or "1" for
+ * an empty list.
+ *
+ * str  Destination buffer, or NULL to only compute the length. When
+ *      non-NULL it must have room for the returned length plus the
+ *      terminating NUL.
+ * fl   Factor list to print.
+ *
+ * Returns the number of characters in the printed string, excluding the
+ * terminating NUL.
+ */
+
+int      gaIFLsprintf(char* str, const ga_factor_list* fl){
+	/* Big enough for a 20-digit 64-bit value, the '*' and the NUL. */
+	char   piece[32];
+	int    i, j, len;
+	int    total = 0;
+
+	/**
+	 * Loop over all factors and spit them out.
+	 *
+	 * Each term is formatted into a local buffer first, so that nothing is
+	 * ever written through str when it is NULL (calling sprintf() on a NULL
+	 * destination is undefined behaviour).
+	 */
+
+	for(i=0;i<fl->d;i++){
+		for(j=0;j<fl->p[i];j++){
+			len = sprintf(piece, "%llu*", (unsigned long long)fl->f[i]);
+			if(str){
+				memcpy(str+total, piece, (size_t)len);
+			}
+			total += len;
+		}
+	}
+
+	/* If no factors were printed, print 1. */
+	if(total == 0){
+		if(str){
+			str[0] = '1';
+			str[1] = '*';
+		}
+		total = 2;
+	}
+
+	/* Terminate buffer ('*' -> '\0') and deduct one character. */
+	total--;
+	if(str){
+		str[total]  = '\0';
+	}
+
+	return total;
+}
+
+/**
+ * Append the factor list to a string builder as a '*'-separated product,
+ * e.g. "2*2*3", or "1" for an empty list.
+ */
+
+void gaIFLappend(strb *sb, const ga_factor_list* fl){
+	int  slot, remaining;
+	int  termsPrinted = 0;
+
+	/* Emit every factor once per unit of its power, each followed by '*'. */
+	for(slot=0;slot<fl->d;slot++){
+		for(remaining=fl->p[slot];remaining>0;remaining--){
+			strb_appendf(sb, "%llu*", (unsigned long long)fl->f[slot]);
+			termsPrinted++;
+		}
+	}
+
+	/* An empty product is printed as 1. */
+	if(termsPrinted == 0){
+		strb_appendf(sb, "1");
+		return;
+	}
+
+	/* Otherwise drop the trailing '*' by shortening the builder by one. */
+	sb->l--;
+	sb->s[sb->l] = '\0';
+}
+
+/**
+ * Scheduler entry point for n > 0 dimensions.
+ *
+ * Factorizes bs[i], gs[i] and cs[i] for every dimension, runs the
+ * factor-list scheduler on them, and writes the resulting products back
+ * into bs, gs and cs.
+ *
+ * n        Number of dimensions; the caller guarantees n > 0.
+ * maxBtot  Limit on the product of all bs[i].
+ * maxBind  Per-dimension limits on bs[i].
+ * maxGtot  Limit on the product of all gs[i].
+ * maxGind  Per-dimension limits on gs[i].
+ * bs,gs,cs In/out arrays of n elements each.
+ *
+ * If the temporary storage cannot be allocated (malloc build only), the
+ * function returns with bs, gs and cs unchanged.
+ */
+
+static void     gaIScheduleChecked(const int       n,
+                                   const uint64_t  maxBtot,
+                                   const uint64_t* maxBind,
+                                   const uint64_t  maxGtot,
+                                   const uint64_t* maxGind,
+                                   uint64_t*       bs,
+                                   uint64_t*       gs,
+                                   uint64_t*       cs){
+	int      i;
+	uint64_t kBS, kGS, k;
+
+	/**
+	 * Allocate a VLA or similar.
+	 *
+	 * C89 neither allows VLAs nor a check beforehand that n>0 to avoid UB. The
+	 * check for n>0 was thus done in our caller.
+	 */
+
+#if GA_USING_MALLOC_FOR_VLA
+	ga_factor_list* factBS = malloc(n * sizeof(*factBS));
+	ga_factor_list* factGS = malloc(n * sizeof(*factGS));
+	ga_factor_list* factCS = malloc(n * sizeof(*factCS));
+
+	/* Out of memory: release whatever was obtained and give up. */
+	if(!factBS || !factGS || !factCS){
+		free(factBS);
+		free(factGS);
+		free(factCS);
+		return;
+	}
+#else
+	ga_factor_list factBS[n];
+	ga_factor_list factGS[n];
+	ga_factor_list factCS[n];
+#endif
+
+
+
+
+	/**
+	 * Factorize the provided integers under their k-smoothness constraint.
+	 * Use the strictest of either the block or grid constraints on each
+	 * dimension.
+	 *
+	 * gaIFactorize() rejects k == 1 without writing to its output list. The
+	 * lists are therefore initialized up front, and a rejected request is
+	 * retried without a smoothness constraint (k == 0), so that the
+	 * scheduler below never reads uninitialized storage. The scheduler
+	 * enforces the limits itself, whatever the factors are.
+	 */
+
+	for(i=0;i<n;i++){
+		kBS = maxBtot < maxBind[i] ? maxBtot : maxBind[i];
+		kGS = maxGtot < maxGind[i] ? maxGtot : maxGind[i];
+		k   =   kBS   <     kGS    ?   kBS   :     kGS;
+
+		gaIFLInit(factBS+i);
+		gaIFLInit(factGS+i);
+		gaIFLInit(factCS+i);
+
+		if(!gaIFactorize(bs[i], -1, k, factBS+i)){
+			gaIFactorize(bs[i], -1, 0, factBS+i);
+		}
+		if(!gaIFactorize(gs[i], -1, k, factGS+i)){
+			gaIFactorize(gs[i], -1, 0, factGS+i);
+		}
+		if(!gaIFactorize(cs[i], -1, k, factCS+i)){
+			gaIFactorize(cs[i], -1, 0, factCS+i);
+		}
+	}
+
+	/**
+	 * Invoke scheduler core with factor-list version of our arguments.
+	 */
+
+	gaIFLSchedule(n,
+	              maxBtot,
+	              maxBind,
+	              maxGtot,
+	              maxGind,
+	              factBS,
+	              factGS,
+	              factCS);
+
+
+	/**
+	 * Convert factor lists to products and place them in output arguments.
+	 */
+
+	for(i=0;i<n;i++){
+		bs[i] = gaIFLGetProduct(factBS+i);
+		gs[i] = gaIFLGetProduct(factGS+i);
+		cs[i] = gaIFLGetProduct(factCS+i);
+	}
+
+
+	/**
+	 * Eliminate VLA-like storage if it was allocated with malloc().
+	 */
+
+#if GA_USING_MALLOC_FOR_VLA
+	free(factBS);
+	free(factGS);
+	free(factCS);
+#endif
+}
+
+/**
+ * Public scheduler entry point on plain integers.
+ *
+ * Does nothing for n <= 0; otherwise forwards every argument unchanged to
+ * gaIScheduleChecked(), which may then assume n > 0.
+ */
+
+void     gaISchedule(const int       n,
+                     const uint64_t  maxBtot,
+                     const uint64_t* maxBind,
+                     const uint64_t  maxGtot,
+                     const uint64_t* maxGind,
+                     uint64_t*       bs,
+                     uint64_t*       gs,
+                     uint64_t*       cs){
+	if(n > 0){
+		gaIScheduleChecked(n, maxBtot, maxBind, maxGtot, maxGind, bs, gs, cs);
+	}
+}
+
+/**
+ * Scheduler core, operating on factor lists.
+ *
+ * Moves factors between factBS[i]/factGS[i] and factCS[i], in place, in
+ * three phases: satisfy the per-dimension limits, satisfy the total limits,
+ * then move factors back out of factCS[i] where the limits still allow it.
+ * Does nothing for n <= 0.
+ */
+
+void     gaIFLSchedule(const int       n,
+                       const uint64_t  maxBtot,
+                       const uint64_t* maxBind,
+                       const uint64_t  maxGtot,
+                       const uint64_t* maxGind,
+                       ga_factor_list* factBS,
+                       ga_factor_list* factGS,
+                       ga_factor_list* factCS){
+	/**
+	 * If we have zero dimensions, the scheduling job is easy.
+	 */
+
+	if(n<=0){return;}
+
+	/**
+	 * First, we move factors from factBS[i] and factGS[i] to factCS[i], in
+	 * order of largest to smallest, until their product is at or below
+	 * maxBind[i] and maxGind[i] respectively.
+	 */
+
+	gaIFLScheduleSatisfyInd(n, factBS, factCS, maxBind);
+	gaIFLScheduleSatisfyInd(n, factGS, factCS, maxGind);
+
+	/**
+	 * Then we move out more factors from factBS[i] and factGS[i], in order of
+	 * smallest to largest, until their common product is at or below maxBtot
+	 * and maxGtot respectively.
+	 */
+
+	gaIFLScheduleSatisfyTot(n, factBS, factCS, maxBtot);
+	gaIFLScheduleSatisfyTot(n, factGS, factCS, maxGtot);
+
+	/**
+	 * At this point, the scheduling is guaranteed to be valid, but may be
+	 * nowhere close to optimal.
+	 *
+	 * So we start moving in factors from factCS[i] to factBS[i], in order of
+	 * largest to smallest, while remaining below maxBtot and maxBind[i].
+	 *
+	 * Lastly, we move in factors from factCS[i] to factGS[i], in order of
+	 * largest to smallest, while remaining below maxGtot and maxGind[i].
+	 */
+
+	gaIFLScheduleOpt(n, factCS, factBS, maxBtot, maxBind);
+	gaIFLScheduleOpt(n, factCS, factGS, maxGtot, maxGind);
+}
+
+/**
+ * Enforce the per-dimension limits.
+ *
+ * For every dimension i, moves factors from from[i] to to[i], greatest
+ * factor first, until the product of from[i] is at or below maxInd[i] or
+ * from[i] has nothing left to give.
+ */
+
+static void     gaIFLScheduleSatisfyInd(const int       n,
+                                        ga_factor_list* from,
+                                        ga_factor_list* to,
+                                        const uint64_t* maxInd){
+	int      i;
+	uint64_t f, p;
+
+	for(i=0;i<n;i++){
+		p = gaIFLGetProduct       (from+i);
+		while(p > maxInd[i]){
+			/**
+			 * Always ask the list for its current greatest factor, so that
+			 * we never try to remove a factor that has already been used up.
+			 */
+
+			f = gaIFLGetGreatestFactor(from+i);
+			if(f <= 1){
+				/**
+				 * Nothing left that could shrink the product (empty list),
+				 * or a zero factor we cannot divide by. Stop rather than
+				 * spin forever, e.g. when maxInd[i] is 0.
+				 */
+
+				break;
+			}
+
+			p /= f;
+			gaIFLAddFactors(from+i, f, -1);
+			gaIFLAddFactors(to  +i, f, +1);
+		}
+	}
+}
+
+/**
+ * Enforce the limit on the total product.
+ *
+ * Moves factors from from[a] to to[a], smallest factor (across all
+ * dimensions) first, until the product over all `from` lists is at or below
+ * maxTot or there is nothing left to move.
+ */
+
+static void     gaIFLScheduleSatisfyTot(const int       n,
+                                        ga_factor_list* from,
+                                        ga_factor_list* to,
+                                        const uint64_t  maxTot){
+	int      a, i, c;
+	uint64_t f, p;
+
+	p = gaIFLGetProductv(n, from);
+	a = 0;
+
+	while(p > maxTot){
+		f = gaIFLGetSmallestFactorv(n, from, &a);
+		c = gaIFLGetFactorPower    (from+a, f);
+
+		if(f <= 1 || c <= 0){
+			/**
+			 * Every list is empty (or only a zero factor remains), so no
+			 * move can lower the product. Stop rather than spin forever,
+			 * e.g. when maxTot is 0.
+			 */
+
+			break;
+		}
+
+		/* Move out copies of f one at a time, stopping as soon as we fit. */
+		for(i=c-1;i>=0 && p>maxTot;i--){
+			p /= f;
+			gaIFLAddFactors(from+a, f, -1);
+			gaIFLAddFactors(to  +a, f, +1);
+		}
+	}
+}
+
+/**
+ * Greedy optimization pass.
+ *
+ * Repeatedly moves the largest factor that still fits from from[k] to
+ * to[k], for whichever dimension k offers it, while keeping the product of
+ * all `to` lists at or below maxTot and the product of each to[i] at or
+ * below maxInd[i].
+ *
+ * Returns without changing anything if the temporary storage cannot be
+ * allocated (malloc build only) or if the current total product is 0.
+ */
+
+static void     gaIFLScheduleOpt(const int       n,
+                                 ga_factor_list* from,
+                                 ga_factor_list* to,
+                                 const uint64_t  maxTot,
+                                 const uint64_t* maxInd){
+	int i, j, k;
+	uint64_t maxFTot, maxFInd, currF, f, pTot = 1;
+#if GA_USING_MALLOC_FOR_VLA
+	uint64_t* pInd = malloc(n * sizeof(*pInd));
+#else
+	uint64_t  pInd[n];
+#endif
+
+	/* Muzzle compiler about a random function being unused. */
+	(void)gaIFLGetGreatestFactorv;
+
+#if GA_USING_MALLOC_FOR_VLA
+	/* Out of memory: the schedule is already valid, just not optimized. */
+	if(!pInd){
+		return;
+	}
+#endif
+
+	/**
+	 * Check whether optimization is possible.
+	 *
+	 * A total product of 0 leaves nothing to optimize and would make the
+	 * divisions below divide by zero.
+	 */
+
+	for(i=0;i<n;i++){
+		pTot *= pInd[i] = gaIFLGetProduct(to+i);
+	}
+	if(pTot == 0){
+		goto done;
+	}
+	maxFTot = maxTot/pTot;
+	if(maxFTot <= 1){
+		goto done;
+	}
+
+	/* Optimize. */
+	do{
+		/**
+		 * At the beginning of each iteration, maxFTot is preset to maxTot/p,
+		 * the largest factor that can legitimately be added into `to` without
+		 * exceeding the *global* limit.
+		 *
+		 * We select, amongst all dimensions, the largest f such that
+		 *     f <= maxFTot     and
+		 *     f <= maxFInd[k]
+		 * and record both f and k.
+		 */
+
+		f =  1;
+		k = -1;
+		for(i=0;i<n;i++){
+			if(pInd[i] == 0){
+				/* Cannot compute a per-dimension headroom; skip. */
+				continue;
+			}
+			maxFInd = maxInd[i]/pInd[i];
+
+			/* Factors are sorted ascending, so scan from the greatest down. */
+			for(j=from[i].d-1;j>=0;j--){
+				currF = from[i].f[j];
+
+				if(currF <= maxFTot && currF <= maxFInd && currF >= f){
+					f = currF;
+					k = i;
+					break;
+				}
+			}
+		}
+
+		/* No dimension has a factor that fits: we are done. */
+		if(k == -1){
+			break;
+		}
+
+		gaIFLAddFactors(from+k, f, -1);
+		gaIFLAddFactors(to  +k, f, +1);
+		pInd[k] *= f;
+		pTot    *= f;
+		maxFTot  = maxTot/pTot;
+	}while(maxFTot>1 && f>1);
+
+	/* Single exit point, so the malloc'ed storage is released on all paths. */
+	done:
+#if GA_USING_MALLOC_FOR_VLA
+	free(pInd);
+#endif
+	return;
+}
diff --git a/src/util/integerfactoring.h b/src/util/integerfactoring.h
new file mode 100644
index 0000000000..0ca4c14f68
--- /dev/null
+++ b/src/util/integerfactoring.h
@@ -0,0 +1,274 @@
+/* Include Guards */
+#ifndef INTEGERFACTORING_H
+#define INTEGERFACTORING_H
+
+
+/* Includes */
+#include <stdio.h>
+#include "gpuarray/config.h"
+
+#include "util/strb.h"
+
+
+/* Defines */
+
+
+
+/* C++ Extern "C" Guard */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+/* Data Structure Prototypes & Typedefs */
+struct ga_factor_list_;
+typedef struct ga_factor_list_ ga_factor_list;
+
+
+
+/* Data Structures */
+
+/**
+ * @brief The GA_FACTOR_LIST struct.
+ *
+ * Contains the list of distinct prime factors of a 64-bit unsigned integer, as
+ * well as the powers of those factors.
+ *
+ * There can be at most 15 such distinct factors, since the product of the
+ * first 16 primes (2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53) exceeds
+ * the maximum unsigned number of 2^64-1. Moreover, there can be at most 63
+ * factors all together, since 2^64 exceeds 2^64-1, so only an 8-bit number is
+ * required to store the powers.
+ *
+ * The 16th (last) element of the factor list, f[15], is always 0 and has
+ * power 0, and serves as a sort of sentinel.
+ */
+
+struct ga_factor_list_{
+	uint64_t f[16];/* Distinct factors; f[i] is paired with the power p[i]. */
+	uint8_t  p[16];/* Power of the factor stored at the same index in f[]. */
+	int      d;    /* Number of distinct factors currently stored in f[]/p[]. */
+};
+
+
+
+/* Functions */
+
+/**
+ * @brief Checks whether an integer is prime.
+ *
+ * @param [in] n   The integer whose primality is to be checked.
+ * @return 1 if prime; 0 if not prime.
+ *
+ * NB: This is *not* a probabilistic primality checker. For all integers it can
+ *     be given as input, it will correctly report "prime" or "composite".
+ * NB: Internally, this function uses the Miller-Rabin test, which *is*
+ *     probabilistic, and may falsely report a number as prime when in fact it
+ *     is composite. However, this function uses a deterministic set of
+ *     Miller-Rabin "witnesses", which ensures that there are no strong
+ *     probable primes equal to or below 2^64-1 (the size of the input
+ *     argument). This set of witnesses is
+ *
+ *         $$a = 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, and 37$$
+ *
+ *     See https://oeis.org/A014233
+ */
+
+int      gaIIsPrime(uint64_t n);
+
+/**
+ * @brief Factorize a positive integer into a list of factors satisfying
+ * certain properties.
+ *
+ * The function factorizes a 64-bit, positive integer into a list of factors.
+ * This factorization can be made "approximate"; That is, the product of the
+ * factors returned can be slightly greater than the input number. The
+ * maximum increase is controlled by a "slack" parameter maxN, as follows:
+ *
+ *     $$\texttt{n} \le \prod(\mathrm{fact}(\texttt{n})) \le \texttt{maxN}$$
+ *
+ * The advantage of offering some slack to the factorizer is that in return,
+ * the factorizer may succeed in outputting a factorization with smaller
+ * factors. The maxN slack parameter must be 0 or be greater than or equal to
+ * n, but it is completely useless to set it beyond 2n.
+ *
+ * When maxN is equal to -1 (2^64 - 1), or is greater than or equal to 2n, no
+ * upper limit is placed on the output factor list's product, but this
+ * implementation guarantees its product will not exceed 2n. This is because
+ * there always exists a power of two that lies between n and 2n, and since
+ * this factorization involves only powers of the smallest prime (2), it is a
+ * valid factorization under any valid k-smoothness constraint, and so may be
+ * returned.
+ *
+ * When maxN is equal to 0 (no increase in value allowed), an exact factoring
+ * is requested.
+ *
+ * The factorization can also be constrained by a (k)-smoothness constraint.
+ * A k-smooth number n has no prime factors greater than k. If the factorizer
+ * is asked to factor with k-smoothness a number with prime factors greater
+ * than k, it will search, within the slack space, for a slightly larger
+ * number that is k-smooth and return that number's factoring. With maxN == n
+ * and a k-smoothness constraint, this function reports whether or not the
+ * number is k-smooth.
+ *
+ * When k is equal to 0, equal to -1 (2^64 - 1), or is greater than or equal
+ * to n, no k-smoothness constraints are imposed. An exact factoring is
+ * required.
+ *
+ * @param [in]  n       The integer to be factorized. Must be >0.
+ * @param [in]  maxN    The "slack" parameter. The factor list returned will
+ *                      not have a product that exceeds this number.
+ * @param [in]  k       The k-smoothness constraint. k is the largest
+ *                      acceptable factor in the output factor list. The
+ *                      factorizer will, effectively, treat any number all of
+ *                      whose prime factors exceed k as a prime.
+ * @param [out] fl      The output factor list. Does *NOT* need to be
+ *                      initialized.
+ * @return Non-zero if a factorization is found that satisfies both slack and
+ *         smoothness constraints; Zero if no such factorization is found.
+ *         If this function returns zero, the last factor in the factor
+ *         list is not guaranteed to be prime.
+ */
+
+int      gaIFactorize(uint64_t n, uint64_t maxN, uint64_t k, ga_factor_list* fl);
+
+/**
+ * @brief Initialize a factors list to all-factors- and all-powers-zero.
+ *
+ * Such a factors list represents 1, since 0^0 = 1.
+ */
+
+void     gaIFLInit(ga_factor_list* fl);
+
+/**
+ * @brief Reports whether another *distinct* factor can be added to the factor
+ *        list safely.
+ *
+ * @return Returns zero if there are less than 15 distinct factors in the list
+ *         and non-zero otherwise.
+ */
+
+int      gaIFLFull(const ga_factor_list* fl);
+
+/**
+ * @brief Add a factor f with power p to the factor list.
+ *
+ * If factor f was already present in the factor list, increments
+ * the corresponding power by p. Otherwise, adds the new factor f to
+ * the list, if there is still space, and sets the power to p.
+ *
+ * Maintains factor list in sorted order.
+ *
+ * @return Non-zero if factor successfully added; Zero otherwise.
+ */
+
+int      gaIFLAddFactors(ga_factor_list* fl, uint64_t f, int p);
+
+/**
+ * @brief Get the power of a given factor within a factor list.
+ *
+ * @return The number of times a factor occurs within the current
+ *         factorization. If it does not occur, return 0.
+ */
+
+int      gaIFLGetFactorPower(const ga_factor_list* fl, uint64_t f);
+
+/**
+ * @brief Compute the product of the factors stored in the factors list.
+ *
+ * NB: This function may return an overflowed result. To detect if it will,
+ *     please call gaIFLIsOverflowed(fl).
+ */
+
+uint64_t gaIFLGetProduct(const ga_factor_list* fl);
+
+/**
+ * @brief Check whether the factor list produces a number >= 2^64.
+ */
+
+int      gaIFLIsOverflowed(const ga_factor_list* fl);
+
+/**
+ * @brief Get the greatest factor in the factors list.
+ */
+
+uint64_t gaIFLGetGreatestFactor(const ga_factor_list* fl);
+
+/**
+ * @brief Get the smallest factor in the factors list.
+ */
+
+uint64_t gaIFLGetSmallestFactor(const ga_factor_list* fl);
+
+/**
+ * @brief Print out the factor list in a human-readable form, sprintf()-style.
+ *
+ * @param [out] str   A string into which to print out the factor list. If the
+ *                    factor list is a result of gaIFactorize(), then the
+ *                    maximum length of buffer required is 128 bytes.
+ *                    If str is NULL, nothing is printed.
+ * @param [in]  fl    The factor list to be printed.
+ * @return            The number of characters that would have been printed
+ *                    out, assuming an unbounded, non-NULL buffer.
+ */
+
+int gaIFLsprintf(char* str, const ga_factor_list* fl);
+
+/**
+ * @brief Print out the factor list in a human-readable form.
+ *
+ * @param [in,out] sb The string buffer to which the printed factor list is
+ *                   appended. If the factor list is a result of
+ *                   gaIFactorize(), at most 128 bytes are appended.
+ * @param [in]  fl   The factor list to be printed.
+ */
+
+void gaIFLappend(strb *sb, const ga_factor_list* fl);
+
+/**
+ * @brief Schedule block size, grid size and what's left over that fits in
+ *        neither, which will be called "chunk" size, subject to constraints.
+ *
+ * @param [in]     n        Number of dimensions of the problem. The arrays
+ *                          maxBind, maxGind, factBS, factGS, factCS must have
+ *                          n elements.
+ * @param [in]     maxBtot  The product of the block sizes in all n dimensions
+ *                          will not exceed this value.
+ * @param [in]     maxBind  The block size in dimensions i=0..n-1 will not
+ *                          exceed maxBind[i].
+ * @param [in]     maxGtot  The product of the grid sizes in all n dimensions
+ *                          will not exceed this value.
+ * @param [in]     maxGind  The grid size in dimensions i=0..n-1 will not
+ *                          exceed maxGind[i].
+ * @param [in,out] factBS   The block size for dimensions 0..n-1, as a factor list.
+ * @param [in,out] factGS   The grid  size for dimensions 0..n-1, as a factor list.
+ * @param [in,out] factCS   The chunk size for dimensions 0..n-1, as a factor list.
+ */
+
+void     gaIFLSchedule(const int       n,
+                       const uint64_t  maxBtot,
+                       const uint64_t* maxBind,
+                       const uint64_t  maxGtot,
+                       const uint64_t* maxGind,
+                       ga_factor_list* factBS,
+                       ga_factor_list* factGS,
+                       ga_factor_list* factCS);
+void     gaISchedule  (const int       n,
+                       const uint64_t  maxBtot,
+                       const uint64_t* maxBind,
+                       const uint64_t  maxGtot,
+                       const uint64_t* maxGind,
+                       uint64_t*       bs,
+                       uint64_t*       gs,
+                       uint64_t*       cs);
+
+
+/* End C++ Extern "C" Guard */
+#ifdef __cplusplus
+}
+#endif
+
+
+/* End Include Guards */
+#endif
diff --git a/src/util/skein.c b/src/util/skein.c
new file mode 100644
index 0000000000..38912e8320
--- /dev/null
+++ b/src/util/skein.c
@@ -0,0 +1,317 @@
+/***********************************************************************
+**
+** Implementation of the Skein hash function.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+************************************************************************/
+
+#include <string.h>      /* get the memcpy/memset functions */
+#include "skein.h"       /* get the Skein API definitions   */
+
+#define MK_64 SKEIN_MK_64
+
+/* Initial chaining values for blkSize = 512 bits, hashSize = 512 bits; Skein_512_Init() copies them into ctx->X */
+static const u64b_t SKEIN_512_IV_512[] =
+  {
+    MK_64(0x4903ADFF,0x749C51CE),
+    MK_64(0x0D95DE39,0x9746DF03),
+    MK_64(0x8FD19341,0x27C79BCE),
+    MK_64(0x9A255629,0xFF352CB1),
+    MK_64(0x5DB62599,0xDF6CA7B0),
+    MK_64(0xEABE394C,0xA9D5C3F4),
+    MK_64(0x991112C7,0x1A75B523),
+    MK_64(0xAE18A40B,0x660FCC33)
+  };
+
+/* Serialize the first bCnt bytes of the 64-bit words in src into dst,
+ * emitting the least significant byte of each word first. */
+static void Skein_Put64_LSB_First(u08b_t *dst,const u64b_t *src,size_t bCnt) {
+  size_t idx = 0;
+
+  while (idx < bCnt) {
+    const u64b_t word  = src[idx / 8];       /* word holding output byte idx */
+    const size_t shift = 8 * (idx % 8);      /* bit offset of that byte */
+
+    dst[idx] = (u08b_t)(word >> shift);
+    idx++;
+  }
+}
+
+/* Assemble wCnt 64-bit words in dst from 8*wCnt bytes in src, treating the
+ * first byte of each group of eight as the least significant. */
+static void Skein_Get64_LSB_First(u64b_t *dst, const u08b_t *src,
+                                  size_t wCnt) {
+  size_t word, byte;
+
+  for (word = 0; word < wCnt; word++) {
+    u64b_t acc = 0;
+
+    /* each byte lands in its own 8-bit lane, so OR-ing equals adding */
+    for (byte = 0; byte < 8; byte++)
+      acc |= ((u64b_t) src[8*word + byte]) << (8*byte);
+
+    dst[word] = acc;
+  }
+}
+
+/* Return a word whose in-memory bytes are the bytes of `in` ordered from
+ * least to most significant. On a little-endian host this is `in` unchanged;
+ * on a big-endian host it is `in` with its bytes reversed. */
+static u64b_t Skein_Swap64(u64b_t in) {
+  u64b_t result;
+  u08b_t *bytes = (u08b_t *)&result;
+  unsigned k;
+
+  for (k = 0; k < 8; k++)
+    bytes[k] = (u08b_t)(in >> (8*k));
+
+  return result;
+}
+
+/*****************************************************************/
+/* Function to process blkCnt (nonzero) full block(s) of data. */
+#define BLK_BITS        (WCNT*64)               /* some useful definitions for \
+                                                   code here */
+#define KW_TWK_BASE     (0)                     /* offset of the tweak words within kw[] */
+#define KW_KEY_BASE     (3)                     /* offset of the key schedule words within kw[] */
+#define ks              (kw + KW_KEY_BASE)      /* key schedule view of a local array named kw */
+#define ts              (kw + KW_TWK_BASE)      /* tweak schedule view of a local array named kw */
+
+#define RotL_64(x,N)    (((x) << (N)) | ((x) >> (64-(N)))) /* rotate x left by N bits; N must be 1..63, since N == 0 would shift right by 64 */
+
+/*
+ * Compress blkCnt consecutive blocks of SKEIN_512_BLOCK_BYTES bytes, read
+ * from blkPtr, into the chaining variables ctx->X.
+ *
+ *   ctx        - context whose X[] and tweak words h.T[] are read and updated
+ *   blkPtr     - input data; advanced by one block per iteration
+ *   blkCnt     - number of blocks to process; must be nonzero
+ *   byteCntAdd - amount added to the T[0] byte counter before each block
+ *
+ * The FIRST flag is cleared from T[1] after each block, and the final tweak
+ * words are written back to ctx->h.T on exit.
+ */
+static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const u08b_t *blkPtr,
+                             size_t blkCnt, size_t byteCntAdd) {
+  enum {
+      WCNT = SKEIN_512_STATE_WORDS
+  };
+
+  /*
+   * Skein-512 is specified with 72 rounds. If no header supplies the round
+   * count, the `#if R512_Unroll_R(n)` tests below would evaluate an undefined
+   * identifier, which the preprocessor treats as 0, and only the first group
+   * of 8 rounds would be compiled in. Provide the standard value as a
+   * fallback so that all 9 groups are emitted.
+   * NOTE(review): digests produced by a build that lacked this definition
+   * will differ from the ones produced now.
+   */
+#ifndef SKEIN_512_ROUNDS_TOTAL
+#define SKEIN_512_ROUNDS_TOTAL (72)
+#endif
+#define RCNT  (SKEIN_512_ROUNDS_TOTAL/8)      /* number of 8-round groups */
+
+  u64b_t  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
+  u64b_t  X0,X1,X2,X3,X4,X5,X6,X7;            /* local copy of vars, for speed */
+  u64b_t  w [WCNT];                           /* local copy of input block */
+
+  Skein_assert(blkCnt != 0);                  /* never call with blkCnt == 0! */
+  ts[0] = ctx->h.T[0];
+  ts[1] = ctx->h.T[1];
+  do  {
+        /* this implementation only supports 2**64 input bytes (no carry out here) */
+    ts[0] += byteCntAdd;                    /* update processed length */
+
+    /* precompute the key schedule for this block */
+    ks[0] = ctx->X[0];
+    ks[1] = ctx->X[1];
+    ks[2] = ctx->X[2];
+    ks[3] = ctx->X[3];
+    ks[4] = ctx->X[4];
+    ks[5] = ctx->X[5];
+    ks[6] = ctx->X[6];
+    ks[7] = ctx->X[7];
+    /* extra key word: XOR of all chaining words and the parity constant */
+    ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
+      ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
+
+    /* extra tweak word: XOR of the two tweak words */
+    ts[2] = ts[0] ^ ts[1];
+
+    Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */
+
+    X0   = w[0] + ks[0];                    /* do the first full key injection */
+    X1   = w[1] + ks[1];
+    X2   = w[2] + ks[2];
+    X3   = w[3] + ks[3];
+    X4   = w[4] + ks[4];
+    X5   = w[5] + ks[5] + ts[0];
+    X6   = w[6] + ks[6] + ts[1];
+    X7   = w[7] + ks[7];
+
+    blkPtr += SKEIN_512_BLOCK_BYTES;
+
+    /* run the rounds */
+    /* one round: four add / rotate / xor mixes on the word pairs named by p0..p7 */
+#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                  \
+    X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \
+    X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \
+    X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \
+    X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \
+
+#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)      /* unrolled */  \
+    Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)
+
+    /* key injection number (R)+1: rotating key/tweak words plus the counter */
+#define I512(R)                                                     \
+    X0   += ks[((R)+1) % 9];   /* inject the key schedule value */  \
+    X1   += ks[((R)+2) % 9];                                        \
+    X2   += ks[((R)+3) % 9];                                        \
+    X3   += ks[((R)+4) % 9];                                        \
+    X4   += ks[((R)+5) % 9];                                        \
+    X5   += ks[((R)+6) % 9] + ts[((R)+1) % 3];                      \
+    X6   += ks[((R)+7) % 9] + ts[((R)+2) % 3];                      \
+    X7   += ks[((R)+8) % 9] +     (R)+1;
+
+    {
+
+#define R512_8_rounds(R)  /* do 8 full rounds */  \
+        R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1);   \
+        R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2);   \
+        R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3);   \
+        R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4);   \
+        I512(2*(R));                              \
+        R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5);   \
+        R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6);   \
+        R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7);   \
+        R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8);   \
+        I512(2*(R)+1);        /* and key injection */
+
+      R512_8_rounds( 0);
+
+      /* true when the configured round count needs 8-round group number NN */
+#define R512_Unroll_R(NN) (SKEIN_512_ROUNDS_TOTAL/8 > (NN))
+
+  #if   R512_Unroll_R( 1)
+      R512_8_rounds( 1);
+  #endif
+  #if   R512_Unroll_R( 2)
+      R512_8_rounds( 2);
+  #endif
+  #if   R512_Unroll_R( 3)
+      R512_8_rounds( 3);
+  #endif
+  #if   R512_Unroll_R( 4)
+      R512_8_rounds( 4);
+  #endif
+  #if   R512_Unroll_R( 5)
+      R512_8_rounds( 5);
+  #endif
+  #if   R512_Unroll_R( 6)
+      R512_8_rounds( 6);
+  #endif
+  #if   R512_Unroll_R( 7)
+      R512_8_rounds( 7);
+  #endif
+  #if   R512_Unroll_R( 8)
+      R512_8_rounds( 8);
+  #endif
+  #if   R512_Unroll_R( 9)
+      R512_8_rounds( 9);
+  #endif
+  #if   R512_Unroll_R(10)
+      R512_8_rounds(10);
+  #endif
+  #if   R512_Unroll_R(11)
+      R512_8_rounds(11);
+  #endif
+  #if   R512_Unroll_R(12)
+      R512_8_rounds(12);
+  #endif
+  #if   R512_Unroll_R(13)
+      R512_8_rounds(13);
+  #endif
+  #if   R512_Unroll_R(14)
+      R512_8_rounds(14);
+  #endif
+    }
+
+    /* do the final "feedforward" xor, update context chaining vars */
+    ctx->X[0] = X0 ^ w[0];
+    ctx->X[1] = X1 ^ w[1];
+    ctx->X[2] = X2 ^ w[2];
+    ctx->X[3] = X3 ^ w[3];
+    ctx->X[4] = X4 ^ w[4];
+    ctx->X[5] = X5 ^ w[5];
+    ctx->X[6] = X6 ^ w[6];
+    ctx->X[7] = X7 ^ w[7];
+
+    /* only the first block of a sequence carries the FIRST flag */
+    ts[1] &= ~SKEIN_T1_FLAG_FIRST;
+  }
+  while (--blkCnt);
+  /* publish the updated byte counter and flags back to the context */
+  ctx->h.T[0] = ts[0];
+  ctx->h.T[1] = ts[1];
+}
+
+/*****************************************************************/
+/*     512-bit Skein                                             */
+/*****************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a straight hashing operation  */
+int Skein_512_Init(Skein_512_Ctxt_t *ctx) {
+  /* start from the precomputed chaining values for a 512-bit digest */
+  memcpy(ctx->X, SKEIN_512_IV_512, sizeof(ctx->X));
+  ctx->h.hashBitLen = 512;
+
+  /* begin a message-type sequence: T0 = 0, T1 = FIRST | MSG, empty buffer */
+  Skein_Set_T0_T1(ctx, 0, SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_MSG);
+  ctx->h.bCnt = 0;
+
+  return SKEIN_SUCCESS;
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* process the input bytes */
+int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg,
+                     size_t msgByteCnt) {
+  size_t take;
+
+  /* a buffered count above one block means the context was never initialized */
+  Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);
+
+  /* Blocks are compressed only when more than one block's worth of data is
+   * available; the last (possibly full) block always stays buffered. */
+  if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) {
+    if (ctx->h.bCnt != 0) {
+      /* top up the partially filled buffer, then compress it */
+      take = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;
+      if (take != 0) {
+        Skein_assert(take < msgByteCnt);
+        memcpy(ctx->bb.b + ctx->h.bCnt, msg, take);
+        ctx->h.bCnt += take;
+        msg         += take;
+        msgByteCnt  -= take;
+      }
+      Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES);
+      Skein_512_Process_Block(ctx, ctx->bb.b, 1, SKEIN_512_BLOCK_BYTES);
+      ctx->h.bCnt = 0;
+    }
+
+    if (msgByteCnt > SKEIN_512_BLOCK_BYTES) {
+      /* compress whole blocks straight from the caller's data, holding back
+       * at least one byte for the buffer */
+      const size_t blocks = (msgByteCnt - 1) / SKEIN_512_BLOCK_BYTES;
+      const size_t bytes  = blocks * SKEIN_512_BLOCK_BYTES;
+
+      Skein_512_Process_Block(ctx, msg, blocks, SKEIN_512_BLOCK_BYTES);
+      msg        += bytes;
+      msgByteCnt -= bytes;
+    }
+    Skein_assert(ctx->h.bCnt == 0);
+  }
+
+  /* stash whatever is left in the block buffer */
+  if (msgByteCnt != 0) {
+    Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES);
+    memcpy(ctx->bb.b + ctx->h.bCnt, msg, msgByteCnt);
+    ctx->h.bCnt += msgByteCnt;
+  }
+
+  return SKEIN_SUCCESS;
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finalize the hash computation and output the result */
+int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal) {
+  size_t blk, offset, chunk, outBytes;
+  u64b_t key[SKEIN_512_STATE_WORDS];
+
+  /* a buffered count above one block means the context was never initialized */
+  Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);
+
+  /* compress the buffered data as the final block, zero padded to full size */
+  ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+  if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)
+    memset(ctx->bb.b + ctx->h.bCnt, 0, SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+  Skein_512_Process_Block(ctx, ctx->bb.b, 1, ctx->h.bCnt);
+
+  /* digest length in bytes, rounded up from the bit length */
+  outBytes = (ctx->h.hashBitLen + 7) >> 3;
+
+  /* Output stage: compress a counter block once per output block, each time
+   * starting from the same saved chaining state. */
+  memset(ctx->bb.b, 0, sizeof(ctx->bb.b));
+  memcpy(key, ctx->X, sizeof(key));
+
+  blk    = 0;
+  offset = 0;
+  while (offset < outBytes) {
+    ctx->bb.l[0] = Skein_Swap64((u64b_t) blk);   /* counter, LSB first in b[] */
+    Skein_Start_New_Type(ctx,OUT_FINAL);
+    Skein_512_Process_Block(ctx, ctx->bb.b, 1, sizeof(u64b_t));
+
+    /* emit at most one block's worth of the remaining output */
+    chunk = outBytes - offset;
+    if (chunk > SKEIN_512_BLOCK_BYTES)
+      chunk = SKEIN_512_BLOCK_BYTES;
+    Skein_Put64_LSB_First(hashVal + offset, ctx->X, chunk);
+
+    memcpy(ctx->X, key, sizeof(key));            /* rewind for the next counter */
+    blk++;
+    offset = blk * SKEIN_512_BLOCK_BYTES;
+  }
+
+  return SKEIN_SUCCESS;
+}
+
+/* One-shot helper: hash msgByteCnt bytes at msg and write the digest to
+ * hashVal. Returns SKEIN_FAIL as soon as any stage reports an error. */
+int Skein_512(const u08b_t *msg, size_t msgByteCnt, u08b_t *hashVal) {
+  Skein_512_Ctxt_t ctx;
+
+  /* || short-circuits, so a failing stage prevents the later ones from running */
+  if (Skein_512_Init(&ctx) ||
+      Skein_512_Update(&ctx, msg, msgByteCnt) ||
+      Skein_512_Final(&ctx, hashVal))
+    return SKEIN_FAIL;
+
+  return SKEIN_SUCCESS;
+}
diff --git a/src/util/skein.h b/src/util/skein.h
new file mode 100644
index 0000000000..b505a51801
--- /dev/null
+++ b/src/util/skein.h
@@ -0,0 +1,149 @@
+#ifndef _SKEIN_H_
+#define _SKEIN_H_     1
+/**************************************************************************
+**
+** Interface declarations and internal definitions for Skein hashing.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+***************************************************************************
+**
+** The following compile-time switches may be defined to control some
+** tradeoffs between speed, code size, error checking, and security.
+**
+** The "default" note explains what happens when the switch is not defined.
+**
+**  SKEIN_ERR_CHECK        -- in the reference code, selects how error
+**                            checking is handled (undefined: mostly off;
+**                            0: assert(); 1: return SKEIN_FAIL). This copy
+**                            does not test the switch: Skein_Assert()
+**                            always returns its retCode argument and
+**                            Skein_assert() always maps to assert().
+**
+***************************************************************************/
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include <stddef.h>                          /* get size_t definition */
+#include <gpuarray/config.h>
+typedef unsigned int uint_t;
+typedef uint8_t  u08b_t;
+typedef uint64_t u64b_t;
+
+enum {
+  SKEIN_SUCCESS         =      0,          /* return code: the Skein call completed */
+  SKEIN_FAIL            =      1           /* return code: a Skein_Assert() check failed */
+};
+
+#define  SKEIN_MODIFIER_WORDS  ( 2)     /* number of modifier (tweak) words */
+
+#define  SKEIN_512_STATE_WORDS ( 8)
+
+#define  SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
+#define  SKEIN_512_STATE_BITS  (64*SKEIN_512_STATE_WORDS)
+#define  SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
+
+typedef struct {
+  size_t  hashBitLen;                        /* size of hash result, in bits */
+  size_t  bCnt;                 /* number of bytes currently buffered in b[] */
+  u64b_t  T[SKEIN_MODIFIER_WORDS]; /* tweak words: T[0]=byte cnt, T[1]=block type + first/final flags */
+} Skein_Ctxt_Hdr_t;
+
+typedef struct {                     /* 512-bit Skein hash context structure */
+  Skein_Ctxt_Hdr_t h;                     /* common header context variables */
+  u64b_t  X[SKEIN_512_STATE_WORDS];                    /* chaining variables */
+  union Skein_512_Ctxt_b_u {
+    u08b_t b[SKEIN_512_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */
+    u64b_t l[SKEIN_512_BLOCK_BYTES/8]; /* same buffer as 64-bit words; l[0] holds the output-stage counter */
+  } bb;
+} Skein_512_Ctxt_t;
+
+/*   Skein APIs for (incremental) "straight hashing" */
+int  Skein_512_Init  (Skein_512_Ctxt_t *ctx);
+int  Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
+int  Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
+int  Skein_512(const u08b_t *msg, size_t msgByteCnt, u08b_t *hashVal);
+
+/*****************************************************************
+** "Internal" Skein definitions
+**    -- not needed for sequential hashing API, but will be
+**           helpful for other uses of Skein (e.g., tree hash mode).
+**    -- included here so that they can be shared between
+**           reference and optimized code.
+******************************************************************/
+
+/* tweak word T[1]: bit field starting positions */
+#define SKEIN_T1_BIT(BIT)       ((BIT) - 64)            /* offset 64 because it's the second word  */
+
+#define SKEIN_T1_POS_BLK_TYPE   SKEIN_T1_BIT(120)       /* bits 120..125: type field               */
+#define SKEIN_T1_POS_FIRST      SKEIN_T1_BIT(126)       /* bits 126     : first block flag         */
+#define SKEIN_T1_POS_FINAL      SKEIN_T1_BIT(127)       /* bit  127     : final block flag         */
+
+/* tweak word T[1]: flag bit definition(s) */
+#define SKEIN_T1_FLAG_FIRST     (((u64b_t)  1 ) << SKEIN_T1_POS_FIRST)
+#define SKEIN_T1_FLAG_FINAL     (((u64b_t)  1 ) << SKEIN_T1_POS_FINAL)
+
+/* tweak word T[1]: block type field */
+#define SKEIN_BLK_TYPE_MSG      (48)              /* message processing */
+#define SKEIN_BLK_TYPE_OUT      (63)                    /* output stage */
+
+#define SKEIN_T1_BLK_TYPE(T)   (((u64b_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)
+#define SKEIN_T1_BLK_TYPE_MSG   SKEIN_T1_BLK_TYPE(MSG) /* message processing */
+#define SKEIN_T1_BLK_TYPE_OUT   SKEIN_T1_BLK_TYPE(OUT)       /* output stage */
+
+#define SKEIN_T1_BLK_TYPE_OUT_FINAL       (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)
+
+#define SKEIN_MK_64(hi32,lo32)  ((lo32) + (((u64b_t) (hi32)) << 32))
+#define SKEIN_KS_PARITY         SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
+
+/*
+**   Skein macros for setting tweak words, etc.
+**/
+#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal)    {(ctxPtr)->h.T[TWK_NUM] = (tVal);}
+
+#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0)
+#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1)
+
+/* set both tweak words at once */
+#define Skein_Set_T0_T1(ctxPtr,T0,T1)         \
+    {                                           \
+    Skein_Set_T0(ctxPtr,(T0));                  \
+    Skein_Set_T1(ctxPtr,(T1));                  \
+    }
+
+/* set up for starting with a new type: h.T[0]=0; h.T[1] = NEW_TYPE; h.bCnt=0; */
+#define Skein_Start_New_Type(ctxPtr,BLK_TYPE)                         \
+  { Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; }
+
+/**************************************************
+** "Internal" Skein definitions for error checking
+***************************************************/
+
+#include <assert.h>
+#define Skein_Assert(x,retCode) { if (!(x)) return retCode; } /*  caller  error */
+#define Skein_assert(x)         assert(x)                     /* internal error */
+
+/*****************************************************************
+** Skein block function constants (shared across Ref and Opt code)
+******************************************************************/
+enum {
+  /* Skein_512 round rotation constants: R_512_<r>_<j> is the rotation used by mix j (0..3) of round r within each group of 8 rounds */
+  R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
+  R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42,
+  R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39,
+  R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56,
+  R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24,
+  R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17,
+  R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43,
+  R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22,
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* ifndef _SKEIN_H_ */
diff --git a/src/util/strb.c b/src/util/strb.c
index b202b5065c..dda9dcdfc2 100644
--- a/src/util/strb.c
+++ b/src/util/strb.c
@@ -1,5 +1,14 @@
-
+#define _CRT_SECURE_NO_WARNINGS
+#include <errno.h>
 #include <stdarg.h>
+#ifdef _MSC_VER
+#include <io.h>
+#define read _read
+#define write _write
+#else
+#include <unistd.h>
+#endif
+
 #include "util/strb.h"
 
 strb *strb_alloc(size_t i) {
@@ -55,3 +64,39 @@ void strb_appendf(strb *sb, const char *f, ...) {
   va_end(ap);
   sb->l += s;
 }
+
+void strb_read(strb *sb, int fd, size_t sz) {
+  char *dst;
+  size_t left;
+  ssize_t got;
+
+  /* reserve room for the whole request before touching the descriptor */
+  if (strb_ensure(sb, sz)) return;
+
+  dst = sb->s + sb->l;
+  sb->l += sz;          /* length is claimed up front; errors are flagged below */
+
+  for (left = sz; left != 0; ) {
+    got = read(fd, dst, left);
+    if (got != -1 && got != 0) {
+      /* partial read: advance and ask for the remainder */
+      dst  += (size_t)got;
+      left -= (size_t)got;
+      continue;
+    }
+    /* transient failures are retried; end of file or a hard error is fatal */
+    if (got == -1 && (errno == EAGAIN || errno == EINTR))
+      continue;
+    strb_seterror(sb);
+    return;
+  }
+}
+
+int strb_write(int fd, strb *sb) {
+  char *cur = sb->s;
+  size_t remaining;
+  ssize_t put;
+
+  for (remaining = sb->l; remaining != 0; ) {
+    put = write(fd, cur, remaining);
+    if (put != -1) {
+      /* short write: skip what went out and send the rest */
+      cur       += (size_t)put;
+      remaining -= (size_t)put;
+      continue;
+    }
+    /* only interrupted or would-block writes are retried */
+    if (errno != EAGAIN && errno != EINTR)
+      return -1;
+  }
+  return 0;
+}
diff --git a/src/util/strb.h b/src/util/strb.h
index b2f18449d7..3289de5796 100644
--- a/src/util/strb.h
+++ b/src/util/strb.h
@@ -39,14 +39,14 @@ typedef struct _strb {
  *
  * Returns NULL on error.
  */
-GPUARRAY_LOCAL strb *strb_alloc(size_t s);
+strb *strb_alloc(size_t s);
 
 /*
  * Frees an strb that was dynamically allocated.
  *
  * Don't call this for stack of global declarations, see strb_clear() instead.
  */
-GPUARRAY_LOCAL void strb_free(strb *);
+void strb_free(strb *sb);
 
 /*
  * Return a pointer to a dynamically allocated strb with a default
@@ -96,7 +96,7 @@ static inline void strb_clear(strb *sb) {
  * This should almost never be called directly.  Use strb_ensure()
  * instead.
  */
-GPUARRAY_LOCAL int strb_grow(strb *, size_t s);
+int strb_grow(strb *sb, size_t s);
 
 /*
  * Make sure there is space to store at least `s` bytes of data after
@@ -146,7 +146,7 @@ static inline void strb_appends(strb *sb, const char *s) {
 /*
  * Appends the content of another strb.
  */
-static inline void strb_appendb(strb *sb, strb *sb2) {
+static inline void strb_appendb(strb *sb, const strb *sb2) {
   strb_appendn(sb, sb2->s, sb2->l);
 }
 
@@ -159,7 +159,24 @@ static inline void strb_appendb(strb *sb, strb *sb2) {
  *
  * A format error will place the strb in error mode.
  */
-GPUARRAY_LOCAL void strb_appendf(strb *, const char *f, ...);
+void strb_appendf(strb *sb, const char *f, ...);
+
+/*
+ * Reads from the file specified by the given file descriptor.
+ *
+ * This will read `sz` bytes from the file descriptor.  Insufficient
+ * data is handled as a read error.
+ *
+ * A read error will place the strb in error mode.
+ */
+void strb_read(strb *sb, int fd, size_t sz);
+
+/*
+ * Write the content of an strb to the specified file descriptor.
+ *
+ * Write errors will be signaled by a nonzero return value.
+ */
+int strb_write(int fd, strb *sb);
 
 /*
  * Returns a C string from the content of the strb.
diff --git a/src/util/xxhash.c b/src/util/xxhash.c
index 58101b0902..438d69e528 100644
--- a/src/util/xxhash.c
+++ b/src/util/xxhash.c
@@ -2,7 +2,7 @@
 xxHash - Fast Hash algorithm
 Copyright (C) 2012-2015, Yann Collet
 
-BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php)
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@@ -31,39 +31,6 @@ You can contact the author at :
 - xxHash source repository : https://github.com/Cyan4973/xxHash
 */
 
-
-/**************************************
-*  Tuning parameters
-**************************************/
-/* XXH_FORCE_MEMORY_ACCESS
- * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
- * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
- * The below switch allow to select different access method for improved performance.
- * Method 0 (default) : use `memcpy()`. Safe and portable.
- * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
- *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
- * Method 2 : direct access. This method is portable but violate C standard.
- *            It can generate buggy code on targets which generate assembly depending on alignment.
- *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
- * See http://stackoverflow.com/a/32095106/646947 for details.
- * Prefer these methods in priority order (0 > 1 > 2)
- */
-#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
-#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
-#    define XXH_FORCE_MEMORY_ACCESS 2
-#  elif defined(__INTEL_COMPILER) || \
-  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
-#    define XXH_FORCE_MEMORY_ACCESS 1
-#  endif
-#endif
-
-/* XXH_ACCEPT_NULL_INPUT_POINTER :
- * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer.
- * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
- * By default, this option is disabled. To enable it, uncomment below define :
- */
-/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */
-
 /* XXH_FORCE_NATIVE_FORMAT :
  * By default, xxHash library provides endian-independant Hash values, based on little-endian convention.
  * Results are therefore identical for little-endian and big-endian CPU.
@@ -72,7 +39,7 @@ You can contact the author at :
  * to improve speed for Big-endian CPU.
  * This option has no impact on Little_Endian CPU.
  */
-#define XXH_FORCE_NATIVE_FORMAT 0
+#define XXH_FORCE_NATIVE_FORMAT 1
 
 /* XXH_USELESS_ALIGN_BRANCH :
  * This is a minor performance trick, only useful with lots of very small keys.
@@ -132,25 +99,6 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcp
 #endif
 
 
-#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
-
-/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
-static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; }
-
-#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
-
-/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
-/* currently only defined for gcc and icc */
-typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign;
-
-static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
-
-#else
-
-/* portable and safe solution. Generally efficient.
- * see : http://stackoverflow.com/a/32095106/646947
- */
-
 static U32 XXH_read32(const void* memPtr)
 {
     U32 val;
@@ -158,8 +106,6 @@ static U32 XXH_read32(const void* memPtr)
     return val;
 }
 
-#endif // XXH_FORCE_DIRECT_MEMORY_ACCESS
-
 
 /******************************************
 *  Compiler-specific Functions and Macros
@@ -243,14 +189,6 @@ FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH
     U32 h32;
 #define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
 
-#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
-    if (p==NULL)
-    {
-        len=0;
-        bEnd=p=(const BYTE*)(size_t)16;
-    }
-#endif
-
     if (len>=16)
     {
         const BYTE* const limit = bEnd - 16;
diff --git a/src/util/xxhash.h b/src/util/xxhash.h
index c33938234d..f88ff81369 100644
--- a/src/util/xxhash.h
+++ b/src/util/xxhash.h
@@ -6,7 +6,7 @@
    Header File
    Copyright (C) 2012-2015, Yann Collet.
 
-   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+   BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php)
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -74,39 +74,11 @@ extern "C" {
 typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
 
 
-/*****************************
-*  Namespace Emulation
-*****************************/
-/* Motivations :
-
-If you need to include xxHash into your library,
-but wish to avoid xxHash symbols to be present on your library interface
-in an effort to avoid potential name collision if another library also includes xxHash,
-
-you can use XXH_NAMESPACE, which will automatically prefix any symbol from xxHash
-with the value of XXH_NAMESPACE (so avoid to keep it NULL, and avoid numeric values).
-
-Note that no change is required within the calling program :
-it can still call xxHash functions using their regular name.
-They will be automatically translated by this header.
-*/
-#ifdef XXH_NAMESPACE
-#  define XXH_CAT(A,B) A##B
-#  define XXH_NAME2(A,B) XXH_CAT(A,B)
-#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
-#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
-#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
-#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
-#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
-#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
-#endif
-
-
 /*****************************
 *  Simple Hash Functions
 *****************************/
 
-GPUARRAY_LOCAL unsigned int XXH32 (const void* input, size_t length, unsigned seed);
+unsigned int XXH32 (const void* input, size_t length, unsigned seed);
 
 /*
 XXH32() :
@@ -129,9 +101,9 @@ These structures allow static allocation of XXH states.
 States must then be initialized using XXH32_reset() before first use.
 */
 
-GPUARRAY_LOCAL XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, unsigned seed);
-GPUARRAY_LOCAL XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
-GPUARRAY_LOCAL unsigned int  XXH32_digest (const XXH32_state_t* statePtr);
+XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, unsigned seed);
+XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+unsigned int  XXH32_digest (const XXH32_state_t* statePtr);
 
 /*
 These functions calculate the xxHash of an input provided in multiple smaller packets,
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 62e27f58bf..97bf15a307 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,23 +1,56 @@
+include(CheckCSourceCompiles)
+include(CheckLibraryExists)
 find_package(PkgConfig)
 
 pkg_search_module(CHECK check)
 
-if(NOT CHECK_FOUND)
-
+if(CHECK_FOUND)
+ if(CHECK_VERSION VERSION_LESS 0.10.0)
+        MESSAGE( "Check version older than 0.10.0" )
+        set(CHECK_FOUND 0)
+ endif()
+else()
   find_path(CHECK_INCLUDE_DIRS check.h)
   find_library(CHECK_LIBRARIES NAMES check)
-
   if(CHECK_INCLUDE_DIRS AND CHECK_LIBRARIES)
     set(CHECK_CFLAGS)
     set(CHECK_LIBRARY_DIRS)
     set(CHECK_FOUND 1)
   endif()
-
+  if(CHECK_FOUND)  # check was located by hand: probe its link deps and API
+    set(CMAKE_REQUIRED_FLAGS "${CHECK_CFLAGS} ${CHECK_LDFLAGS_OTHER}")
+    set(CMAKE_REQUIRED_INCLUDES ${CHECK_INCLUDE_DIRS})
+    CHECK_LIBRARY_EXISTS(pthread pthread_create "" HAVE_PTHREAD)
+    if (HAVE_PTHREAD)  # append each helper library only when it exists
+      set(CHECK_LIBRARIES ${CHECK_LIBRARIES} pthread)
+    endif (HAVE_PTHREAD)
+    CHECK_LIBRARY_EXISTS(rt nanosleep "" HAVE_LIBRT)
+    if (HAVE_LIBRT)
+      set(CHECK_LIBRARIES ${CHECK_LIBRARIES} rt)
+    endif (HAVE_LIBRT)
+    CHECK_LIBRARY_EXISTS(m cos "" HAVE_LIBM)
+    if (HAVE_LIBM)
+      set(CHECK_LIBRARIES ${CHECK_LIBRARIES} m)
+    endif (HAVE_LIBM)
+    set(CMAKE_REQUIRED_LIBRARIES ${CHECK_LIBRARIES})
+    CHECK_C_SOURCE_COMPILES(
+      "#include <check.h>
+       int main() {
+         ck_assert_ptr_ne(NULL, NULL);
+       }"
+      CHECK_FUNCS)
+    if (NOT CHECK_FUNCS)  # the tests need ck_assert_ptr_ne; disable them otherwise
+      set(CHECK_FOUND 0)
+    endif()
+  endif()
 endif()
 
 if(CHECK_FOUND)
 enable_testing()
 
+include_directories("${CMAKE_SOURCE_DIR}/src")
+include_directories("${CMAKE_CURRENT_SOURCE_DIR}")
+
 include_directories(${CHECK_INCLUDE_DIRS})
 link_directories(${CHECK_LIBRARY_DIRS})
 
@@ -29,9 +62,6 @@ foreach(flag ${CHECK_LDFLAGS_OTHER})
   set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${flag}")
 endforeach()
 
-include_directories("${CMAKE_SOURCE_DIR}/src")
-include_directories("${CMAKE_CURRENT_SOURCE_DIR}")
-
 add_executable(check_types main.c check_types.c)
 target_link_libraries(check_types ${CHECK_LIBRARIES} gpuarray)
 add_test(test_types "${CMAKE_CURRENT_BINARY_DIR}/check_types")
@@ -40,10 +70,22 @@ add_executable(check_util main.c check_util.c)
 target_link_libraries(check_util ${CHECK_LIBRARIES} gpuarray)
 add_test(test_util "${CMAKE_CURRENT_BINARY_DIR}/check_util")
 
+add_executable(check_util_integerfactoring main.c check_util_integerfactoring.c)
+target_link_libraries(check_util_integerfactoring ${CHECK_LIBRARIES} gpuarray-static)
+add_test(test_util_integerfactoring "${CMAKE_CURRENT_BINARY_DIR}/check_util_integerfactoring")
+
+add_executable(check_reduction main.c device.c check_reduction.c)
+target_link_libraries(check_reduction ${CHECK_LIBRARIES} gpuarray)
+add_test(test_reduction "${CMAKE_CURRENT_BINARY_DIR}/check_reduction")
+
 add_executable(check_array main.c device.c check_array.c)
 target_link_libraries(check_array ${CHECK_LIBRARIES} gpuarray)
 add_test(test_array "${CMAKE_CURRENT_BINARY_DIR}/check_array")
 
+add_executable(check_blas main.c device.c check_blas.c)
+target_link_libraries(check_blas ${CHECK_LIBRARIES} gpuarray)
+add_test(test_blas "${CMAKE_CURRENT_BINARY_DIR}/check_blas")
+
 add_executable(check_elemwise main.c device.c check_elemwise.c)
 target_link_libraries(check_elemwise ${CHECK_LIBRARIES} gpuarray)
 add_test(test_elemwise "${CMAKE_CURRENT_BINARY_DIR}/check_elemwise")
@@ -56,64 +98,60 @@ add_executable(check_buffer main.c device.c check_buffer.c)
 target_link_libraries(check_buffer ${CHECK_LIBRARIES} gpuarray)
 add_test(test_buffer "${CMAKE_CURRENT_BINARY_DIR}/check_buffer")
 
-if(BUILD_WITH_COLLECTIVES)
-
-  find_package(MPI)
-
-  if(MPI_C_FOUND)
-
-    add_executable(check_buffer_collectives
-      main.c device.c communicator.c check_buffer_collectives.c
-      )
-    target_link_libraries(check_buffer_collectives
-      ${CHECK_LIBRARIES} ${MPI_C_LIBRARIES} gpuarray
-      )
-    target_include_directories(check_buffer_collectives
-      PRIVATE ${MPI_C_INCLUDE_PATH}
-      )
-
-    add_executable(check_collectives
-      main.c device.c communicator.c check_collectives.c
-      )
-    target_link_libraries(check_collectives
-      ${CHECK_LIBRARIES} ${MPI_C_LIBRARIES} gpuarray
-      )
-    target_include_directories(check_collectives
-      PRIVATE ${MPI_C_INCLUDE_PATH}
-      )
-
-    set_target_properties(check_buffer_collectives check_collectives PROPERTIES
-      COMPILE_DEFINITIONS TEST_COLLECTIVES
-      COMPILE_FLAGS "${MPI_C_COMPILE_FLAGS}"
-      LINK_FLAGS "${MPI_C_LINK_FLAGS}"
-      )
-
-    set(_NUM_DEVS $ENV{NUM_DEVS})
-    if(NOT _NUM_DEVS)
-      set(_NUM_DEVS 1)
-    endif()
-
-    set(_DEV_NAMES $ENV{DEV_NAMES})
-    if(NOT _DEV_NAMES)
-      set(_DEV_NAMES "cuda")
-    endif()
-    separate_arguments(_DEV_NAMES)
+find_package(MPI)
+
+if (MPI_C_FOUND)
+
+  add_executable(check_buffer_collectives
+    main.c device.c communicator.c check_buffer_collectives.c
+    )
+  target_link_libraries(check_buffer_collectives
+    ${CHECK_LIBRARIES} ${MPI_C_LIBRARIES} gpuarray
+    )
+  target_include_directories(check_buffer_collectives
+    PRIVATE ${MPI_C_INCLUDE_PATH}
+    )
+
+  add_executable(check_collectives
+    main.c device.c communicator.c check_collectives.c
+    )
+  target_link_libraries(check_collectives
+    ${CHECK_LIBRARIES} ${MPI_C_LIBRARIES} gpuarray
+    )
+  target_include_directories(check_collectives
+    PRIVATE ${MPI_C_INCLUDE_PATH}
+    )
+
+  set_target_properties(check_buffer_collectives check_collectives PROPERTIES
+    COMPILE_DEFINITIONS TEST_COLLECTIVES
+    COMPILE_FLAGS "${MPI_C_COMPILE_FLAGS}"
+    LINK_FLAGS "${MPI_C_LINK_FLAGS}"
+    )
+
+  set(_NUM_DEVS $ENV{NUM_DEVS})
+  if(NOT _NUM_DEVS)
+    set(_NUM_DEVS 1)
+  endif()
 
-    add_test(NAME test_buffer_collectives
-      COMMAND "${MPIEXEC}" ${MPIEXEC_NUMPROC_FLAG} ${_NUM_DEVS} ${MPIEXEC_PREFLAGS}
-      "${CMAKE_CURRENT_BINARY_DIR}/check_buffer_collectives" ${MPIEXEC_POSTFLAGS} ${_DEV_NAMES})
-    add_test(NAME test_collectives
-      COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${_NUM_DEVS} ${MPIEXEC_PREFLAGS}
-      "${CMAKE_CURRENT_BINARY_DIR}/check_collectives" ${MPIEXEC_POSTFLAGS} ${_DEV_NAMES})
+  set(_DEV_NAMES $ENV{DEV_NAMES})
+  if(NOT _DEV_NAMES)
+    set(_DEV_NAMES "cuda")
+  endif()
+  separate_arguments(_DEV_NAMES)
 
-  else(MPI_C_FOUND)
+  add_test(NAME test_buffer_collectives
+    COMMAND "${MPIEXEC}" ${MPIEXEC_NUMPROC_FLAG} ${_NUM_DEVS} ${MPIEXEC_PREFLAGS}
+    "${CMAKE_CURRENT_BINARY_DIR}/check_buffer_collectives" ${MPIEXEC_POSTFLAGS} ${_DEV_NAMES})
+  add_test(NAME test_collectives
+    COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${_NUM_DEVS} ${MPIEXEC_PREFLAGS}
+    "${CMAKE_CURRENT_BINARY_DIR}/check_collectives" ${MPIEXEC_POSTFLAGS} ${_DEV_NAMES})
 
-    message(WARNING "Cannot find MPI")
-    message(WARNING "Checks on collectives and buffer_collectives will not be built or performed.")
+else()
 
-  endif(MPI_C_FOUND)
+  message(WARNING "Cannot find MPI")
+  message(WARNING "Checks on collectives and buffer_collectives will not be built or performed.")
 
-endif(BUILD_WITH_COLLECTIVES)
+endif()
 
 ELSE(CHECK_FOUND)
 
diff --git a/tests/check_array.c b/tests/check_array.c
index 721c156df9..1eeb7ae4c4 100644
--- a/tests/check_array.c
+++ b/tests/check_array.c
@@ -21,38 +21,41 @@ START_TEST(test_take1_ok) {
   GpuArray v;
   GpuArray vidx;
   GpuArray vres;
-  static const uint32_t data[24] = { 0,  1,  2,  3,  4,  5,
-                                     6,  7,  8,  9, 10, 11,
-                                    12, 13, 14, 15, 16, 17,
-                                    18, 19, 20, 21, 22, 23};
+  const uint32_t data[24] = { 0,  1,  2,  3,  4,  5,
+                              6,  7,  8,  9, 10, 11,
+                              12, 13, 14, 15, 16, 17,
+                              18, 19, 20, 21, 22, 23};
   uint32_t buf[12 * 24];
-  static const size_t data_dims[1] = {24};
-  ssize_t indexes[12];
+  const size_t data_dims[1] = {24};
+  long indexes[12];
   size_t dims[3];
 
   ga_assert_ok(GpuArray_empty(&base, ctx, GA_UINT, 1, data_dims, GA_C_ORDER));
   ga_assert_ok(GpuArray_write(&base, data, sizeof(data)));
   dims[0] = 12;
-  ga_assert_ok(GpuArray_empty(&idx, ctx, GA_SSIZE, 1, dims, GA_C_ORDER));
+  ga_assert_ok(GpuArray_empty(&idx, ctx, GA_LONG, 1, dims, GA_C_ORDER));
   dims[1] = 6;
   ga_assert_ok(GpuArray_empty(&res, ctx, GA_UINT, 2, dims, GA_C_ORDER));
 
   /* test v[[1, 0]] on 1d (4) */
   indexes[0] = 1;
   indexes[1] = 0;
-  ga_assert_ok(GpuArray_write(&idx, indexes, sizeof(ssize_t) * 2));
+  ga_assert_ok(GpuArray_write(&idx, indexes, sizeof(long) * 2));
 
   ga_assert_ok(GpuArray_view(&v, &base));
   ga_assert_ok(GpuArray_view(&vidx, &idx));
   ga_assert_ok(GpuArray_view(&vres, &res));
 
   v.dimensions[0] = 4;
+  GpuArray_fix_flags(&v);
 
   vidx.dimensions[0] = 2;
+  GpuArray_fix_flags(&vidx);
 
   vres.nd = 1;
   vres.dimensions[0] = vidx.dimensions[0];
   vres.strides[0] = v.strides[0];
+  GpuArray_fix_flags(&vres);
 
   ga_assert_ok(GpuArray_take1(&vres, &v, &vidx, 0));
   ga_assert_ok(GpuArray_read(buf, sizeof(uint32_t) * 2, &vres));
@@ -75,18 +78,21 @@ START_TEST(test_take1_ok) {
   ga_assert_ok(GpuArray_view(&vres, &res));
 
   vidx.dimensions[0] = 3;
+  GpuArray_fix_flags(&vidx);
 
   dims[0] = 4;
   dims[1] = 6;
   ga_assert_ok(GpuArray_reshape_inplace(&v, 2, dims, GA_ANY_ORDER));
   v.dimensions[1] = 5;
   v.strides[0] = v.dimensions[1] * v.strides[1];
+  GpuArray_fix_flags(&v);
 
   dims[0] = 3;
   dims[1] = 24;
   ga_assert_ok(GpuArray_reshape_inplace(&vres, 2, dims, GA_C_ORDER));
   vres.dimensions[1] = v.dimensions[1];
   vres.strides[0] = v.strides[0];
+  GpuArray_fix_flags(&vres);
 
   ga_assert_ok(GpuArray_take1(&vres, &v, &vidx, 0));
   ga_assert_ok(GpuArray_read(buf, sizeof(uint32_t) * 15, &vres));
@@ -243,12 +249,56 @@ START_TEST(test_take1_ok) {
 }
 END_TEST
 
+START_TEST(test_take1_offset) {
+  const uint32_t data[4] = {0, 1, 2, 3};
+  const size_t data_dims[1] = {4};
+  const size_t out_dims[1] = {2};
+  const uint32_t idx[4] = {20, 3, 3, 2};
+  GpuArray v;
+  GpuArray i;
+  GpuArray r;
+  /* v: the 4-element source array that values are taken from */
+  ga_assert_ok(GpuArray_empty(&v, ctx, GA_UINT, 1, data_dims, GA_C_ORDER));
+  ga_assert_ok(GpuArray_write(&v, data, sizeof(data)));
+  /* i: index buffer; its first entry (20) is out of range for v */
+  ga_assert_ok(GpuArray_empty(&i, ctx, GA_UINT, 1, data_dims, GA_C_ORDER));
+  ga_assert_ok(GpuArray_write(&i, idx, sizeof(idx)));
+  /* r: 2-element destination for the take */
+  ga_assert_ok(GpuArray_empty(&r, ctx, GA_UINT, 1, out_dims, GA_C_ORDER));
+
+  /* Fake subtensor for offset (assuming a byte offset, +8 skips 20 and 3) */
+  i.offset += 8;
+  i.dimensions[0] = 2;
+  GpuArray_fix_flags(&i);
+
+  ga_assert_ok(GpuArray_take1(&r, &v, &i, 1));
+  /* The actual results are not important, this is just to check that
+     we don't trigger the out of bounds check */
+}
+END_TEST
+
+START_TEST(test_reshape_0) {
+  /* This tests that we don't segfault when reshaping 0-sized arrays */
+  const size_t odims[3] = {24, 0, 33};
+  const size_t ndims1[3] = {0, 24, 33};
+  const size_t ndims2[3] = {24, 33, 0};
+  /* Each shape has one zero-length dimension: middle, first, then last */
+  GpuArray v;
+  ga_assert_ok(GpuArray_empty(&v, ctx, GA_FLOAT, 3, odims, GA_C_ORDER));
+  ga_assert_ok(GpuArray_reshape_inplace(&v, 3, ndims1, GA_ANY_ORDER));
+  ga_assert_ok(GpuArray_reshape_inplace(&v, 3, odims, GA_ANY_ORDER));
+  ga_assert_ok(GpuArray_reshape_inplace(&v, 3, ndims2, GA_ANY_ORDER));
+}
+END_TEST
+
 Suite *get_suite(void) {
   Suite *s = suite_create("array");
   TCase *tc = tcase_create("take1");
   tcase_add_checked_fixture(tc, setup, teardown);
   tcase_set_timeout(tc, 8.0);
   tcase_add_test(tc, test_take1_ok);
+  tcase_add_test(tc, test_take1_offset);  /* offset index view vs. bounds check */
+  tcase_add_test(tc, test_reshape_0);  /* NOTE(review): reshape test lives in the "take1" tcase */
   suite_add_tcase(s, tc);
   return s;
 }
diff --git a/tests/check_blas.c b/tests/check_blas.c
new file mode 100644
index 0000000000..fca6b9f0f2
--- /dev/null
+++ b/tests/check_blas.c
@@ -0,0 +1,124 @@
+#include <stdlib.h>
+
+#include <check.h>
+
+#include "gpuarray/array.h"
+#include "gpuarray/blas.h"
+#include "gpuarray/error.h"
+#include "gpuarray/types.h"
+
+extern void *ctx;
+
+void setup(void);
+void teardown(void);
+
+#define ga_assert_ok(e) ck_assert_int_eq(e, GA_NO_ERROR)
+
+static inline void ck_assert_fbuf_eq(const float *b, const float *r,
+                                     unsigned int n) {  /* b: actual, r: reference, n: float count */
+  unsigned int i;
+  for (i = 0; i < n; i++) {  /* element-wise exact ==, no tolerance */
+    ck_assert_msg(b[i] == r[i], "Difference at %u: %f != %f(ref)", i, b[i], r[i]);
+  }
+}
+
+START_TEST(test_gemmBatch_3d_C) {
+  GpuArray A;
+  GpuArray B;
+  GpuArray C;
+  /* Batch of two 3x3 matrices (1..9, row-major); res is each matrix squared */
+  size_t dims[3] = {2, 3, 3};
+  float data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9,
+                  1, 2, 3, 4, 5, 6, 7, 8, 9};
+  const float res[] = {30, 36, 42, 66, 81, 96, 102, 126, 150,
+                       30, 36, 42, 66, 81, 96, 102, 126, 150};
+  /* A, B and C are all allocated in C order with the same shape */
+  ga_assert_ok(GpuArray_empty(&A, ctx, GA_FLOAT, 3, dims, GA_C_ORDER));
+  ga_assert_ok(GpuArray_empty(&B, ctx, GA_FLOAT, 3, dims, GA_C_ORDER));
+  ga_assert_ok(GpuArray_empty(&C, ctx, GA_FLOAT, 3, dims, GA_C_ORDER));
+
+  ga_assert_ok(GpuArray_write(&A, data, sizeof(data)));
+  ga_assert_ok(GpuArray_write(&B, data, sizeof(data)));
+  /* NOTE(review): the 1 and 0 scalars are presumably alpha and beta -- confirm */
+  ga_assert_ok(GpuArray_rgemmBatch_3d(cb_no_trans, cb_no_trans, 1, &A, &B, 0, &C, 1));
+  /* data is reused as the host buffer for reading C back */
+  ga_assert_ok(GpuArray_read(data, sizeof(data), &C));
+
+  ck_assert_fbuf_eq(data, res, sizeof(res)/sizeof(float));
+}
+END_TEST
+
+START_TEST(test_gemmBatch_3d_F) {
+  GpuArray A;
+  GpuArray B;
+  GpuArray C;
+  /* Same shape and input values as the C-order test; only the layouts differ */
+  size_t dims[3] = {2, 3, 3};
+  float data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9,
+                  1, 2, 3, 4, 5, 6, 7, 8, 9};
+  const float res[] = {42, 78, 78, 60, 114, 114, 51, 69, 96,
+                       66, 39, 111, 54, 54, 90, 78, 78, 132};
+  /* A and B are allocated in F order; C stays in C order */
+  ga_assert_ok(GpuArray_empty(&A, ctx, GA_FLOAT, 3, dims, GA_F_ORDER));
+  ga_assert_ok(GpuArray_empty(&B, ctx, GA_FLOAT, 3, dims, GA_F_ORDER));
+  ga_assert_ok(GpuArray_empty(&C, ctx, GA_FLOAT, 3, dims, GA_C_ORDER));
+
+  ga_assert_ok(GpuArray_write(&A, data, sizeof(data)));
+  ga_assert_ok(GpuArray_write(&B, data, sizeof(data)));
+  /* NOTE(review): last argument is 0 here but 1 in the sibling tests -- confirm intent */
+  ga_assert_ok(GpuArray_rgemmBatch_3d(cb_no_trans, cb_no_trans, 1, &A, &B, 0, &C, 0));
+  /* data is reused as the host buffer for reading C back */
+  ga_assert_ok(GpuArray_read(data, sizeof(data), &C));
+
+  ck_assert_fbuf_eq(data, res, sizeof(res)/sizeof(float));
+}
+END_TEST
+
+START_TEST(test_gemmBatch_3d_S) {
+  GpuArray A;
+  GpuArray B;
+  GpuArray C;
+  ssize_t t;
+  /* Same shape and input values as the other gemmBatch tests */
+  size_t dims[3] = {2, 3, 3};
+  float data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9,
+                  1, 2, 3, 4, 5, 6, 7, 8, 9};
+  const float res[] = {14, 32, 50, 50, 122, 194, 32, 77, 122,
+                       26, 62, 98, 17, 53, 89, 44, 107, 170};
+  /* A starts in F order, B and C in C order; strides are edited by hand below */
+  ga_assert_ok(GpuArray_empty(&A, ctx, GA_FLOAT, 3, dims, GA_F_ORDER));
+  ga_assert_ok(GpuArray_empty(&B, ctx, GA_FLOAT, 3, dims, GA_C_ORDER));
+  ga_assert_ok(GpuArray_empty(&C, ctx, GA_FLOAT, 3, dims, GA_C_ORDER));
+
+  ga_assert_ok(GpuArray_write(&A, data, sizeof(data)));
+  ga_assert_ok(GpuArray_write(&B, data, sizeof(data)));
+  /* Give A custom strides (presumably in bytes), then refresh its flags */
+  A.strides[0] = 8;
+  A.strides[1] = 24;
+  A.strides[2] = 4;
+  GpuArray_fix_flags(&A);
+  /* Swap B's last two strides (t is the temporary), then refresh its flags */
+  t = B.strides[1];
+  B.strides[1] = B.strides[2];
+  B.strides[2] = t;
+  GpuArray_fix_flags(&B);
+
+  ga_assert_ok(GpuArray_rgemmBatch_3d(cb_no_trans, cb_no_trans, 1, &A, &B, 0, &C, 1));
+  /* data is reused as the host buffer for reading C back */
+  ga_assert_ok(GpuArray_read(data, sizeof(data), &C));
+
+  ck_assert_fbuf_eq(data, res, sizeof(res)/sizeof(float));
+}
+END_TEST
+
+Suite *get_suite(void) {  /* builds the "blas" suite with a single "all" test case */
+  Suite *s = suite_create("blas");
+  TCase *tc = tcase_create("all");
+  tcase_add_checked_fixture(tc, setup, teardown);  /* setup/teardown are declared above, defined elsewhere */
+  tcase_set_timeout(tc, 16.0);
+  tcase_add_test(tc, test_gemmBatch_3d_C);
+  tcase_add_test(tc, test_gemmBatch_3d_F);
+  tcase_add_test(tc, test_gemmBatch_3d_S);
+  suite_add_tcase(s, tc);
+  return s;
+}
diff --git a/tests/check_buffer.c b/tests/check_buffer.c
index b091a718d6..859b1f0568 100644
--- a/tests/check_buffer.c
+++ b/tests/check_buffer.c
@@ -10,15 +10,6 @@ extern void *ctx;
 void setup(void);
 void teardown(void);
 
-START_TEST(test_gpu_error) {
-  const char *msg;
-  msg = gpucontext_error(NULL, -1);
-  msg = gpucontext_error(NULL, 99);
-  msg = gpucontext_error(NULL, GA_NO_ERROR);
-  ck_assert_str_eq(msg, "No error");
-}
-END_TEST
-
 static unsigned int refcnt(gpudata *b) {
   unsigned int res;
   int err;
@@ -189,7 +180,6 @@ Suite *get_suite(void) {
   Suite *s = suite_create("buffer");
   TCase *tc = tcase_create("API");
   tcase_add_checked_fixture(tc, setup, teardown);
-  tcase_add_test(tc, test_gpu_error);
   tcase_add_test(tc, test_buffer_alloc);
   tcase_add_test(tc, test_buffer_retain_release);
   tcase_add_test(tc, test_buffer_share);
diff --git a/tests/check_buffer_collectives.c b/tests/check_buffer_collectives.c
index 806e2e724e..135b8a76c0 100644
--- a/tests/check_buffer_collectives.c
+++ b/tests/check_buffer_collectives.c
@@ -28,9 +28,9 @@ extern void teardown_comm(void);
 #define ABS_DIFF(a, b) fabs((double)(b - a))
 #define MAX_ABS_DIFF(A, B, N, res)           \
   do {                                       \
-    res = 0;                                 \
     double locdelta;                         \
     int loci;                                \
+    res = 0;                                 \
     for (loci = 0; loci < N; ++loci) {       \
       locdelta = ABS_DIFF(A[loci], B[loci]); \
       if (locdelta > res)                    \
@@ -42,8 +42,8 @@ typedef unsigned long ulong;
 
 #define PRINTV(ar, N, t)           \
   do {                             \
-    printf("%s\n", STR(ar));       \
     int li;                        \
+    printf("%s\n", STR(ar));       \
     for (li = 0; li < (N); ++li) { \
       printf(STR(t) " ", ar[li]);  \
     }                              \
@@ -81,18 +81,21 @@ END_TEST
 
 #define INIT_ARRAYS(insize, outsize)                              \
   int err;                                                        \
-  void* Av = calloc((insize), sizeof(char));                      \
+  void* Av, * RESv, * EXPv;                                       \
+  gpudata* Adev, *RESdev;                                         \
+                                                                  \
+  Av = calloc((insize), sizeof(char));                            \
   if (Av == NULL)                                                 \
     ck_abort_msg("system memory allocation failed");              \
-  void* RESv = calloc((outsize), sizeof(char));                   \
+  RESv = calloc((outsize), sizeof(char));                         \
   if (RESv == NULL)                                               \
     ck_abort_msg("system memory allocation failed");              \
-  void* EXPv = calloc((outsize), sizeof(char));                   \
+  EXPv = calloc((outsize), sizeof(char));                         \
   if (EXPv == NULL)                                               \
     ck_abort_msg("system memory allocation failed");              \
-  gpudata* Adev = gpudata_alloc(ctx, (insize), NULL, 0, &err);    \
+  Adev = gpudata_alloc(ctx, (insize), NULL, 0, &err);             \
   ck_assert_ptr_ne(Adev, NULL);                                   \
-  gpudata* RESdev = gpudata_alloc(ctx, (outsize), NULL, 0, &err); \
+  RESdev = gpudata_alloc(ctx, (outsize), NULL, 0, &err);          \
   ck_assert_ptr_ne(RESdev, NULL);
 
 #define DESTROY_ARRAYS() \
@@ -104,13 +107,15 @@ END_TEST
 
 #define TEST_REDUCE(systype, gatype, mpitype, coloptype, epsilon, print)       \
   START_TEST(test_gpucomm_reduce_##gatype##_##coloptype) {                     \
+    systype* A, * RES, * EXP;                                                  \
+    int i, count;                                                              \
     INIT_ARRAYS(SIZE, SIZE)                                                    \
                                                                                \
-    systype* A = (systype*)Av;                                                 \
-    systype* RES = (systype*)RESv;                                             \
-    systype* EXP = (systype*)EXPv;                                             \
+    A = (systype*)Av;                                                          \
+    RES = (systype*)RESv;                                                      \
+    EXP = (systype*)EXPv;                                                      \
                                                                                \
-    int i, count = SIZE / sizeof(systype);                                     \
+    count = SIZE / sizeof(systype);                                            \
     for (i = 0; i < count; ++i)                                                \
       A[i] = comm_rank + 2;                                                    \
     err = gpudata_write(Adev, 0, A, SIZE);                                     \
@@ -128,9 +133,9 @@ END_TEST
                   "openmpi error: cannot produced expected");                  \
                                                                                \
     if (comm_rank == ROOT_RANK) {                                              \
+      systype res;                                                             \
       err = gpudata_read(RES, RESdev, 0, SIZE);                                \
       ck_assert_int_eq(err, GA_NO_ERROR);                                      \
-      systype res;                                                             \
       MAX_ABS_DIFF(RES, EXP, count, res);                                      \
       if (!(res <= epsilon)) {                                                 \
         print(RES, count);                                                     \
@@ -166,10 +171,6 @@ TEST_REDUCE(int, INT, INT, SUM, 0, PRINTVI)
 TEST_REDUCE(int, INT, INT, PROD, 0, PRINTVI)
 TEST_REDUCE(int, INT, INT, MAX, 0, PRINTVI)
 TEST_REDUCE(int, INT, INT, MIN, 0, PRINTVI)
-TEST_REDUCE(char, BYTE, BYTE, SUM, 0, PRINTVI)
-TEST_REDUCE(char, BYTE, BYTE, PROD, 0, PRINTVI)
-TEST_REDUCE(char, BYTE, BYTE, MAX, 0, PRINTVI)
-TEST_REDUCE(char, BYTE, BYTE, MIN, 0, PRINTVI)
 TEST_REDUCE(float, FLOAT, FLOAT, SUM, EPS, PRINTVF)
 TEST_REDUCE(float, FLOAT, FLOAT, PROD, EPS, PRINTVF)
 TEST_REDUCE(float, FLOAT, FLOAT, MAX, EPS, PRINTVF)
@@ -193,17 +194,20 @@ TEST_REDUCE_FAIL(optype, SIZE / sizeof(int), GA_INT, -1, 0, GA_INVALID_ERROR)
 TEST_REDUCE_FAIL(src_offset, SIZE / sizeof(int), GA_INT, GA_SUM,
                  SIZE - sizeof(int), GA_VALUE_ERROR)
 TEST_REDUCE_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, GA_SUM, 0,
-                 GA_UNSUPPORTED_ERROR)
+                 GA_XLARGE_ERROR)
 
 #define TEST_ALL_REDUCE(systype, gatype, mpitype, coloptype, epsilon, print) \
   START_TEST(test_gpucomm_all_reduce_##gatype##_##coloptype) {               \
+    systype* A, * RES, * EXP;                                                \
+    systype res;                                                             \
+    int i, count;                                                            \
     INIT_ARRAYS(SIZE, SIZE)                                                  \
                                                                              \
-    systype* A = (systype*)Av;                                               \
-    systype* RES = (systype*)RESv;                                           \
-    systype* EXP = (systype*)EXPv;                                           \
+    A = (systype*)Av;                                                        \
+    RES = (systype*)RESv;                                                    \
+    EXP = (systype*)EXPv;                                                    \
                                                                              \
-    int i, count = SIZE / sizeof(systype);                                   \
+    count = SIZE / sizeof(systype);                                          \
     for (i = 0; i < count; ++i)                                              \
       A[i] = comm_rank + 2;                                                  \
     err = gpudata_write(Adev, 0, A, SIZE);                                   \
@@ -222,7 +226,6 @@ TEST_REDUCE_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, GA_SUM, 0,
                                                                              \
     err = gpudata_read(RES, RESdev, 0, SIZE);                                \
     ck_assert_int_eq(err, GA_NO_ERROR);                                      \
-    systype res;                                                             \
     MAX_ABS_DIFF(RES, EXP, count, res);                                      \
     if (!(res <= epsilon)) {                                                 \
       print(RES, count);                                                     \
@@ -258,10 +261,6 @@ TEST_ALL_REDUCE(int, INT, INT, SUM, 0, PRINTVI)
 TEST_ALL_REDUCE(int, INT, INT, PROD, 0, PRINTVI)
 TEST_ALL_REDUCE(int, INT, INT, MAX, 0, PRINTVI)
 TEST_ALL_REDUCE(int, INT, INT, MIN, 0, PRINTVI)
-TEST_ALL_REDUCE(char, BYTE, BYTE, SUM, 0, PRINTVI)
-TEST_ALL_REDUCE(char, BYTE, BYTE, PROD, 0, PRINTVI)
-TEST_ALL_REDUCE(char, BYTE, BYTE, MAX, 0, PRINTVI)
-TEST_ALL_REDUCE(char, BYTE, BYTE, MIN, 0, PRINTVI)
 TEST_ALL_REDUCE(float, FLOAT, FLOAT, SUM, EPS, PRINTVF)
 TEST_ALL_REDUCE(float, FLOAT, FLOAT, PROD, EPS, PRINTVF)
 TEST_ALL_REDUCE(float, FLOAT, FLOAT, MAX, EPS, PRINTVF)
@@ -289,31 +288,36 @@ TEST_ALL_REDUCE_FAIL(src_offset, SIZE / sizeof(int), GA_INT, GA_SUM,
 TEST_ALL_REDUCE_FAIL(dest_offset, SIZE / sizeof(int), GA_INT, GA_SUM, 0,
                      SIZE - sizeof(int), GA_VALUE_ERROR)
 TEST_ALL_REDUCE_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, GA_SUM, 0, 0,
-                     GA_UNSUPPORTED_ERROR)
+                     GA_XLARGE_ERROR)
 
 #define TEST_REDUCE_SCATTER(systype, gatype, mpitype, coloptype, epsilon,    \
                             print)                                           \
   START_TEST(test_gpucomm_reduce_scatter_##gatype##_##coloptype) {           \
+    systype* A, * RES, * EXP;                                                \
+    systype res;                                                             \
+    int i, count;                                                            \
+    int recvcount;                                                           \
+    int* recvcounts;                                                         \
     INIT_ARRAYS(SIZE, SIZE / comm_ndev)                                      \
                                                                              \
-    systype* A = (systype*)Av;                                               \
-    systype* RES = (systype*)RESv;                                           \
-    systype* EXP = (systype*)EXPv;                                           \
+    A = (systype*)Av;                                                        \
+    RES = (systype*)RESv;                                                    \
+    EXP = (systype*)EXPv;                                                    \
                                                                              \
-    int i, count = SIZE / sizeof(systype);                                   \
+    count = SIZE / sizeof(systype);                                          \
     for (i = 0; i < count; ++i)                                              \
       A[i] = comm_rank + 2;                                                  \
     err = gpudata_write(Adev, 0, A, SIZE);                                   \
     ck_assert_int_eq(err, GA_NO_ERROR);                                      \
                                                                              \
-    int recvcount = count / comm_ndev;                                       \
+    recvcount = count / comm_ndev;                                           \
     err = gpucomm_reduce_scatter(Adev, 0, RESdev, 0, recvcount, GA_##gatype, \
                                  GA_##coloptype, comm);                      \
     ck_assert_int_eq(err, GA_NO_ERROR);                                      \
     gpudata_sync(RESdev);                                                    \
     gpudata_sync(Adev);                                                      \
                                                                              \
-    int* recvcounts = (int*)malloc(comm_ndev * sizeof(int));                 \
+    recvcounts = (int*)malloc(comm_ndev * sizeof(int));                      \
     if (recvcounts == NULL)                                                  \
       ck_abort_msg("system memory allocation failed");                       \
     for (i = 0; i < comm_ndev; ++i)                                          \
@@ -326,7 +330,6 @@ TEST_ALL_REDUCE_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, GA_SUM, 0, 0,
                                                                              \
     err = gpudata_read(RES, RESdev, 0, SIZE / comm_ndev);                    \
     ck_assert_int_eq(err, GA_NO_ERROR);                                      \
-    systype res;                                                             \
     MAX_ABS_DIFF(RES, EXP, recvcount, res);                                  \
     if (!(res <= epsilon)) {                                                 \
       print(RES, recvcount);                                                 \
@@ -362,10 +365,6 @@ TEST_REDUCE_SCATTER(int, INT, INT, SUM, 0, PRINTVI)
 TEST_REDUCE_SCATTER(int, INT, INT, PROD, 0, PRINTVI)
 TEST_REDUCE_SCATTER(int, INT, INT, MAX, 0, PRINTVI)
 TEST_REDUCE_SCATTER(int, INT, INT, MIN, 0, PRINTVI)
-TEST_REDUCE_SCATTER(char, BYTE, BYTE, SUM, 0, PRINTVI)
-TEST_REDUCE_SCATTER(char, BYTE, BYTE, PROD, 0, PRINTVI)
-TEST_REDUCE_SCATTER(char, BYTE, BYTE, MAX, 0, PRINTVI)
-TEST_REDUCE_SCATTER(char, BYTE, BYTE, MIN, 0, PRINTVI)
 TEST_REDUCE_SCATTER(float, FLOAT, FLOAT, SUM, EPS, PRINTVF)
 TEST_REDUCE_SCATTER(float, FLOAT, FLOAT, PROD, EPS, PRINTVF)
 TEST_REDUCE_SCATTER(float, FLOAT, FLOAT, MAX, EPS, PRINTVF)
@@ -392,16 +391,19 @@ TEST_REDUCE_SCATTER_FAIL(src_offset, outcount, GA_INT, GA_SUM,
 TEST_REDUCE_SCATTER_FAIL(dest_offset, outcount, GA_INT, GA_SUM, 0,
                          SIZE / comm_ndev - sizeof(int), GA_VALUE_ERROR)
 TEST_REDUCE_SCATTER_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, GA_SUM, 0, 0,
-                         GA_UNSUPPORTED_ERROR)
+                         GA_XLARGE_ERROR)
 
 #define TEST_BROADCAST(systype, gatype, mpitype, epsilon, print)             \
   START_TEST(test_gpucomm_broadcast_##gatype) {                              \
+    systype* RES, * EXP;                                                     \
+    systype res;                                                             \
+    int i, count;                                                            \
     INIT_ARRAYS(SIZE, SIZE)                                                  \
                                                                              \
-    systype* RES = (systype*)RESv;                                           \
-    systype* EXP = (systype*)EXPv;                                           \
+    RES = (systype*)RESv;                                                    \
+    EXP = (systype*)EXPv;                                                    \
                                                                              \
-    int i, count = SIZE / sizeof(systype);                                   \
+    count = SIZE / sizeof(systype);                                          \
     for (i = 0; i < count; ++i) {                                            \
       RES[i] = comm_rank + 1;                                                \
       EXP[i] = RES[i];                                                       \
@@ -419,7 +421,6 @@ TEST_REDUCE_SCATTER_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, GA_SUM, 0, 0,
                                                                              \
     err = gpudata_read(RES, RESdev, 0, SIZE);                                \
     ck_assert_int_eq(err, GA_NO_ERROR);                                      \
-    systype res;                                                             \
     MAX_ABS_DIFF(RES, EXP, count, res);                                      \
     if (!(res <= epsilon)) {                                                 \
       print(RES, count);                                                     \
@@ -459,18 +460,21 @@ TEST_BROADCAST_FAIL(datatype, SIZE / sizeof(int), -1, 0, GA_INVALID_ERROR)
 TEST_BROADCAST_FAIL(src_offset, SIZE / sizeof(int), GA_INT, SIZE - sizeof(int),
                     GA_VALUE_ERROR)
 TEST_BROADCAST_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, 0,
-                    GA_UNSUPPORTED_ERROR)
+                    GA_XLARGE_ERROR)
 
 #define TEST_ALL_GATHER(systype, gatype, mpitype, epsilon, print)             \
   START_TEST(test_gpucomm_all_gather_##gatype) {                              \
+    systype* A, * RES, * EXP;                                                 \
+    systype res;                                                              \
+    int i, count, sendcount;                                                  \
     INIT_ARRAYS(SIZE / comm_ndev, SIZE)                                       \
                                                                               \
-    systype* A = (systype*)Av;                                                \
-    systype* RES = (systype*)RESv;                                            \
-    systype* EXP = (systype*)EXPv;                                            \
+    A = (systype*)Av;                                                         \
+    RES = (systype*)RESv;                                                     \
+    EXP = (systype*)EXPv;                                                     \
                                                                               \
-    int i, count = SIZE / sizeof(systype);                                    \
-    int sendcount = count / comm_ndev;                                        \
+    count = SIZE / sizeof(systype);                                           \
+    sendcount = count / comm_ndev;                                            \
     for (i = 0; i < sendcount; ++i)                                           \
       A[i] = comm_rank + 1;                                                   \
     err = gpudata_write(Adev, 0, A, SIZE / comm_ndev);                        \
@@ -489,7 +493,6 @@ TEST_BROADCAST_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, 0,
                                                                               \
     err = gpudata_read(RES, RESdev, 0, SIZE);                                 \
     ck_assert_int_eq(err, GA_NO_ERROR);                                       \
-    systype res;                                                              \
     MAX_ABS_DIFF(RES, EXP, count, res);                                       \
     if (!(res <= epsilon)) {                                                  \
       print(RES, count);                                                      \
@@ -533,26 +536,35 @@ TEST_ALL_GATHER_FAIL(src_offset, incount, GA_INT,
 TEST_ALL_GATHER_FAIL(dest_offset, incount, GA_INT, 0, SIZE - sizeof(int),
                      GA_VALUE_ERROR)
 TEST_ALL_GATHER_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, 0, 0,
-                     GA_UNSUPPORTED_ERROR)
+                     GA_XLARGE_ERROR)
 
 Suite* get_suite(void) {
-  Suite* s = suite_create("buffer_collectives_API");
-
-  TCase* helps = tcase_create("test_helpers");
+  Suite* s;
+  TCase* helps;
+  TCase* reds;
+  TCase* redf;
+  TCase* areds;
+  TCase* aredf;
+  TCase* redscs;
+  TCase* redscf;
+  TCase* bcasts;
+  TCase* bcastf;
+  TCase* agats;
+  TCase* agatf;
+
+  s = suite_create("buffer_collectives_API");
+
+  helps = tcase_create("test_helpers");
   tcase_add_unchecked_fixture(helps, setup_comm, teardown_comm);
   tcase_add_test(helps, test_gpucomm_get_count);
   tcase_add_test(helps, test_gpucomm_get_rank);
 
-  TCase* reds = tcase_create("test_reduce");
+  reds = tcase_create("test_reduce");
   tcase_add_unchecked_fixture(reds, setup_comm, teardown_comm);
   tcase_add_test(reds, test_gpucomm_reduce_INT_SUM);
   tcase_add_test(reds, test_gpucomm_reduce_INT_PROD);
   tcase_add_test(reds, test_gpucomm_reduce_INT_MAX);
   tcase_add_test(reds, test_gpucomm_reduce_INT_MIN);
-  tcase_add_test(reds, test_gpucomm_reduce_BYTE_SUM);
-  tcase_add_test(reds, test_gpucomm_reduce_BYTE_PROD);
-  tcase_add_test(reds, test_gpucomm_reduce_BYTE_MAX);
-  tcase_add_test(reds, test_gpucomm_reduce_BYTE_MIN);
   tcase_add_test(reds, test_gpucomm_reduce_FLOAT_SUM);
   tcase_add_test(reds, test_gpucomm_reduce_FLOAT_PROD);
   tcase_add_test(reds, test_gpucomm_reduce_FLOAT_MAX);
@@ -570,23 +582,19 @@ Suite* get_suite(void) {
   tcase_add_test(reds, test_gpucomm_reduce_ULONG_MAX);
   tcase_add_test(reds, test_gpucomm_reduce_ULONG_MIN);
 
-  TCase* redf = tcase_create("test_reduce_fail");
+  redf = tcase_create("test_reduce_fail");
   tcase_add_unchecked_fixture(redf, setup_comm, teardown_comm);
   tcase_add_test(redf, test_gpucomm_reduce_fail_datatype);
   tcase_add_test(redf, test_gpucomm_reduce_fail_optype);
   tcase_add_test(redf, test_gpucomm_reduce_fail_src_offset);
   tcase_add_test(redf, test_gpucomm_reduce_fail_elemcount);
 
-  TCase* areds = tcase_create("test_all_reduce");
+  areds = tcase_create("test_all_reduce");
   tcase_add_unchecked_fixture(areds, setup_comm, teardown_comm);
   tcase_add_test(areds, test_gpucomm_all_reduce_INT_SUM);
   tcase_add_test(areds, test_gpucomm_all_reduce_INT_PROD);
   tcase_add_test(areds, test_gpucomm_all_reduce_INT_MAX);
   tcase_add_test(areds, test_gpucomm_all_reduce_INT_MIN);
-  tcase_add_test(areds, test_gpucomm_all_reduce_BYTE_SUM);
-  tcase_add_test(areds, test_gpucomm_all_reduce_BYTE_PROD);
-  tcase_add_test(areds, test_gpucomm_all_reduce_BYTE_MAX);
-  tcase_add_test(areds, test_gpucomm_all_reduce_BYTE_MIN);
   tcase_add_test(areds, test_gpucomm_all_reduce_FLOAT_SUM);
   tcase_add_test(areds, test_gpucomm_all_reduce_FLOAT_PROD);
   tcase_add_test(areds, test_gpucomm_all_reduce_FLOAT_MAX);
@@ -604,7 +612,7 @@ Suite* get_suite(void) {
   tcase_add_test(areds, test_gpucomm_all_reduce_ULONG_MAX);
   tcase_add_test(areds, test_gpucomm_all_reduce_ULONG_MIN);
 
-  TCase* aredf = tcase_create("test_all_reduce_fail");
+  aredf = tcase_create("test_all_reduce_fail");
   tcase_add_unchecked_fixture(aredf, setup_comm, teardown_comm);
   tcase_add_test(aredf, test_gpucomm_all_reduce_fail_datatype);
   tcase_add_test(aredf, test_gpucomm_all_reduce_fail_optype);
@@ -612,16 +620,12 @@ Suite* get_suite(void) {
   tcase_add_test(aredf, test_gpucomm_all_reduce_fail_dest_offset);
   tcase_add_test(aredf, test_gpucomm_all_reduce_fail_elemcount);
 
-  TCase* redscs = tcase_create("test_reduce_scatter");
+  redscs = tcase_create("test_reduce_scatter");
   tcase_add_unchecked_fixture(redscs, setup_comm, teardown_comm);
   tcase_add_test(redscs, test_gpucomm_reduce_scatter_INT_SUM);
   tcase_add_test(redscs, test_gpucomm_reduce_scatter_INT_PROD);
   tcase_add_test(redscs, test_gpucomm_reduce_scatter_INT_MAX);
   tcase_add_test(redscs, test_gpucomm_reduce_scatter_INT_MIN);
-  tcase_add_test(redscs, test_gpucomm_reduce_scatter_BYTE_SUM);
-  tcase_add_test(redscs, test_gpucomm_reduce_scatter_BYTE_PROD);
-  tcase_add_test(redscs, test_gpucomm_reduce_scatter_BYTE_MAX);
-  tcase_add_test(redscs, test_gpucomm_reduce_scatter_BYTE_MIN);
   tcase_add_test(redscs, test_gpucomm_reduce_scatter_FLOAT_SUM);
   tcase_add_test(redscs, test_gpucomm_reduce_scatter_FLOAT_PROD);
   tcase_add_test(redscs, test_gpucomm_reduce_scatter_FLOAT_MAX);
@@ -639,7 +643,7 @@ Suite* get_suite(void) {
   tcase_add_test(redscs, test_gpucomm_reduce_scatter_ULONG_MAX);
   tcase_add_test(redscs, test_gpucomm_reduce_scatter_ULONG_MIN);
 
-  TCase* redscf = tcase_create("test_reduce_scatter_fail");
+  redscf = tcase_create("test_reduce_scatter_fail");
   tcase_add_unchecked_fixture(redscf, setup_comm, teardown_comm);
   tcase_add_test(redscf, test_gpucomm_reduce_scatter_fail_datatype);
   tcase_add_test(redscf, test_gpucomm_reduce_scatter_fail_optype);
@@ -647,7 +651,7 @@ Suite* get_suite(void) {
   tcase_add_test(redscf, test_gpucomm_reduce_scatter_fail_dest_offset);
   tcase_add_test(redscf, test_gpucomm_reduce_scatter_fail_elemcount);
 
-  TCase* bcasts = tcase_create("test_broadcast");
+  bcasts = tcase_create("test_broadcast");
   tcase_add_unchecked_fixture(bcasts, setup_comm, teardown_comm);
   tcase_add_test(bcasts, test_gpucomm_broadcast_INT);
   tcase_add_test(bcasts, test_gpucomm_broadcast_BYTE);
@@ -656,13 +660,13 @@ Suite* get_suite(void) {
   tcase_add_test(bcasts, test_gpucomm_broadcast_LONG);
   tcase_add_test(bcasts, test_gpucomm_broadcast_ULONG);
 
-  TCase* bcastf = tcase_create("test_broadcast_fail");
+  bcastf = tcase_create("test_broadcast_fail");
   tcase_add_unchecked_fixture(bcastf, setup_comm, teardown_comm);
   tcase_add_test(bcastf, test_gpucomm_broadcast_fail_datatype);
   tcase_add_test(bcastf, test_gpucomm_broadcast_fail_src_offset);
   tcase_add_test(bcastf, test_gpucomm_broadcast_fail_elemcount);
 
-  TCase* agats = tcase_create("test_all_gather");
+  agats = tcase_create("test_all_gather");
   tcase_add_unchecked_fixture(agats, setup_comm, teardown_comm);
   tcase_add_test(agats, test_gpucomm_all_gather_INT);
   tcase_add_test(agats, test_gpucomm_all_gather_BYTE);
@@ -671,7 +675,7 @@ Suite* get_suite(void) {
   tcase_add_test(agats, test_gpucomm_all_gather_LONG);
   tcase_add_test(agats, test_gpucomm_all_gather_ULONG);
 
-  TCase* agatf = tcase_create("test_all_gather_fail");
+  agatf = tcase_create("test_all_gather_fail");
   tcase_add_unchecked_fixture(agatf, setup_comm, teardown_comm);
   tcase_add_test(agatf, test_gpucomm_all_gather_fail_datatype);
   tcase_add_test(agatf, test_gpucomm_all_gather_fail_src_offset);
diff --git a/tests/check_collectives.c b/tests/check_collectives.c
index 492f920d0f..4db0614915 100644
--- a/tests/check_collectives.c
+++ b/tests/check_collectives.c
@@ -29,8 +29,8 @@ extern void teardown_comm(void);
 #define _STR(x) #x
 #define COUNT_ERRORS(A, B, M, N, res)           \
   do {                                          \
-    res = 0;                                    \
     int loci, locj;                             \
+    res = 0;                                    \
     for (loci = 0; loci < (M); ++loci) {        \
       for (locj = 0; locj < (N); ++locj) {      \
         if ((A)[loci][locj] != (B)[loci][locj]) \
@@ -45,37 +45,39 @@ extern void teardown_comm(void);
 
 #define INIT_ARRAYS(inrows, incols, outrows, outcols)                        \
   int(*A)[(incols)];                                                         \
+  int(*RES)[(outcols)];                                                      \
+  int(*EXP)[(outcols)];                                                      \
+  size_t indims[ND];                                                         \
+  size_t outdims[ND];                                                        \
+  const ssize_t outstrds[ND] = {sizeof(*RES), sizeof(int)};                  \
+  int err;                                                                   \
+  size_t i, j, outsize;                                                      \
+  GpuArray Adev;                                                             \
+  GpuArray RESdev;                                                           \
+  /* Macro arguments may be expressions, so each use is parenthesised. */    \
   A = (int(*)[(incols)])calloc((inrows), sizeof(*A));                        \
   if (A == NULL)                                                             \
     ck_abort_msg("system memory allocation failed");                         \
-  int(*RES)[(outcols)];                                                      \
   RES = (int(*)[(outcols)])calloc((outrows), sizeof(*RES));                  \
   if (RES == NULL)                                                           \
     ck_abort_msg("system memory allocation failed");                         \
-  int(*EXP)[(outcols)];                                                      \
   EXP = (int(*)[(outcols)])calloc((outrows), sizeof(*EXP));                  \
   if (EXP == NULL)                                                           \
     ck_abort_msg("system memory allocation failed");                         \
-  size_t indims[ND];                                                         \
   indims[0] = (inrows);                                                      \
   indims[1] = (incols);                                                      \
-  size_t outdims[ND];                                                        \
   outdims[0] = (outrows);                                                    \
   outdims[1] = (outcols);                                                    \
-  const ssize_t instrds[ND] = {sizeof(*A), sizeof(int)};                     \
-  const ssize_t outstrds[ND] = {sizeof(*RES), sizeof(int)};                  \
-  size_t outsize = outdims[0] * outstrds[0];                                 \
+  outsize = outdims[0] * outstrds[0];                                        \
                                                                              \
-  size_t i, j;                                                               \
   for (i = 0; i < indims[0]; ++i)                                            \
     for (j = 0; j < indims[1]; ++j)                                          \
       A[i][j] = comm_rank + 2;                                               \
                                                                              \
-  int err;                                                                   \
-  GpuArray Adev;                                                             \
-  err = GpuArray_copy_from_host(&Adev, ctx, A, GA_INT, ND, indims, instrds); \
+  err = GpuArray_empty(&Adev, ctx, GA_INT, ND, indims, GA_C_ORDER);          \
+  ck_assert_int_eq(err, GA_NO_ERROR);                                        \
+  err = GpuArray_write(&Adev, A, sizeof(*A) * (inrows));                     \
   ck_assert_int_eq(err, GA_NO_ERROR);                                        \
-  GpuArray RESdev;                                                           \
   err = GpuArray_empty(&RESdev, ctx, GA_INT, ND, outdims, GA_C_ORDER);       \
   ck_assert_int_eq(err, GA_NO_ERROR);
 
@@ -91,6 +93,7 @@ extern void teardown_comm(void);
  * aligned`.
  */
 START_TEST(test_GpuArray_reduce) {
+  int res;
   INIT_ARRAYS(ROWS, COLS, ROWS, COLS);
 
   if (comm_rank == ROOT_RANK) {
@@ -111,7 +114,6 @@ START_TEST(test_GpuArray_reduce) {
   if (comm_rank == ROOT_RANK) {
     err = GpuArray_read(RES, outsize, &RESdev);
     ck_assert_int_eq(err, GA_NO_ERROR);
-    int res;
     COUNT_ERRORS(RES, EXP, ROWS, COLS, res);
     ck_assert_msg(res == 0,
                   "GpuArray_reduce with %s op produced errors in %d places",
@@ -128,6 +130,7 @@ END_TEST
  * aligned`.
  */
 START_TEST(test_GpuArray_all_reduce) {
+  int res;
   INIT_ARRAYS(ROWS, COLS, ROWS, COLS);
 
   err = GpuArray_all_reduce(&Adev, &RESdev, GA_SUM, comm);
@@ -140,7 +143,6 @@ START_TEST(test_GpuArray_all_reduce) {
 
   err = GpuArray_read(RES, outsize, &RESdev);
   ck_assert_int_eq(err, GA_NO_ERROR);
-  int res;
   COUNT_ERRORS(RES, EXP, ROWS, COLS, res);
   ck_assert_msg(res == 0,
                 "GpuArray_all_reduce with %s op produced errors in %d places",
@@ -155,6 +157,8 @@ END_TEST
  * aligned`.
  */
 START_TEST(test_GpuArray_reduce_scatter) {
+  int res;
+  int* recvcounts;
   // In order for C contiguous arrays to be combined/split successfully they
   // should
   // split along the smallest axis (the one with the bigger stride).
@@ -165,7 +169,7 @@ START_TEST(test_GpuArray_reduce_scatter) {
   GpuArray_sync(&RESdev);
   GpuArray_sync(&Adev);
 
-  int* recvcounts = (int*)malloc(comm_ndev * sizeof(int));
+  recvcounts = (int*)malloc(comm_ndev * sizeof(int));
   if (recvcounts == NULL)
     ck_abort_msg("system memory allocation failed");
   for (i = 0; i < (size_t)comm_ndev; ++i)
@@ -177,7 +181,6 @@ START_TEST(test_GpuArray_reduce_scatter) {
 
   err = GpuArray_read(RES, outsize, &RESdev);
   ck_assert_int_eq(err, GA_NO_ERROR);
-  int res;
   COUNT_ERRORS(RES, EXP, ROWS / comm_ndev, COLS, res);
   ck_assert_msg(
       res == 0,
@@ -192,6 +195,7 @@ END_TEST
  * \note Untested for `not aligned`.
  */
 START_TEST(test_GpuArray_broadcast) {
+  int res;
   INIT_ARRAYS(ROWS, COLS, ROWS, COLS);
 
   for (i = 0; i < indims[0]; ++i)
@@ -207,7 +211,6 @@ START_TEST(test_GpuArray_broadcast) {
 
   err = GpuArray_read(RES, outsize, &Adev);
   ck_assert_int_eq(err, GA_NO_ERROR);
-  int res;
   COUNT_ERRORS(RES, EXP, ROWS, COLS, res);
   ck_assert_msg(res == 0, "GpuArray_broadcast produced errors in %d places",
                 res);
@@ -221,6 +224,7 @@ END_TEST
  * aligned`.
  */
 START_TEST(test_GpuArray_all_gather) {
+  int res;
   // In order for C contiguous arrays to be combined/split successfully they
   // should
   // split along the smallest axis (the one with the bigger stride).
@@ -237,7 +241,6 @@ START_TEST(test_GpuArray_all_gather) {
 
   err = GpuArray_read(RES, outsize, &RESdev);
   ck_assert_int_eq(err, GA_NO_ERROR);
-  int res;
   COUNT_ERRORS(RES, EXP, ROWS, COLS, res);
   ck_assert_msg(res == 0, "GpuArray_all_gather produced errors in %d places",
                 res);
diff --git a/tests/check_elemwise.c b/tests/check_elemwise.c
index d8893b8496..c65142514e 100644
--- a/tests/check_elemwise.c
+++ b/tests/check_elemwise.c
@@ -6,6 +6,26 @@
 #include "gpuarray/error.h"
 #include "gpuarray/types.h"
 
+#if CHECK_MINOR_VERSION < 11
+/* Fallback ck_assert_float_eq() for when CHECK_MINOR_VERSION < 11 (major version not checked). */
+#ifndef CK_FLOATING_DIG
+# define CK_FLOATING_DIG 6
+#endif /* CK_FLOATING_DIG */
+/* Evaluates X and Y once each into TP locals, then asserts "X OP Y"; TM is spliced into the %g conversions. */
+#define _ck_assert_floating(X, OP, Y, TP, TM) do { \
+    TP _ck_x = (X);                                \
+    TP _ck_y = (Y);                                \
+    ck_assert_msg(_ck_x OP _ck_y,                                  \
+                  "Assertion '%s' failed: %s == %.*"TM"g, %s == %.*"TM"g", \
+                  #X" "#OP" "#Y,                                        \
+                  #X, (int)CK_FLOATING_DIG, _ck_x,                      \
+                  #Y, (int)CK_FLOATING_DIG, _ck_y);                     \
+  } while (0)
+/* Exact (==) comparison of two values after conversion to float. */
+#define ck_assert_float_eq(X, Y) _ck_assert_floating(X, ==, Y, float, "")
+#endif /* CHECK_MINOR_VERSION < 11 */
+
+
 extern void *ctx;
 
 void setup(void);
@@ -80,19 +100,18 @@ START_TEST(test_contig_f16) {
   GpuElemwise *ge;
 
   static uint16_t data1[3];
+  static uint16_t data2[3];
+  uint16_t data3[3] = {0};
+  size_t dims[1];
+  gpuelemwise_arg args[3] = {{0}};
+  void *rargs[3];
+
   data1[0] = F16[1];
   data1[1] = F16[2];
   data1[2] = F16[3];
-  static uint16_t data2[3];
   data2[0] = F16[4];
   data2[1] = F16[5];
   data2[2] = F16[6];
-  uint16_t data3[3] = {0};
-
-  size_t dims[1];
-
-  gpuelemwise_arg args[3] = {{0}};
-  void *rargs[3];
 
   dims[0] = 3;
 
@@ -243,19 +262,19 @@ START_TEST(test_basic_f16) {
   GpuElemwise *ge;
 
   static uint16_t data1[3];
+  static uint16_t data2[3];
+  uint16_t data3[3] = {0};
+  size_t dims[2];
+  gpuelemwise_arg args[3] = {{0}};
+  void *rargs[3];
+
   data1[0] = F16[1];
   data1[1] = F16[2];
   data1[2] = F16[3];
-  static uint16_t data2[3];
   data2[0] = F16[4];
   data2[1] = F16[5];
   data2[2] = F16[6];
-  uint16_t data3[3] = {0};
 
-  size_t dims[2];
-
-  gpuelemwise_arg args[3] = {{0}};
-  void *rargs[3];
 
   dims[0] = 1;
   dims[1] = 3;
@@ -322,6 +341,7 @@ START_TEST(test_basic_offset) {
   /* Simulate indexing */
   a.offset = 12;
   a.dimensions[1] = 3;
+  GpuArray_fix_flags(&a);
 
   ga_assert_ok(GpuArray_write(&a, data1, sizeof(data1)));
 
@@ -362,6 +382,132 @@ START_TEST(test_basic_offset) {
 }
 END_TEST
 
+START_TEST(test_basic_scalar) { /* Checks "c = a + x * b" where x is a host-side scalar argument. */
+  GpuArray a;
+  GpuArray b;
+  GpuArray c;
+  uint32_t x = 2; /* passed by address as the GE_SCALAR argument "x" */
+
+  GpuElemwise *ge;
+
+  static const uint32_t data1[3] = {1, 2, 3};
+  static const uint32_t data2[2] = {4, 5};
+  uint32_t data3[6] = {0};
+
+  size_t dims[2];
+
+  gpuelemwise_arg args[4] = {{0}};
+  void *rargs[4];
+  /* a: 1x3, C order, filled from data1. */
+  dims[0] = 1;
+  dims[1] = 3;
+
+  ga_assert_ok(GpuArray_empty(&a, ctx, GA_UINT, 2, dims, GA_C_ORDER));
+  ga_assert_ok(GpuArray_write(&a, data1, sizeof(data1)));
+  /* b: 2x1, F order, filled from data2. */
+  dims[0] = 2;
+  dims[1] = 1;
+
+  ga_assert_ok(GpuArray_empty(&b, ctx, GA_UINT, 2, dims, GA_F_ORDER));
+  ga_assert_ok(GpuArray_write(&b, data2, sizeof(data2)));
+  /* c: 2x3 output (dims[0] is still 2 from above). */
+  dims[1] = 3;
+
+  ga_assert_ok(GpuArray_empty(&c, ctx, GA_UINT, 2, dims, GA_C_ORDER));
+  /* Argument descriptors; their order must match the rargs[] pointers set below. */
+  args[0].name = "a";
+  args[0].typecode = GA_UINT;
+  args[0].flags = GE_READ;
+
+  args[1].name = "x";
+  args[1].typecode = GA_UINT;
+  args[1].flags = GE_SCALAR;
+
+  args[2].name = "b";
+  args[2].typecode = GA_UINT;
+  args[2].flags = GE_READ;
+
+  args[3].name = "c";
+  args[3].typecode = GA_UINT;
+  args[3].flags = GE_WRITE;
+
+  ge = GpuElemwise_new(ctx, "", "c = a + x * b", 4, args, 2, 0);
+
+  ck_assert_ptr_ne(ge, NULL);
+
+  rargs[0] = &a;
+  rargs[1] = &x;
+  rargs[2] = &b;
+  rargs[3] = &c;
+  /* a (1x3) and b (2x1) differ in shape from c (2x3); presumably GE_BROADCAST stretches the size-1 dims. */
+  ga_assert_ok(GpuElemwise_call(ge, rargs, GE_BROADCAST));
+
+  ga_assert_ok(GpuArray_read(data3, sizeof(data3), &c));
+  /* Row 0: {1, 2, 3} + 2 * 4. */
+  ck_assert_int_eq(data3[0], 9);
+  ck_assert_int_eq(data3[1], 10);
+  ck_assert_int_eq(data3[2], 11);
+  /* Row 1: {1, 2, 3} + 2 * 5. */
+  ck_assert_int_eq(data3[3], 11);
+  ck_assert_int_eq(data3[4], 12);
+  ck_assert_int_eq(data3[5], 13);
+}
+END_TEST
+
+START_TEST(test_basic_scalar_dtype) { /* Checks "y = a * x + y" with a float scalar, int input and float in/out array. */
+  GpuArray x;
+  GpuArray y;
+  float a = 1.1f; /* passed by address as the GE_SCALAR argument "a" */
+
+  GpuElemwise *ge;
+
+  static const int32_t data1[4] = {0, 1, 2, 3};
+  static const float data2[4] = {2.0, 2.0, 2.0, 2.0};
+  float data3[4];
+
+  size_t dims[2] = {2, 2};
+
+  gpuelemwise_arg args[3] = {{0}};
+  void *rargs[3];
+  /* x: 2x2 GA_INT, C order, filled from data1. */
+  ga_assert_ok(GpuArray_empty(&x, ctx, GA_INT, 2, dims, GA_C_ORDER));
+  ga_assert_ok(GpuArray_write(&x, data1, sizeof(data1)));
+  /* y: 2x2 GA_FLOAT, F order, every element 2.0; it is both read and written. */
+  ga_assert_ok(GpuArray_empty(&y, ctx, GA_FLOAT, 2, dims, GA_F_ORDER));
+  ga_assert_ok(GpuArray_write(&y, data2, sizeof(data2)));
+  /* Argument descriptors; their order must match the rargs[] pointers set below. */
+  args[0].name = "a";
+  args[0].typecode = GA_FLOAT;
+  args[0].flags = GE_SCALAR;
+
+  args[1].name = "x";
+  args[1].typecode = GA_INT;
+  args[1].flags = GE_READ;
+
+  args[2].name = "y";
+  args[2].typecode = GA_FLOAT;
+  args[2].flags = GE_READ|GE_WRITE;
+
+  ge = GpuElemwise_new(ctx, "", "y = a * x + y", 3, args, 2, 0);
+
+  ck_assert_ptr_ne(ge, NULL);
+
+  rargs[0] = &a;
+  rargs[1] = &x;
+  rargs[2] = &y;
+
+  ga_assert_ok(GpuElemwise_call(ge, rargs, 0));
+
+  ga_assert_ok(GpuArray_read(data3, sizeof(data3), &y));
+  /* Expected 1.1 * x + 2.0; the values imply data3 follows y's F-order layout (data3[1] pairs with x == 2). */
+  ck_assert_float_eq(data3[0], 2.0f);
+  ck_assert_float_eq(data3[1], 4.2f);
+  /* NOTE(review): these are exact float comparisons; confirm device rounding matches the host literals. */
+  ck_assert_float_eq(data3[2], 3.1f);
+  ck_assert_float_eq(data3[3], 5.3f);
+}
+END_TEST
+
 START_TEST(test_basic_remove1) {
   GpuArray a;
   GpuArray b;
@@ -492,6 +638,73 @@ START_TEST(test_basic_broadcast) {
 }
 END_TEST
 
+START_TEST(test_basic_padshape) { /* Checks "c = a + b" when a has fewer dimensions than b and c. */
+  GpuArray a;
+  GpuArray b;
+  GpuArray c;
+
+  GpuElemwise *ge;
+
+  static const uint32_t data1[3] = {1, 2, 3};
+  static const uint32_t data2[2] = {4, 5};
+  uint32_t data3[6] = {0};
+
+  size_t dims[2];
+
+  gpuelemwise_arg args[3] = {{0}};
+  void *rargs[3];
+  /* a: 1-d with 3 elements; nd is 1 in the call below, so dims[1] is not yet set. */
+  dims[0] = 3;
+
+  ga_assert_ok(GpuArray_empty(&a, ctx, GA_UINT, 1, dims, GA_C_ORDER));
+  ga_assert_ok(GpuArray_write(&a, data1, sizeof(data1)));
+  /* b: 2x1, F order, filled from data2. */
+  dims[0] = 2;
+  dims[1] = 1;
+
+  ga_assert_ok(GpuArray_empty(&b, ctx, GA_UINT, 2, dims, GA_F_ORDER));
+  ga_assert_ok(GpuArray_write(&b, data2, sizeof(data2)));
+  /* c: 2x3 output. */
+  dims[0] = 2;
+  dims[1] = 3;
+
+  ga_assert_ok(GpuArray_empty(&c, ctx, GA_UINT, 2, dims, GA_C_ORDER));
+  /* Argument descriptors; their order must match the rargs[] pointers set below. */
+  args[0].name = "a";
+  args[0].typecode = GA_UINT;
+  args[0].flags = GE_READ;
+
+  args[1].name = "b";
+  args[1].typecode = GA_UINT;
+  args[1].flags = GE_READ;
+
+  args[2].name = "c";
+  args[2].typecode = GA_UINT;
+  args[2].flags = GE_WRITE;
+
+  ge = GpuElemwise_new(ctx, "", "c = a + b", 3, args, 2, 0);
+
+  ck_assert_ptr_ne(ge, NULL);
+
+  rargs[0] = &a;
+  rargs[1] = &b;
+  rargs[2] = &c;
+  /* Without GE_BROADCAST and GE_PADSHAPE the mismatched arguments must be rejected with GA_VALUE_ERROR. */
+  ck_assert_int_eq(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE), GA_VALUE_ERROR);
+  /* With both flags the same call must succeed; presumably GE_PADSHAPE pads a's shape up to 2-d. */
+  ga_assert_ok(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE | GE_BROADCAST | GE_PADSHAPE));
+
+  ga_assert_ok(GpuArray_read(data3, sizeof(data3), &c));
+  /* Row 0 is {1, 2, 3} + 4 and row 1 is {1, 2, 3} + 5. */
+  ck_assert_int_eq(data3[0], 5);
+  ck_assert_int_eq(data3[1], 6);
+  ck_assert_int_eq(data3[2], 7);
+  ck_assert_int_eq(data3[3], 6);
+  ck_assert_int_eq(data3[4], 7);
+  ck_assert_int_eq(data3[5], 8);
+}
+END_TEST
+
 START_TEST(test_basic_collapse) {
   GpuArray a;
   GpuArray b;
@@ -680,9 +893,12 @@ Suite *get_suite(void) {
   tcase_add_checked_fixture(tc, setup, teardown);
   tcase_add_test(tc, test_basic_simple);
   tcase_add_test(tc, test_basic_f16);
+  tcase_add_test(tc, test_basic_scalar);
+  tcase_add_test(tc, test_basic_scalar_dtype);
   tcase_add_test(tc, test_basic_offset);
   tcase_add_test(tc, test_basic_remove1);
   tcase_add_test(tc, test_basic_broadcast);
+  tcase_add_test(tc, test_basic_padshape);
   tcase_add_test(tc, test_basic_collapse);
   tcase_add_test(tc, test_basic_neg_strides);
   tcase_add_test(tc, test_basic_0);
diff --git a/tests/check_reduction.c b/tests/check_reduction.c
new file mode 100644
index 0000000000..ca3f231bf4
--- /dev/null
+++ b/tests/check_reduction.c
@@ -0,0 +1,447 @@
+#include <check.h>
+
+#include <gpuarray/buffer.h>
+#include <gpuarray/array.h>
+#include <gpuarray/error.h>
+#include <gpuarray/types.h>
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+
+extern void *ctx;
+
+void setup(void);
+void teardown(void);
+
+
+/* Defines */
+#define ga_assert_ok(e) ck_assert_int_eq(e, GA_NO_ERROR)
+
+
+
+
+/**
+ * PRNG based on PCG XSH RR 64/32 (LCG)
+ * The whole generator state is the file-static pcgS, set by pcgSeed().
+ * Used to generate random data for the kernel tests.
+ */
+
+/* Forward Declarations */
+static       uint32_t pcgRor32 (uint32_t x, uint32_t n);/* Rotate x right by (n mod 32) bits. */
+static       void     pcgSeed  (uint64_t seed);/* Overwrite the generator state. */
+static       uint32_t pcgRand  (void);/* Step the LCG, return 32 scrambled bits. */
+static       double   pcgRand01(void);/* Two pcgRand() draws scaled by 2^-64. */
+/* Definitions */
+static       uint64_t pcgS =                   1;/* State */
+static const uint64_t pcgM = 6364136223846793005;/* Multiplier */
+static const uint64_t pcgA = 1442695040888963407;/* Addend */
+static       uint32_t pcgRor32 (uint32_t x, uint32_t n){
+	return (n &= 0x1F) ? x>>n | x<<(32-n) : x; /* n==0 handled apart: x<<32 is undefined */
+}
+static       void     pcgSeed  (uint64_t seed){
+	pcgS = seed;
+}
+static       uint32_t pcgRand  (void){
+	pcgS = pcgS*pcgM + pcgA;
+	/* The output below is computed from the freshly advanced state. */
+	/**
+	 * PCG does something akin to an unbalanced Feistel round to blind the LCG
+	 * state:
+	 *
+	 * The rightmost 59 bits are involved in an xorshift by 18.
+	 * The leftmost   5 bits select a rotation of the 32 bits 58:27.
+	 */
+	
+	return pcgRor32((pcgS^(pcgS>>18))>>27, pcgS>>59);
+}
+static       double   pcgRand01(void){
+	uint64_t u = pcgRand(), l = pcgRand(); /* u: high 32 bits, l: low 32 bits */
+	uint64_t x = u<<32 | l;
+	return x /18446744073709551616.0; /* divide by 2^64 */
+}
+
+
+/**
+ * Test cases.
+ */
+
+START_TEST(test_reduction){
+	/**
+	 * Max/argmax reduction of a random 32x50x79 float tensor over its first
+	 * and third dimensions, compared against a host-side reference.
+	 */
+
+	GpuArray gaSrc;
+	GpuArray gaMax;
+	GpuArray gaArgmax;
+	size_t i,j,k;
+	size_t dims[3]  = {32,50,79};
+	size_t prodDims = dims[0]*dims[1]*dims[2];
+	const unsigned reduxList[] = {0,2};
+	/* GA_ULONG elements are 64-bit; `unsigned long` is only 32-bit on LLP64. */
+	float *pSrc = calloc(prodDims, sizeof(*pSrc));
+	float *pMax = calloc(dims[1], sizeof(*pMax));
+	uint64_t *pArgmax = calloc(dims[1], sizeof(*pArgmax));
+
+	ck_assert_ptr_ne(pSrc,    NULL);
+	ck_assert_ptr_ne(pMax,    NULL);
+	ck_assert_ptr_ne(pArgmax, NULL);
+
+
+	/**
+	 * Initialize source data (fixed seed, so every run sees the same tensor).
+	 */
+
+	pcgSeed(1);
+	for(i=0;i<prodDims;i++){
+		pSrc[i] = pcgRand01();
+	}
+
+
+	/**
+	 * Run the kernel.
+	 */
+
+	ga_assert_ok(GpuArray_empty(&gaSrc,    ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER));
+	ga_assert_ok(GpuArray_empty(&gaMax,    ctx, GA_FLOAT, 1, &dims[1], GA_C_ORDER));
+	ga_assert_ok(GpuArray_empty(&gaArgmax, ctx, GA_ULONG,  1, &dims[1], GA_C_ORDER));
+
+	ga_assert_ok(GpuArray_write(&gaSrc,    pSrc, sizeof(*pSrc)*prodDims));
+	ga_assert_ok(GpuArray_memset(&gaMax,    -1));  /* 0xFFFFFFFF is a qNaN. */
+	ga_assert_ok(GpuArray_memset(&gaArgmax, -1));
+
+	ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 2, reduxList));
+
+	ga_assert_ok(GpuArray_read(pMax,    sizeof(*pMax)   *dims[1], &gaMax));
+	ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*dims[1], &gaArgmax));
+
+
+	/**
+	 * Host reference: the argmax is flattened over (i,k) as i*dims[2] + k.
+	 */
+
+	for(j=0;j<dims[1];j++){
+		size_t gtArgmax = 0;
+		float  gtMax    = pSrc[(0*dims[1] + j)*dims[2] + 0];
+
+		for(i=0;i<dims[0];i++){
+			for(k=0;k<dims[2];k++){
+				float v = pSrc[(i*dims[1] + j)*dims[2] + k];
+
+				if(v > gtMax){
+					gtMax    = v;
+					gtArgmax = i*dims[2] + k;
+				}
+			}
+		}
+
+		ck_assert_msg(gtMax    == pMax[j],    "Max value mismatch!");
+		ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!");
+	}
+
+	/**
+	 * Deallocate.
+	 */
+
+	free(pSrc);
+	free(pMax);
+	free(pArgmax);
+	GpuArray_clear(&gaSrc);
+	GpuArray_clear(&gaMax);
+	GpuArray_clear(&gaArgmax);
+}END_TEST
+
+START_TEST(test_idxtranspose){
+	/**
+	 * We test here the same reduction as test_reduction, except with a
+	 * reversed reduxList {2,0} instead of {0,2}. That should lead to a
+	 * transposition of the argmax "coordinates" and thus a change in its
+	 * "flattened" output version.
+	 */
+
+	GpuArray gaSrc;
+	GpuArray gaMax;
+	GpuArray gaArgmax;
+	size_t i,j,k;
+	size_t dims[3]     = {32,50,79};
+	size_t prodDims    = dims[0]*dims[1]*dims[2];
+	size_t rdxDims[1]  = {50};
+	size_t rdxProdDims = rdxDims[0];
+	const unsigned reduxList[] = {2,0};
+	/* GA_ULONG elements are 64-bit; `unsigned long` is only 32-bit on LLP64. */
+	float *pSrc = calloc(prodDims, sizeof(*pSrc));
+	float *pMax = calloc(rdxProdDims, sizeof(*pMax));
+	uint64_t *pArgmax = calloc(rdxProdDims, sizeof(*pArgmax));
+
+	ck_assert_ptr_ne(pSrc,    NULL);
+	ck_assert_ptr_ne(pMax,    NULL);
+	ck_assert_ptr_ne(pArgmax, NULL);
+
+
+	/**
+	 * Initialize source data (fixed seed, so every run sees the same tensor).
+	 */
+
+	pcgSeed(1);
+	for(i=0;i<prodDims;i++){
+		pSrc[i] = pcgRand01();
+	}
+
+
+	/**
+	 * Run the kernel.
+	 */
+
+	ga_assert_ok(GpuArray_empty(&gaSrc,    ctx, GA_FLOAT, 3, dims,    GA_C_ORDER));
+	ga_assert_ok(GpuArray_empty(&gaMax,    ctx, GA_FLOAT, 1, rdxDims, GA_C_ORDER));
+	ga_assert_ok(GpuArray_empty(&gaArgmax, ctx, GA_ULONG,  1, rdxDims, GA_C_ORDER));
+
+	ga_assert_ok(GpuArray_write(&gaSrc,    pSrc, sizeof(*pSrc)*prodDims));
+	ga_assert_ok(GpuArray_memset(&gaMax,    -1));  /* 0xFFFFFFFF is a qNaN. */
+	ga_assert_ok(GpuArray_memset(&gaArgmax, -1));
+
+	ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 2, reduxList));
+
+	ga_assert_ok(GpuArray_read(pMax,    sizeof(*pMax)   *rdxProdDims, &gaMax));
+	ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*rdxProdDims, &gaArgmax));
+
+
+	/**
+	 * Host reference: the argmax is flattened over (k,i) as k*dims[0] + i.
+	 */
+
+	for(j=0;j<dims[1];j++){
+		size_t gtArgmax = 0;
+		float  gtMax    = pSrc[(0*dims[1] + j)*dims[2] + 0];
+
+		for(k=0;k<dims[2];k++){
+			for(i=0;i<dims[0];i++){
+				float v = pSrc[(i*dims[1] + j)*dims[2] + k];
+
+				if(v > gtMax){
+					gtMax    = v;
+					gtArgmax = k*dims[0] + i;
+				}
+			}
+		}
+
+		ck_assert_msg(gtMax    == pMax[j],    "Max value mismatch!");
+		ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!");
+	}
+
+	/**
+	 * Deallocate.
+	 */
+
+	free(pSrc);
+	free(pMax);
+	free(pArgmax);
+	GpuArray_clear(&gaSrc);
+	GpuArray_clear(&gaMax);
+	GpuArray_clear(&gaArgmax);
+}END_TEST
+
+START_TEST(test_veryhighrank){
+	/**
+	 * Here we test a reduction of a random 8D tensor on four dimensions.
+	 */
+
+	GpuArray gaSrc;
+	GpuArray gaMax;
+	GpuArray gaArgmax;
+	size_t dstIdx;
+	size_t i,j,k,l,m,n,o,p;
+	size_t dims   [8]  = {1171,373,2,1,2,1,2,1};
+	size_t prodDims    = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7];
+	size_t rdxDims[4]  = {1171,373,1,2};
+	size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3];
+	const unsigned reduxList[] = {2,4,7,5};
+	/* GA_ULONG elements are 64-bit; `unsigned long` is only 32-bit on LLP64. */
+	float *pSrc = calloc(prodDims, sizeof(*pSrc));
+	float *pMax = calloc(rdxProdDims, sizeof(*pMax));
+	uint64_t *pArgmax = calloc(rdxProdDims, sizeof(*pArgmax));
+
+	ck_assert_ptr_ne(pSrc,    NULL);
+	ck_assert_ptr_ne(pMax,    NULL);
+	ck_assert_ptr_ne(pArgmax, NULL);
+
+
+	/**
+	 * Initialize source data (fixed seed, so every run sees the same tensor).
+	 */
+
+	pcgSeed(1);
+	for(i=0;i<prodDims;i++){
+		pSrc[i] = pcgRand01();
+	}
+
+
+	/**
+	 * Run the kernel.
+	 */
+
+	ga_assert_ok(GpuArray_empty(&gaSrc,    ctx, GA_FLOAT, 8, dims,    GA_C_ORDER));
+	ga_assert_ok(GpuArray_empty(&gaMax,    ctx, GA_FLOAT, 4, rdxDims, GA_C_ORDER));
+	ga_assert_ok(GpuArray_empty(&gaArgmax, ctx, GA_ULONG,  4, rdxDims, GA_C_ORDER));
+
+	ga_assert_ok(GpuArray_write(&gaSrc,    pSrc, sizeof(*pSrc)*prodDims));
+	ga_assert_ok(GpuArray_memset(&gaMax,    -1));  /* 0xFFFFFFFF is a qNaN. */
+	ga_assert_ok(GpuArray_memset(&gaArgmax, -1));
+
+	ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 4, reduxList));
+
+	ga_assert_ok(GpuArray_read(pMax,    sizeof(*pMax)   *rdxProdDims, &gaMax));
+	ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*rdxProdDims, &gaArgmax));
+
+
+	/**
+	 * Host reference: kept axes (i,j,l,o) index the output, reduced axes are
+	 * flattened in reduxList order (k,m,p,n).
+	 */
+	for(i=0;i<dims[0];i++){
+		for(j=0;j<dims[1];j++){
+			for(l=0;l<dims[3];l++){
+				for(o=0;o<dims[6];o++){
+					size_t gtArgmax = 0;
+					float  gtMax    = pSrc[(((((((i)*dims[1] + j)*dims[2] + 0)*dims[3] + l)*dims[4] + 0)*dims[5] + 0)*dims[6] + o)*dims[7] + 0];
+
+					for(k=0;k<dims[2];k++){
+						for(m=0;m<dims[4];m++){
+							for(p=0;p<dims[7];p++){
+								for(n=0;n<dims[5];n++){
+									float v = pSrc[(((((((i)*dims[1] + j)*dims[2] + k)*dims[3] + l)*dims[4] + m)*dims[5] + n)*dims[6] + o)*dims[7] + p];
+
+									if(v > gtMax){
+										gtMax    = v;
+										gtArgmax = (((k)*dims[4] + m)*dims[7] + p)*dims[5] + n;
+									}
+								}
+							}
+						}
+					}
+
+					dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o;
+					ck_assert_msg(gtMax    == pMax[dstIdx],    "Max value mismatch!");
+					ck_assert_msg(gtArgmax == pArgmax[dstIdx], "Argmax value mismatch!");
+				}
+			}
+		}
+	}
+
+
+	/**
+	 * Deallocate.
+	 */
+
+	free(pSrc);
+	free(pMax);
+	free(pArgmax);
+	GpuArray_clear(&gaSrc);
+	GpuArray_clear(&gaMax);
+	GpuArray_clear(&gaArgmax);
+}END_TEST
+
+START_TEST(test_alldimsreduced){
+	/**
+	 * We test here a reduction of some random 3D tensor on all dimensions.
+	 */
+
+	GpuArray gaSrc;
+	GpuArray gaMax;
+	GpuArray gaArgmax;
+	size_t i,j,k;
+	size_t dims[3]  = {32,50,79};
+	size_t prodDims = dims[0]*dims[1]*dims[2];
+	const unsigned reduxList[] = {0,1,2};
+	size_t gtArgmax;
+	float  gtMax;
+	/* GA_ULONG elements are 64-bit; `unsigned long` is only 32-bit on LLP64. */
+	float *pSrc    = calloc(prodDims, sizeof(*pSrc));
+	float *pMax    = calloc(1, sizeof(*pMax));
+	uint64_t *pArgmax = calloc(1, sizeof(*pArgmax));
+
+	ck_assert_ptr_ne(pSrc,    NULL);
+	ck_assert_ptr_ne(pMax,    NULL);
+	ck_assert_ptr_ne(pArgmax, NULL);
+
+
+	/**
+	 * Initialize source data (fixed seed, so every run sees the same tensor).
+	 */
+
+	pcgSeed(1);
+	for(i=0;i<prodDims;i++){
+		pSrc[i] = pcgRand01();
+	}
+
+
+	/**
+	 * Run the kernel. Both outputs are 0-dimensional (single-element) arrays.
+	 */
+
+	ga_assert_ok(GpuArray_empty(&gaSrc,    ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER));
+	ga_assert_ok(GpuArray_empty(&gaMax,    ctx, GA_FLOAT, 0, NULL,     GA_C_ORDER));
+	ga_assert_ok(GpuArray_empty(&gaArgmax, ctx, GA_ULONG,  0, NULL,     GA_C_ORDER));
+
+	ga_assert_ok(GpuArray_write(&gaSrc,    pSrc, sizeof(*pSrc)*prodDims));
+	ga_assert_ok(GpuArray_memset(&gaMax,    -1));  /* 0xFFFFFFFF is a qNaN. */
+	ga_assert_ok(GpuArray_memset(&gaArgmax, -1));
+
+	ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 3, reduxList));
+
+	ga_assert_ok(GpuArray_read(pMax,    sizeof(*pMax),    &gaMax));
+	ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax), &gaArgmax));
+
+
+	/**
+	 * Host reference: the argmax is the flat C-order index of the maximum.
+	 */
+
+	gtArgmax = 0;
+	gtMax    = pSrc[0];
+
+	for(i=0;i<dims[0];i++){
+		for(j=0;j<dims[1];j++){
+			for(k=0;k<dims[2];k++){
+				float v = pSrc[(i*dims[1] + j)*dims[2] + k];
+
+				if(v > gtMax){
+					gtMax    = v;
+					gtArgmax = (i*dims[1] + j)*dims[2] + k;
+				}
+			}
+		}
+	}
+
+	ck_assert_msg(gtMax    == pMax[0],    "Max value mismatch!");
+	ck_assert_msg(gtArgmax == pArgmax[0], "Argmax value mismatch!");
+
+	/**
+	 * Deallocate.
+	 */
+
+	free(pSrc);
+	free(pMax);
+	free(pArgmax);
+	GpuArray_clear(&gaSrc);
+	GpuArray_clear(&gaMax);
+	GpuArray_clear(&gaArgmax);
+}END_TEST
+
+Suite *get_suite(void) {
+	Suite *s  = suite_create("reduction");
+	TCase *tc = tcase_create("basic");
+	tcase_add_checked_fixture(tc, setup, teardown);
+	tcase_set_timeout(tc, 15.0);
+	/* Register the four max/argmax reduction tests defined in this file. */
+	tcase_add_test(tc, test_reduction);
+	tcase_add_test(tc, test_idxtranspose);
+	tcase_add_test(tc, test_veryhighrank);
+	tcase_add_test(tc, test_alldimsreduced);
+
+	suite_add_tcase(s, tc);
+	return s;
+}
+
diff --git a/tests/check_util.c b/tests/check_util.c
index 76885f8e8c..bcdde668a5 100644
--- a/tests/check_util.c
+++ b/tests/check_util.c
@@ -118,12 +118,42 @@ START_TEST(test_elemwise_collapse) {
 }
 END_TEST
 
+START_TEST(test_float2half) {
+  const float f[] = {
+    2.9831426e-08f,
+    2e-25f,
+    2e-26f,
+    1.0005035f,
+    1.0002441f,
+    65519.f,
+    65520.f,
+  };
+  const ga_half_t h[] = {
+    {0x0001u}, /* 2^-24 */
+    {0x0000u}, /* 0 */
+    {0x0000u}, /* 0 */
+    {0x3c01u}, /* 1.0 + 2^-10 */
+    {0x3c00u}, /* 1.0 */
+    {0x7bffu}, /* 65504 */
+    {0x7c00u}, /* Inf */
+  };
+  unsigned int i;
+  ga_half_t hr;
+  /* f[i] must convert to exactly the half-precision bit pattern h[i]. */
+  for (i = 0; i < sizeof(f)/sizeof(f[0]); i++) {
+    hr = ga_float2half(f[i]);
+    ck_assert_int_eq(hr.h, h[i].h);
+  }
+}
+END_TEST
+
 Suite *get_suite(void) {
   Suite *s = suite_create("util");
   TCase *tc = tcase_create("All");
   tcase_add_test(tc, test_register_type);
   tcase_add_test(tc, test_type_flags);
   tcase_add_test(tc, test_elemwise_collapse);
+  tcase_add_test(tc, test_float2half);
   suite_add_tcase(s, tc);
   return s;
 }
diff --git a/tests/check_util_integerfactoring.c b/tests/check_util_integerfactoring.c
new file mode 100644
index 0000000000..08d1b17869
--- /dev/null
+++ b/tests/check_util_integerfactoring.c
@@ -0,0 +1,471 @@
+/* Includes */
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <check.h>
+#include <gpuarray/util.h>
+#include "util/integerfactoring.h"
+
+
+/**
+ * Primality Checker
+ */
+
+START_TEST(test_primalitychecker){
+	/* Tiny numbers: every integer from 0 to 20 */
+	ck_assert(!gaIIsPrime(                   0ULL));
+	ck_assert(!gaIIsPrime(                   1ULL));
+	ck_assert( gaIIsPrime(                   2ULL));
+	ck_assert( gaIIsPrime(                   3ULL));
+	ck_assert(!gaIIsPrime(                   4ULL));
+	ck_assert( gaIIsPrime(                   5ULL));
+	ck_assert(!gaIIsPrime(                   6ULL));
+	ck_assert( gaIIsPrime(                   7ULL));
+	ck_assert(!gaIIsPrime(                   8ULL));
+	ck_assert(!gaIIsPrime(                   9ULL));
+	ck_assert(!gaIIsPrime(                  10ULL));
+	ck_assert( gaIIsPrime(                  11ULL));
+	ck_assert(!gaIIsPrime(                  12ULL));
+	ck_assert( gaIIsPrime(                  13ULL));
+	ck_assert(!gaIIsPrime(                  14ULL));
+	ck_assert(!gaIIsPrime(                  15ULL));
+	ck_assert(!gaIIsPrime(                  16ULL));
+	ck_assert( gaIIsPrime(                  17ULL));
+	ck_assert(!gaIIsPrime(                  18ULL));
+	ck_assert( gaIIsPrime(                  19ULL));
+	ck_assert(!gaIIsPrime(                  20ULL));
+	/* Small primes */
+	ck_assert( gaIIsPrime(                4987ULL));
+	ck_assert( gaIIsPrime(                4993ULL));
+	ck_assert( gaIIsPrime(                4999ULL));
+	/* Squares of the small primes above: 4987^2, 4993^2, 4999^2 */
+	ck_assert(!gaIIsPrime(            24870169ULL));
+	ck_assert(!gaIIsPrime(            24930049ULL));
+	ck_assert(!gaIIsPrime(            24990001ULL));
+	/* Catalan pseudoprimes */
+	ck_assert(!gaIIsPrime(                5907ULL));
+	ck_assert(!gaIIsPrime(             1194649ULL));
+	ck_assert(!gaIIsPrime(            12327121ULL));
+	/* Fermat base-2 pseudoprimes */
+	ck_assert(!gaIIsPrime(                 341ULL));
+	ck_assert(!gaIIsPrime(                 561ULL));
+	ck_assert(!gaIIsPrime(                 645ULL));
+	ck_assert(!gaIIsPrime(                1105ULL));
+	ck_assert(!gaIIsPrime(                1387ULL));
+	ck_assert(!gaIIsPrime(                1729ULL));
+	ck_assert(!gaIIsPrime(                1905ULL));
+	ck_assert(!gaIIsPrime(                2047ULL));
+	ck_assert(!gaIIsPrime(                2465ULL));
+	ck_assert(!gaIIsPrime(              486737ULL));
+	/* Strong Lucas pseudoprimes. NOTE(review): 5459 is asserted three times. */
+	ck_assert(!gaIIsPrime(                5459ULL));
+	ck_assert(!gaIIsPrime(                5459ULL));
+	ck_assert(!gaIIsPrime(                5459ULL));
+	ck_assert(!gaIIsPrime(                5777ULL));
+	ck_assert(!gaIIsPrime(               10877ULL));
+	ck_assert(!gaIIsPrime(               16109ULL));
+	ck_assert(!gaIIsPrime(               18971ULL));
+	ck_assert(!gaIIsPrime(               22499ULL));
+	ck_assert(!gaIIsPrime(               24569ULL));
+	ck_assert(!gaIIsPrime(               25199ULL));
+	ck_assert(!gaIIsPrime(               40309ULL));
+	ck_assert(!gaIIsPrime(               58519ULL));
+	ck_assert(!gaIIsPrime(               75077ULL));
+	ck_assert(!gaIIsPrime(               97439ULL));
+	ck_assert(!gaIIsPrime(              100127ULL));
+	ck_assert(!gaIIsPrime(              113573ULL));
+	ck_assert(!gaIIsPrime(              115639ULL));
+	ck_assert(!gaIIsPrime(              130139ULL));
+	/* Medium, prime. */
+	ck_assert( gaIIsPrime(          2100000011ULL));
+	ck_assert( gaIIsPrime(          2100000017ULL));
+	/* Large, non-smooth, composite (= 1299817*1299821*1299827) */
+	ck_assert(!gaIIsPrime( 2196095973992233039ULL));
+	/* Largest prime < 2**64: */
+	ck_assert( gaIIsPrime(18446744073709551557ULL));
+	/* Largest 64-bit integers: 2**64-3, 2**64-2, 2**64-1 */
+	ck_assert(!gaIIsPrime(18446744073709551613ULL));
+	ck_assert(!gaIIsPrime(18446744073709551614ULL));
+	ck_assert(!gaIIsPrime(18446744073709551615ULL));
+}END_TEST
+
+/**
+ * Integer Factorization test
+ */
+
+START_TEST(test_integerfactorization){
+	ga_factor_list fl;
+	uint64_t       n;
+	/* Throughout: gaIFactorize() != 0 is the expected-PASS case, == 0 expected-FAIL. */
+	/**
+	 * Attempt exact factorization for 2^64-1, no k-smoothness constraint.
+	 * Expected PASS with 3*5*17*257*641*65537*6700417
+	 */
+
+	n = 18446744073709551615ULL;
+	ck_assert_int_ne (gaIFactorize(n,         0,     0, &fl), 0);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,                    3ULL),  1);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,                    5ULL),  1);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,                   17ULL),  1);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,                  257ULL),  1);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,                  641ULL),  1);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,                65537ULL),  1);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,              6700417ULL),  1);
+	ck_assert_uint_eq(gaIFLGetProduct(&fl), n);
+
+	/**
+	 * Attempt exact factorization for 2^64-1, 4096-smooth constraint.
+	 * Expected FAIL, because 2^64-1 possesses prime factors in excess of 4096.
+	 */
+
+	n = 18446744073709551615ULL;
+	ck_assert_int_eq (gaIFactorize(n,         0,  4096, &fl), 0);
+
+	/**
+	 * Attempt approximate factorization for 2^64-1, no k-smoothness constraint.
+	 * Unlimited growth permitted.
+	 * Expected PASS, since 2^64-1 rounds up to 2^64 and 2^64 trivially factorizes.
+	 */
+
+	n = 18446744073709551615ULL;
+	ck_assert_int_ne (gaIFactorize(n,        -1,     0, &fl), 0);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,                    2ULL), 64);
+	ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 2);
+	ck_assert_int_ne (gaIFLIsOverflowed(&fl), 0);
+
+	/**
+	 * Attempt exact factorization for 2196095973992233039, no k-smoothness constraint.
+	 * 2196095973992233039 is a large, highly non-smooth number, with three enormous
+	 * factors.
+	 * Expected PASS *very quickly*, since it factorizes as 1299817*1299821*1299827
+	 */
+
+	n =  2196095973992233039ULL;
+	ck_assert_int_ne (gaIFactorize(n,         0,     0, &fl), 0);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,              1299817ULL),  1);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,              1299821ULL),  1);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,              1299827ULL),  1);
+	ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 1299827);
+	ck_assert_uint_eq(gaIFLGetProduct(&fl), n);
+
+	/**
+	 * Attempt approximate factorization for 2196095973992233039, 16-smooth constraint.
+	 * 2196095973992233039 is a large, highly non-smooth number, with three enormous
+	 * factors. It is not 16-smooth, so code paths that attempt approximate
+	 * factorization within the growth limits (.005%) are exercised.
+	 *
+	 * Expected PASS *relatively quickly*.
+	 */
+
+	n =  2196095973992233039ULL;
+	ck_assert_int_ne (gaIFactorize(n, n*1.00005,    16, &fl), 0);
+	ck_assert_uint_ge(gaIFLGetProduct(&fl), n);
+	ck_assert_uint_le(gaIFLGetProduct(&fl), n*1.00005);
+
+	/**
+	 * Attempt exact factorization of 7438473388800000000, 5-smooth constraint.
+	 * It is a large, 5-smooth number. This should exercise the 5-smooth
+	 * factorization path.
+	 */
+
+	n =  7438473388800000000ULL;
+	ck_assert_int_ne (gaIFactorize(n,         0,     5, &fl), 0);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,                    2ULL), 14);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,                    3ULL), 19);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,                    5ULL),  8);
+	ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 5);
+	ck_assert_uint_eq(gaIFLGetProduct(&fl), n);
+
+	/**
+	 * Attempt approximate factorization of 7438473388799999997, 2-smooth constraint.
+	 * It is a large, non-smooth number. This should exercise the optimal 2-smooth
+	 * factorizer in spite of the available, unlimited slack.
+	 */
+
+	n =  7438473388799999997ULL;
+	ck_assert_int_ne (gaIFactorize(n,        -1,      2, &fl), 0);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,                    2ULL), 63);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,                    3ULL),  0);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,                    5ULL),  0);
+	ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 2);
+	ck_assert_uint_eq(gaIFLGetProduct(&fl),  9223372036854775808ULL);
+
+	/**
+	 * Attempt approximate factorization of 7438473388799999997, 3-smooth constraint.
+	 * It is a large, non-smooth number. This should exercise the optimal 3-smooth
+	 * factorizer in spite of the available, unlimited slack.
+	 */
+
+	n =  7438473388799999997ULL;
+	ck_assert_int_ne (gaIFactorize(n,        -1,      3, &fl), 0);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,                    2ULL), 31);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,                    3ULL), 20);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,                    5ULL),  0);
+	ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 3);
+	ck_assert_uint_eq(gaIFLGetProduct(&fl),  7487812485248974848ULL);
+
+	/**
+	 * Attempt approximate factorization of 7438473388799999997, 5-smooth constraint.
+	 * It is a large, non-smooth number, but 3 integers above it is a 5-smooth
+	 * integer, 7438473388800000000. This should exercise the optimal 5-smooth
+	 * factorizer in spite of the available, unlimited slack.
+	 */
+
+	n =  7438473388799999997ULL;
+	ck_assert_int_ne (gaIFactorize(n,        -1,     5, &fl), 0);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,                    2ULL), 14);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,                    3ULL), 19);
+	ck_assert_int_eq (gaIFLGetFactorPower(&fl,                    5ULL),  8);
+	ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 5);
+	ck_assert_uint_eq(gaIFLGetProduct(&fl), 7438473388800000000ULL);
+
+	/**
+	 * Toughest challenge: Attempt very tight approximate factorization of
+	 * 9876543210987654321 with .01% slack and 43-smooth constraint.
+	 *
+	 * This forces a bypass of the optimal 5-smooth factorizers and heavily
+	 * exercises the nextI:, subfactorize:, primetest: and newX jumps and
+	 * calculations.
+	 *
+	 * Expected PASS, "reasonably fast".
+	 */
+
+	n =  9876543210987654321ULL;
+	ck_assert_int_ne (gaIFactorize(n, n*1.0001,    43, &fl), 0);
+	ck_assert_uint_ge(gaIFLGetProduct(&fl), n);
+	ck_assert_uint_le(gaIFLGetProduct(&fl), n*1.0001);
+	ck_assert_uint_le(gaIFLGetGreatestFactor(&fl), 43);
+}END_TEST
+
+START_TEST(test_scheduler){
+	/* We use here the CUDA limits of a CC 3.0 GPU as an example. */
+	uint64_t maxBTot  =       1024, maxBInd[] = {      1024,      1024,        64},
+	         maxGTot  = 0xFFFFFFFF, maxGInd[] = {2147483647,     65535,     65535},
+	         warpSize =         32;
+
+	int                warpAxis;
+	uint64_t           dims[3];
+	ga_factor_list     factBS[3], factGS[3], factCS[3];
+	unsigned long long intbBS[3], intbGS[3], intbCS[3];
+	unsigned long long intaBS[3], intaGS[3], intaCS[3];
+
+	/**
+	 * NOTE: If you want to view befores-and-afters of scheduling, #define PRINT
+	 *       to something non-0.
+	 */
+#define PRINT 0
+
+	/**
+	 *
+	 * Testcase: (895,1141,923) job, warpSize on axis 0.
+	 *
+	 */
+
+	{
+		warpAxis       =          0;
+		dims[0]        =        895;
+		dims[1]        =       1141;
+		dims[2]        =        923;
+		dims[warpAxis] = (dims[warpAxis]+warpSize-1) / warpSize;
+		/* NOTE(review): GS and CS lists are factorized against maxBInd, not maxGInd - confirm. */
+		/**
+		 * Factorization job must be successful.
+		 */
+
+		ck_assert(gaIFactorize(warpAxis==0?warpSize:1,           0, maxBInd[0], factBS+0));
+		ck_assert(gaIFactorize(warpAxis==1?warpSize:1,           0, maxBInd[1], factBS+1));
+		ck_assert(gaIFactorize(warpAxis==2?warpSize:1,           0, maxBInd[2], factBS+2));
+		ck_assert(gaIFactorize(                     1,           0, maxBInd[0], factGS+0));
+		ck_assert(gaIFactorize(                     1,           0, maxBInd[1], factGS+1));
+		ck_assert(gaIFactorize(                     1,           0, maxBInd[2], factGS+2));
+		ck_assert(gaIFactorize(               dims[0], dims[0]*1.1, maxBInd[0], factCS+0));
+		ck_assert(gaIFactorize(               dims[1], dims[1]*1.1, maxBInd[1], factCS+1));
+		ck_assert(gaIFactorize(               dims[2], dims[2]*1.1, maxBInd[2], factCS+2));
+
+		intbBS[0] = gaIFLGetProduct(factBS+0);
+		intbBS[1] = gaIFLGetProduct(factBS+1);
+		intbBS[2] = gaIFLGetProduct(factBS+2);
+		intbGS[0] = gaIFLGetProduct(factGS+0);
+		intbGS[1] = gaIFLGetProduct(factGS+1);
+		intbGS[2] = gaIFLGetProduct(factGS+2);
+		intbCS[0] = gaIFLGetProduct(factCS+0);
+		intbCS[1] = gaIFLGetProduct(factCS+1);
+		intbCS[2] = gaIFLGetProduct(factCS+2);
+
+		/**
+		 * Ensure that factorization only *increases* the size of the problem.
+		 */
+
+		ck_assert_uint_ge(intbCS[0], dims[0]);
+		ck_assert_uint_ge(intbCS[1], dims[1]);
+		ck_assert_uint_ge(intbCS[2], dims[2]);
+
+
+		/**
+		 * Run scheduler.
+		 */
+
+#if PRINT
+		printf("Before:\n");
+		printf("BS: (%6llu, %6llu, %6llu)\n", intbBS[0], intbBS[1], intbBS[2]);
+		printf("GS: (%6llu, %6llu, %6llu)\n", intbGS[0], intbGS[1], intbGS[2]);
+		printf("CS: (%6llu, %6llu, %6llu)\n", intbCS[0], intbCS[1], intbCS[2]);
+#endif
+		gaIFLSchedule(3, maxBTot, maxBInd, maxGTot, maxGInd, factBS, factGS, factCS);
+		intaBS[0] = gaIFLGetProduct(factBS+0);
+		intaBS[1] = gaIFLGetProduct(factBS+1);
+		intaBS[2] = gaIFLGetProduct(factBS+2);
+		intaGS[0] = gaIFLGetProduct(factGS+0);
+		intaGS[1] = gaIFLGetProduct(factGS+1);
+		intaGS[2] = gaIFLGetProduct(factGS+2);
+		intaCS[0] = gaIFLGetProduct(factCS+0);
+		intaCS[1] = gaIFLGetProduct(factCS+1);
+		intaCS[2] = gaIFLGetProduct(factCS+2);
+#if PRINT
+		printf("After:\n");
+		printf("BS: (%6llu, %6llu, %6llu)\n", intaBS[0], intaBS[1], intaBS[2]);
+		printf("GS: (%6llu, %6llu, %6llu)\n", intaGS[0], intaGS[1], intaGS[2]);
+		printf("CS: (%6llu, %6llu, %6llu)\n", intaCS[0], intaCS[1], intaCS[2]);
+#endif
+
+		/**
+		 * Scheduling is only about moving factors between block/grid/chunk factor
+		 * lists. Therefore, the three dimensions must not have changed size.
+		 */
+
+		ck_assert_uint_eq(intbBS[0]*intbGS[0]*intbCS[0], intaBS[0]*intaGS[0]*intaCS[0]);
+		ck_assert_uint_eq(intbBS[1]*intbGS[1]*intbCS[1], intaBS[1]*intaGS[1]*intaCS[1]);
+		ck_assert_uint_eq(intbBS[2]*intbGS[2]*intbCS[2], intaBS[2]*intaGS[2]*intaCS[2]);
+
+		/**
+		 * Verify that the individual limits and global limits on threads in a
+		 * block and blocks in a grid are met.
+		 */
+
+		ck_assert_uint_le(intaBS[0],                     maxBInd[0]);
+		ck_assert_uint_le(intaBS[1],                     maxBInd[1]);
+		ck_assert_uint_le(intaBS[2],                     maxBInd[2]);
+		ck_assert_uint_le(intaGS[0],                     maxGInd[0]);
+		ck_assert_uint_le(intaGS[1],                     maxGInd[1]);
+		ck_assert_uint_le(intaGS[2],                     maxGInd[2]);
+		ck_assert_uint_le(intaBS[0]*intaBS[1]*intaBS[2], maxBTot);
+		ck_assert_uint_le(intaGS[0]*intaGS[1]*intaGS[2], maxGTot);
+	}
+
+
+	/**
+	 *
+	 * Testcase: (1,1,121632959) job, warpSize on axis 2.
+	 *
+	 */
+
+	{
+		warpAxis       =         2;
+		dims[0]        =         1;
+		dims[1]        =         1;
+		dims[2]        = 121632959;
+		dims[warpAxis] = (dims[warpAxis]+warpSize-1) / warpSize;
+		/* NOTE(review): GS and CS lists are factorized against maxBInd, not maxGInd - confirm. */
+		/**
+		 * Factorization job must be successful.
+		 */
+
+		ck_assert(gaIFactorize(warpAxis==0?warpSize:1,           0, maxBInd[0], factBS+0));
+		ck_assert(gaIFactorize(warpAxis==1?warpSize:1,           0, maxBInd[1], factBS+1));
+		ck_assert(gaIFactorize(warpAxis==2?warpSize:1,           0, maxBInd[2], factBS+2));
+		ck_assert(gaIFactorize(                     1,           0, maxBInd[0], factGS+0));
+		ck_assert(gaIFactorize(                     1,           0, maxBInd[1], factGS+1));
+		ck_assert(gaIFactorize(                     1,           0, maxBInd[2], factGS+2));
+		ck_assert(gaIFactorize(               dims[0], dims[0]*1.1, maxBInd[0], factCS+0));
+		ck_assert(gaIFactorize(               dims[1], dims[1]*1.1, maxBInd[1], factCS+1));
+		ck_assert(gaIFactorize(               dims[2], dims[2]*1.1, maxBInd[2], factCS+2));
+
+		intbBS[0] = gaIFLGetProduct(factBS+0);
+		intbBS[1] = gaIFLGetProduct(factBS+1);
+		intbBS[2] = gaIFLGetProduct(factBS+2);
+		intbGS[0] = gaIFLGetProduct(factGS+0);
+		intbGS[1] = gaIFLGetProduct(factGS+1);
+		intbGS[2] = gaIFLGetProduct(factGS+2);
+		intbCS[0] = gaIFLGetProduct(factCS+0);
+		intbCS[1] = gaIFLGetProduct(factCS+1);
+		intbCS[2] = gaIFLGetProduct(factCS+2);
+
+		/**
+		 * Ensure that factorization only *increases* the size of the problem.
+		 */
+
+		ck_assert_uint_ge(intbCS[0], dims[0]);
+		ck_assert_uint_ge(intbCS[1], dims[1]);
+		ck_assert_uint_ge(intbCS[2], dims[2]);
+
+
+		/**
+		 * Run scheduler.
+		 */
+
+#if PRINT
+		printf("Before:\n");
+		printf("BS: (%6llu, %6llu, %6llu)\n", intbBS[0], intbBS[1], intbBS[2]);
+		printf("GS: (%6llu, %6llu, %6llu)\n", intbGS[0], intbGS[1], intbGS[2]);
+		printf("CS: (%6llu, %6llu, %6llu)\n", intbCS[0], intbCS[1], intbCS[2]);
+#endif
+		gaIFLSchedule(3, maxBTot, maxBInd, maxGTot, maxGInd, factBS, factGS, factCS);
+		intaBS[0] = gaIFLGetProduct(factBS+0);
+		intaBS[1] = gaIFLGetProduct(factBS+1);
+		intaBS[2] = gaIFLGetProduct(factBS+2);
+		intaGS[0] = gaIFLGetProduct(factGS+0);
+		intaGS[1] = gaIFLGetProduct(factGS+1);
+		intaGS[2] = gaIFLGetProduct(factGS+2);
+		intaCS[0] = gaIFLGetProduct(factCS+0);
+		intaCS[1] = gaIFLGetProduct(factCS+1);
+		intaCS[2] = gaIFLGetProduct(factCS+2);
+#if PRINT
+		printf("After:\n");
+		printf("BS: (%6llu, %6llu, %6llu)\n", intaBS[0], intaBS[1], intaBS[2]);
+		printf("GS: (%6llu, %6llu, %6llu)\n", intaGS[0], intaGS[1], intaGS[2]);
+		printf("CS: (%6llu, %6llu, %6llu)\n", intaCS[0], intaCS[1], intaCS[2]);
+#endif
+
+		/**
+		 * Scheduling is only about moving factors between block/grid/chunk factor
+		 * lists. Therefore, the three dimensions must not have changed size.
+		 */
+
+		ck_assert_uint_eq(intbBS[0]*intbGS[0]*intbCS[0], intaBS[0]*intaGS[0]*intaCS[0]);
+		ck_assert_uint_eq(intbBS[1]*intbGS[1]*intbCS[1], intaBS[1]*intaGS[1]*intaCS[1]);
+		ck_assert_uint_eq(intbBS[2]*intbGS[2]*intbCS[2], intaBS[2]*intaGS[2]*intaCS[2]);
+
+		/**
+		 * Verify that the individual limits and global limits on threads in a
+		 * block and blocks in a grid are met.
+		 */
+
+		ck_assert_uint_le(intaBS[0],                     maxBInd[0]);
+		ck_assert_uint_le(intaBS[1],                     maxBInd[1]);
+		ck_assert_uint_le(intaBS[2],                     maxBInd[2]);
+		ck_assert_uint_le(intaGS[0],                     maxGInd[0]);
+		ck_assert_uint_le(intaGS[1],                     maxGInd[1]);
+		ck_assert_uint_le(intaGS[2],                     maxGInd[2]);
+		ck_assert_uint_le(intaBS[0]*intaBS[1]*intaBS[2], maxBTot);
+		ck_assert_uint_le(intaGS[0]*intaGS[1]*intaGS[2], maxGTot);
+	}
+}END_TEST
+
+
+
+Suite *get_suite(void){
+	Suite *s  = suite_create("util_integerfactoring");
+	TCase *tc = tcase_create("All");
+	/* No fixture is attached: the tests below call the gaI* helpers directly. */
+	tcase_set_timeout(tc, 10.0);
+
+	tcase_add_test(tc, test_primalitychecker);
+	tcase_add_test(tc, test_integerfactorization);
+	tcase_add_test(tc, test_scheduler);
+
+	suite_add_tcase(s, tc);
+
+	return s;
+}
+
diff --git a/tests/communicator.c b/tests/communicator.c
index d70aac1cd9..696e04fc86 100644
--- a/tests/communicator.c
+++ b/tests/communicator.c
@@ -21,11 +21,13 @@ extern void teardown(void);
  */
 void setup_comm(void)
 {
-  setup();
   int err;
+  gpucommCliqueId comm_id;
+
+  setup();
 
   MPI_Barrier(MPI_COMM_WORLD);
-  gpucommCliqueId comm_id;
+
   err = gpucomm_gen_clique_id(ctx, &comm_id);
   // Has successfully got a unique comm id.
   ck_assert_int_eq(err, GA_NO_ERROR);
diff --git a/tests/device.c b/tests/device.c
index 6c8382fe36..5bfe17f5ad 100644
--- a/tests/device.c
+++ b/tests/device.c
@@ -5,14 +5,15 @@
 #include <check.h>
 
 #include "gpuarray/buffer.h"
+#include "gpuarray/error.h"
 
 char* dev_name = NULL;
 
-int get_env_dev(const char **name) {
+int get_env_dev(const char **name, gpucontext_props *p) {
   char *dev = NULL;
   char *end;
   long no;
-  int d;
+  int pl;
   dev = dev_name;
   if (dev == NULL) {
     if ((dev = getenv("GPUARRAY_TEST_DEVICE")) == NULL) {
@@ -29,7 +30,8 @@ int get_env_dev(const char **name) {
       return -1;
     if (no < 0 || no > INT_MAX)
       return -1;
-    return (int)no;
+    gpucontext_props_cuda_dev(p, (int)no);
+    return 0;
   }
   if (strncmp(dev, "opencl", 6) == 0) {
     *name = "opencl";
@@ -38,16 +40,15 @@ int get_env_dev(const char **name) {
       return -1;
     if (no < 0 || no > 32768)
       return -1;
-    d = (int)no;
+    pl = (int)no;
     dev = end;
     no = strtol(dev + 1, &end, 10);
     if (end == dev || *end != '\0')
       return -1;
     if (no < 0 || no > 32768)
       return -1;
-    d <<= 16;
-    d |= (int)no;
-    return d;
+    gpucontext_props_opencl_dev(p, pl, (int)no);
+    return 0;
   }
   return -1;
 }
@@ -56,10 +57,10 @@ gpucontext *ctx;
 
 void setup(void) {
   const char *name = NULL;
-  int dev = get_env_dev(&name);
-  if (dev == -1)
-    ck_abort_msg("Bad test device");
-  ctx = gpucontext_init(name, dev, 0, NULL);
+  gpucontext_props *p;
+  ck_assert_int_eq(gpucontext_props_new(&p), GA_NO_ERROR);
+  ck_assert_int_eq(get_env_dev(&name, p), 0);
+  ck_assert_int_eq(gpucontext_init(&ctx, name, p), GA_NO_ERROR);
   ck_assert_ptr_ne(ctx, NULL);
 }
 
diff --git a/tests/main.c b/tests/main.c
index eba94b4f13..1012373521 100644
--- a/tests/main.c
+++ b/tests/main.c
@@ -15,6 +15,10 @@ extern Suite *get_suite(void);
 
 int main(int argc, char *argv[])
 {
+  int number_failed;
+  Suite *s;
+  SRunner *sr;
+
 #ifdef TEST_COLLECTIVES
   MPI_Init(&argc, &argv);
   MPI_Comm_size(MPI_COMM_WORLD, &comm_ndev);
@@ -29,9 +33,8 @@ int main(int argc, char *argv[])
   dev_name = argv[comm_rank + 1];  // Set a gpu for this process.
 #endif  // TEST_COLLECTIVES
 
-  int number_failed;
-  Suite *s = get_suite();
-  SRunner *sr = srunner_create(s);
+  s = get_suite();
+  sr = srunner_create(s);
 #ifdef TEST_COLLECTIVES
   // Check by default forks to another (non mpi registered) process in order to
   // run tests. Using MPI inside tests means we must disable this.
diff --git a/versioneer.py b/versioneer.py
new file mode 100644
index 0000000000..e36c724a1d
--- /dev/null
+++ b/versioneer.py
@@ -0,0 +1,1821 @@
+
+# Version: 0.18
+
+"""The Versioneer - like a rocketeer, but for versions.
+
+The Versioneer
+==============
+
+* like a rocketeer, but for versions!
+* https://github.com/warner/python-versioneer
+* Brian Warner
+* License: Public Domain
+* Compatible With: python2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, and pypy
+* [![Latest Version]
+(https://pypip.in/version/versioneer/badge.svg?style=flat)
+](https://pypi.python.org/pypi/versioneer/)
+* [![Build Status]
+(https://travis-ci.org/warner/python-versioneer.png?branch=master)
+](https://travis-ci.org/warner/python-versioneer)
+
+This is a tool for managing a recorded version number in distutils-based
+python projects. The goal is to remove the tedious and error-prone "update
+the embedded version string" step from your release process. Making a new
+release should be as easy as recording a new tag in your version-control
+system, and maybe making new tarballs.
+
+
+## Quick Install
+
+* `pip install versioneer` to somewhere in your $PATH
+* add a `[versioneer]` section to your setup.cfg (see below)
+* run `versioneer install` in your source tree, commit the results
+
+## Version Identifiers
+
+Source trees come from a variety of places:
+
+* a version-control system checkout (mostly used by developers)
+* a nightly tarball, produced by build automation
+* a snapshot tarball, produced by a web-based VCS browser, like github's
+  "tarball from tag" feature
+* a release tarball, produced by "setup.py sdist", distributed through PyPI
+
+Within each source tree, the version identifier (either a string or a number,
+this tool is format-agnostic) can come from a variety of places:
+
+* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows
+  about recent "tags" and an absolute revision-id
+* the name of the directory into which the tarball was unpacked
+* an expanded VCS keyword ($Id$, etc)
+* a `_version.py` created by some earlier build step
+
+For released software, the version identifier is closely related to a VCS
+tag. Some projects use tag names that include more than just the version
+string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool
+needs to strip the tag prefix to extract the version identifier. For
+unreleased software (between tags), the version identifier should provide
+enough information to help developers recreate the same tree, while also
+giving them an idea of roughly how old the tree is (after version 1.2, before
+version 1.3). Many VCS systems can report a description that captures this,
+for example `git describe --tags --dirty --always` reports things like
+"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the
+0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has
+uncommitted changes).
+
+The version identifier is used for multiple purposes:
+
+* to allow the module to self-identify its version: `myproject.__version__`
+* to choose a name and prefix for a 'setup.py sdist' tarball
+
+## Theory of Operation
+
+Versioneer works by adding a special `_version.py` file into your source
+tree, where your `__init__.py` can import it. This `_version.py` knows how to
+dynamically ask the VCS tool for version information at import time.
+
+`_version.py` also contains `$Revision$` markers, and the installation
+process marks `_version.py` to have this marker rewritten with a tag name
+during the `git archive` command. As a result, generated tarballs will
+contain enough information to get the proper version.
+
+To allow `setup.py` to compute a version too, a `versioneer.py` is added to
+the top level of your source tree, next to `setup.py` and the `setup.cfg`
+that configures it. This overrides several distutils/setuptools commands to
+compute the version when invoked, and changes `setup.py build` and `setup.py
+sdist` to replace `_version.py` with a small static file that contains just
+the generated version data.
+
+## Installation
+
+See [INSTALL.md](./INSTALL.md) for detailed installation instructions.
+
+## Version-String Flavors
+
+Code which uses Versioneer can learn about its version string at runtime by
+importing `_version` from your main `__init__.py` file and running the
+`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can
+import the top-level `versioneer.py` and run `get_versions()`.
+
+Both functions return a dictionary with different flavors of version
+information:
+
+* `['version']`: A condensed version string, rendered using the selected
+  style. This is the most commonly used value for the project's version
+  string. The default "pep440" style yields strings like `0.11`,
+  `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section
+  below for alternative styles.
+
+* `['full-revisionid']`: detailed revision identifier. For Git, this is the
+  full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac".
+
+* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the
+  commit date in ISO 8601 format. This will be None if the date is not
+  available.
+
+* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that
+  this is only accurate if run in a VCS checkout, otherwise it is likely to
+  be False or None
+
+* `['error']`: if the version string could not be computed, this will be set
+  to a string describing the problem, otherwise it will be None. It may be
+  useful to throw an exception in setup.py if this is set, to avoid e.g.
+  creating tarballs with a version string of "unknown".
+
+Some variants are more useful than others. Including `full-revisionid` in a
+bug report should allow developers to reconstruct the exact code being tested
+(or indicate the presence of local changes that should be shared with the
+developers). `version` is suitable for display in an "about" box or a CLI
+`--version` output: it can be easily compared against release notes and lists
+of bugs fixed in various releases.
+
+The installer adds the following text to your `__init__.py` to place a basic
+version in `YOURPROJECT.__version__`:
+
+    from ._version import get_versions
+    __version__ = get_versions()['version']
+    del get_versions
+
+## Styles
+
+The setup.cfg `style=` configuration controls how the VCS information is
+rendered into a version string.
+
+The default style, "pep440", produces a PEP440-compliant string, equal to the
+un-prefixed tag name for actual releases, and containing an additional "local
+version" section with more detail for in-between builds. For Git, this is
+TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags
+--dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the
+tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and
+that this commit is two revisions ("+2") beyond the "0.11" tag. For released
+software (exactly equal to a known tag), the identifier will only contain the
+stripped tag, e.g. "0.11".
+
+Other styles are available. See [details.md](details.md) in the Versioneer
+source tree for descriptions.
+
+## Debugging
+
+Versioneer tries to avoid fatal errors: if something goes wrong, it will tend
+to return a version of "0+unknown". To investigate the problem, run `setup.py
+version`, which will run the version-lookup code in a verbose mode, and will
+display the full contents of `get_versions()` (including the `error` string,
+which may help identify what went wrong).
+
+## Known Limitations
+
+Some situations are known to cause problems for Versioneer. This section
+details the most significant ones. More can be found on the GitHub
+[issues page](https://github.com/warner/python-versioneer/issues).
+
+### Subprojects
+
+Versioneer has limited support for source trees in which `setup.py` is not in
+the root directory (e.g. `setup.py` and `.git/` are *not* siblings). There are
+two common reasons why `setup.py` might not be in the root:
+
+* Source trees which contain multiple subprojects, such as
+  [Buildbot](https://github.com/buildbot/buildbot), which contains both
+  "master" and "slave" subprojects, each with their own `setup.py`,
+  `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI
+  distributions (and upload multiple independently-installable tarballs).
+* Source trees whose main purpose is to contain a C library, but which also
+  provide bindings to Python (and perhaps other languages) in subdirectories.
+
+Versioneer will look for `.git` in parent directories, and most operations
+should get the right version string. However `pip` and `setuptools` have bugs
+and implementation details which frequently cause `pip install .` from a
+subproject directory to fail to find a correct version string (so it usually
+defaults to `0+unknown`).
+
+`pip install --editable .` should work correctly. `setup.py install` might
+work too.
+
+Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in
+some later version.
+
+[Bug #38](https://github.com/warner/python-versioneer/issues/38) is tracking
+this issue. The discussion in
+[PR #61](https://github.com/warner/python-versioneer/pull/61) describes the
+issue from the Versioneer side in more detail.
+[pip PR#3176](https://github.com/pypa/pip/pull/3176) and
+[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve
+pip to let Versioneer work correctly.
+
+Versioneer-0.16 and earlier only looked for a `.git` directory next to the
+`setup.cfg`, so subprojects were completely unsupported with those releases.
+
+### Editable installs with setuptools <= 18.5
+
+`setup.py develop` and `pip install --editable .` allow you to install a
+project into a virtualenv once, then continue editing the source code (and
+test) without re-installing after every change.
+
+"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a
+convenient way to specify executable scripts that should be installed along
+with the python package.
+
+These both work as expected when using modern setuptools. When using
+setuptools-18.5 or earlier, however, certain operations will cause
+`pkg_resources.DistributionNotFound` errors when running the entrypoint
+script, which must be resolved by re-installing the package. This happens
+when the install happens with one version, then the egg_info data is
+regenerated while a different version is checked out. Many setup.py commands
+cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into
+a different virtualenv), so this can be surprising.
+
+[Bug #83](https://github.com/warner/python-versioneer/issues/83) describes
+this one, but upgrading to a newer version of setuptools should probably
+resolve it.
+
+### Unicode version strings
+
+While Versioneer works (and is continually tested) with both Python 2 and
+Python 3, it is not entirely consistent with bytes-vs-unicode distinctions.
+Newer releases probably generate unicode version strings on py2. It's not
+clear that this is wrong, but it may be surprising for applications which then
+write these strings to a network connection or include them in bytes-oriented
+APIs like cryptographic checksums.
+
+[Bug #71](https://github.com/warner/python-versioneer/issues/71) investigates
+this question.
+
+
+## Updating Versioneer
+
+To upgrade your project to a new release of Versioneer, do the following:
+
+* install the new Versioneer (`pip install -U versioneer` or equivalent)
+* edit `setup.cfg`, if necessary, to include any new configuration settings
+  indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details.
+* re-run `versioneer install` in your source tree, to replace
+  `SRC/_version.py`
+* commit any changed files
+
+## Future Directions
+
+This tool is designed to be easily extended to other version-control
+systems: all VCS-specific components are in separate directories like
+src/git/ . The top-level `versioneer.py` script is assembled from these
+components by running make-versioneer.py . In the future, make-versioneer.py
+will take a VCS name as an argument, and will construct a version of
+`versioneer.py` that is specific to the given VCS. It might also take the
+configuration arguments that are currently provided manually during
+installation by editing setup.py . Alternatively, it might go the other
+direction and include code from all supported VCS systems, reducing the
+number of intermediate scripts.
+
+
+## License
+
+To make Versioneer easier to embed, all its code is dedicated to the public
+domain. The `_version.py` that it creates is also in the public domain.
+Specifically, both are released under the Creative Commons "Public Domain
+Dedication" license (CC0-1.0), as described in
+https://creativecommons.org/publicdomain/zero/1.0/ .
+
+"""
+
+from __future__ import print_function
+try:
+    import configparser
+except ImportError:
+    import ConfigParser as configparser
+import errno
+import json
+import os
+import re
+import subprocess
+import sys
+
+
+class VersioneerConfig:
+    """Container for Versioneer configuration parameters."""
+
+
+def get_root():
+    """Get the project root directory.
+
+    We require that all commands are run from the project root, i.e. the
+    directory that contains setup.py, setup.cfg, and versioneer.py .
+    """
+    root = os.path.realpath(os.path.abspath(os.getcwd()))
+    setup_py = os.path.join(root, "setup.py")
+    versioneer_py = os.path.join(root, "versioneer.py")
+    if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)):
+        # allow 'python path/to/setup.py COMMAND'
+        root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0])))
+        setup_py = os.path.join(root, "setup.py")
+        versioneer_py = os.path.join(root, "versioneer.py")
+    if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)):
+        err = ("Versioneer was unable to run the project root directory. "
+               "Versioneer requires setup.py to be executed from "
+               "its immediate directory (like 'python setup.py COMMAND'), "
+               "or in a way that lets it use sys.argv[0] to find the root "
+               "(like 'python path/to/setup.py COMMAND').")
+        raise VersioneerBadRootError(err)
+    try:
+        # Certain runtime workflows (setup.py install/develop in a setuptools
+        # tree) execute all dependencies in a single python process, so
+        # "versioneer" may be imported multiple times, and python's shared
+        # module-import table will cache the first one. So we can't use
+        # os.path.dirname(__file__), as that will find whichever
+        # versioneer.py was first imported, even in later projects.
+        me = os.path.realpath(os.path.abspath(__file__))
+        me_dir = os.path.normcase(os.path.splitext(me)[0])
+        vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0])
+        if me_dir != vsr_dir:
+            print("Warning: build in %s is using versioneer.py from %s"
+                  % (os.path.dirname(me), versioneer_py))
+    except NameError:
+        pass
+    return root
+
+
+def get_config_from_root(root):
+    """Read the project setup.cfg file to determine Versioneer config."""
+    # This might raise EnvironmentError (if setup.cfg is missing), or
+    # configparser.NoSectionError (if it lacks a [versioneer] section), or
+    # configparser.NoOptionError (if it lacks "VCS="). See the docstring at
+    # the top of versioneer.py for instructions on writing your setup.cfg .
+    setup_cfg = os.path.join(root, "setup.cfg")
+    parser = configparser.SafeConfigParser()
+    with open(setup_cfg, "r") as f:
+        parser.readfp(f)
+    VCS = parser.get("versioneer", "VCS")  # mandatory
+
+    def get(parser, name):
+        if parser.has_option("versioneer", name):
+            return parser.get("versioneer", name)
+        return None
+    cfg = VersioneerConfig()
+    cfg.VCS = VCS
+    cfg.style = get(parser, "style") or ""
+    cfg.versionfile_source = get(parser, "versionfile_source")
+    cfg.versionfile_build = get(parser, "versionfile_build")
+    cfg.tag_prefix = get(parser, "tag_prefix")
+    if cfg.tag_prefix in ("''", '""'):
+        cfg.tag_prefix = ""
+    cfg.parentdir_prefix = get(parser, "parentdir_prefix")
+    cfg.verbose = get(parser, "verbose")
+    return cfg
+
+
+class NotThisMethod(Exception):
+    """Exception raised if a method is not valid for the current scenario."""
+
+
+# these dictionaries contain VCS-specific tools
+LONG_VERSION_PY = {}
+HANDLERS = {}
+
+
+def register_vcs_handler(vcs, method):  # decorator
+    """Decorator to mark a method as the handler for a particular VCS."""
+    def decorate(f):
+        """Store f in HANDLERS[vcs][method]."""
+        if vcs not in HANDLERS:
+            HANDLERS[vcs] = {}
+        HANDLERS[vcs][method] = f
+        return f
+    return decorate
+
+
+def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
+                env=None):
+    """Call the given command(s)."""
+    assert isinstance(commands, list)
+    p = None
+    for c in commands:
+        try:
+            dispcmd = str([c] + args)
+            # remember shell=False, so use git.cmd on windows, not just git
+            p = subprocess.Popen([c] + args, cwd=cwd, env=env,
+                                 stdout=subprocess.PIPE,
+                                 stderr=(subprocess.PIPE if hide_stderr
+                                         else None))
+            break
+        except EnvironmentError:
+            e = sys.exc_info()[1]
+            if e.errno == errno.ENOENT:
+                continue
+            if verbose:
+                print("unable to run %s" % dispcmd)
+                print(e)
+            return None, None
+    else:
+        if verbose:
+            print("unable to find command, tried %s" % (commands,))
+        return None, None
+    stdout = p.communicate()[0].strip()
+    if sys.version_info[0] >= 3:
+        stdout = stdout.decode()
+    if p.returncode != 0:
+        if verbose:
+            print("unable to run %s (error)" % dispcmd)
+            print("stdout was %s" % stdout)
+        return None, p.returncode
+    return stdout, p.returncode
+
+
+LONG_VERSION_PY['git'] = '''
+# This file helps to compute a version number in source trees obtained from
+# git-archive tarball (such as those provided by githubs download-from-tag
+# feature). Distribution tarballs (built by setup.py sdist) and build
+# directories (produced by setup.py build) will contain a much shorter file
+# that just contains the computed version number.
+
+# This file is released into the public domain. Generated by
+# versioneer-0.18 (https://github.com/warner/python-versioneer)
+
+"""Git implementation of _version.py."""
+
+import errno
+import os
+import re
+import subprocess
+import sys
+
+
+def get_keywords():
+    """Get the keywords needed to look up the version information."""
+    # these strings will be replaced by git during git-archive.
+    # setup.py/versioneer.py will grep for the variable names, so they must
+    # each be defined on a line of their own. _version.py will just call
+    # get_keywords().
+    git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s"
+    git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s"
+    git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s"
+    keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
+    return keywords
+
+
+class VersioneerConfig:
+    """Container for Versioneer configuration parameters."""
+
+
+def get_config():
+    """Create, populate and return the VersioneerConfig() object."""
+    # these strings are filled in when 'setup.py versioneer' creates
+    # _version.py
+    cfg = VersioneerConfig()
+    cfg.VCS = "git"
+    cfg.style = "%(STYLE)s"
+    cfg.tag_prefix = "%(TAG_PREFIX)s"
+    cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s"
+    cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s"
+    cfg.verbose = False
+    return cfg
+
+
+class NotThisMethod(Exception):
+    """Exception raised if a method is not valid for the current scenario."""
+
+
+LONG_VERSION_PY = {}
+HANDLERS = {}
+
+
+def register_vcs_handler(vcs, method):  # decorator
+    """Decorator to mark a method as the handler for a particular VCS."""
+    def decorate(f):
+        """Store f in HANDLERS[vcs][method]."""
+        if vcs not in HANDLERS:
+            HANDLERS[vcs] = {}
+        HANDLERS[vcs][method] = f
+        return f
+    return decorate
+
+
+def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
+                env=None):
+    """Call the given command(s)."""
+    assert isinstance(commands, list)
+    p = None
+    for c in commands:
+        try:
+            dispcmd = str([c] + args)
+            # remember shell=False, so use git.cmd on windows, not just git
+            p = subprocess.Popen([c] + args, cwd=cwd, env=env,
+                                 stdout=subprocess.PIPE,
+                                 stderr=(subprocess.PIPE if hide_stderr
+                                         else None))
+            break
+        except EnvironmentError:
+            e = sys.exc_info()[1]
+            if e.errno == errno.ENOENT:
+                continue
+            if verbose:
+                print("unable to run %%s" %% dispcmd)
+                print(e)
+            return None, None
+    else:
+        if verbose:
+            print("unable to find command, tried %%s" %% (commands,))
+        return None, None
+    stdout = p.communicate()[0].strip()
+    if sys.version_info[0] >= 3:
+        stdout = stdout.decode()
+    if p.returncode != 0:
+        if verbose:
+            print("unable to run %%s (error)" %% dispcmd)
+            print("stdout was %%s" %% stdout)
+        return None, p.returncode
+    return stdout, p.returncode
+
+
+def versions_from_parentdir(parentdir_prefix, root, verbose):
+    """Try to determine the version from the parent directory name.
+
+    Source tarballs conventionally unpack into a directory that includes both
+    the project name and a version string. We will also support searching up
+    two directory levels for an appropriately named parent directory
+    """
+    rootdirs = []
+
+    for i in range(3):
+        dirname = os.path.basename(root)
+        if dirname.startswith(parentdir_prefix):
+            return {"version": dirname[len(parentdir_prefix):],
+                    "full-revisionid": None,
+                    "dirty": False, "error": None, "date": None}
+        else:
+            rootdirs.append(root)
+            root = os.path.dirname(root)  # up a level
+
+    if verbose:
+        print("Tried directories %%s but none started with prefix %%s" %%
+              (str(rootdirs), parentdir_prefix))
+    raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
+
+
+@register_vcs_handler("git", "get_keywords")
+def git_get_keywords(versionfile_abs):
+    """Extract version information from the given file."""
+    # the code embedded in _version.py can just fetch the value of these
+    # keywords. When used from setup.py, we don't want to import _version.py,
+    # so we do it with a regexp instead. This function is not used from
+    # _version.py.
+    keywords = {}
+    try:
+        f = open(versionfile_abs, "r")
+        for line in f.readlines():
+            if line.strip().startswith("git_refnames ="):
+                mo = re.search(r'=\s*"(.*)"', line)
+                if mo:
+                    keywords["refnames"] = mo.group(1)
+            if line.strip().startswith("git_full ="):
+                mo = re.search(r'=\s*"(.*)"', line)
+                if mo:
+                    keywords["full"] = mo.group(1)
+            if line.strip().startswith("git_date ="):
+                mo = re.search(r'=\s*"(.*)"', line)
+                if mo:
+                    keywords["date"] = mo.group(1)
+        f.close()
+    except EnvironmentError:
+        pass
+    return keywords
+
+
+@register_vcs_handler("git", "keywords")
+def git_versions_from_keywords(keywords, tag_prefix, verbose):
+    """Get version information from git keywords."""
+    if not keywords:
+        raise NotThisMethod("no keywords at all, weird")
+    date = keywords.get("date")
+    if date is not None:
+        # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant
+        # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601
+        # -like" string, which we must then edit to make compliant), because
+        # it's been around since git-1.5.3, and it's too difficult to
+        # discover which version we're using, or to work around using an
+        # older one.
+        date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
+    refnames = keywords["refnames"].strip()
+    if refnames.startswith("$Format"):
+        if verbose:
+            print("keywords are unexpanded, not using")
+        raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
+    refs = set([r.strip() for r in refnames.strip("()").split(",")])
+    # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
+    # just "foo-1.0". If we see a "tag: " prefix, prefer those.
+    TAG = "tag: "
+    tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)])
+    if not tags:
+        # Either we're using git < 1.8.3, or there really are no tags. We use
+        # a heuristic: assume all version tags have a digit. The old git %%d
+        # expansion behaves like git log --decorate=short and strips out the
+        # refs/heads/ and refs/tags/ prefixes that would let us distinguish
+        # between branches and tags. By ignoring refnames without digits, we
+        # filter out many common branch names like "release" and
+        # "stabilization", as well as "HEAD" and "master".
+        tags = set([r for r in refs if re.search(r'\d', r)])
+        if verbose:
+            print("discarding '%%s', no digits" %% ",".join(refs - tags))
+    if verbose:
+        print("likely tags: %%s" %% ",".join(sorted(tags)))
+    for ref in sorted(tags):
+        # sorting will prefer e.g. "2.0" over "2.0rc1"
+        if ref.startswith(tag_prefix):
+            r = ref[len(tag_prefix):]
+            if verbose:
+                print("picking %%s" %% r)
+            return {"version": r,
+                    "full-revisionid": keywords["full"].strip(),
+                    "dirty": False, "error": None,
+                    "date": date}
+    # no suitable tags, so version is "0+unknown", but full hex is still there
+    if verbose:
+        print("no suitable tags, using unknown + full revision id")
+    return {"version": "0+unknown",
+            "full-revisionid": keywords["full"].strip(),
+            "dirty": False, "error": "no suitable tags", "date": None}
+
+
+@register_vcs_handler("git", "pieces_from_vcs")
+def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
+    """Get version from 'git describe' in the root of the source tree.
+
+    This only gets called if the git-archive 'subst' keywords were *not*
+    expanded, and _version.py hasn't already been rewritten with a short
+    version string, meaning we're inside a checked out source tree.
+    """
+    GITS = ["git"]
+    if sys.platform == "win32":
+        GITS = ["git.cmd", "git.exe"]
+
+    out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root,
+                          hide_stderr=True)
+    if rc != 0:
+        if verbose:
+            print("Directory %%s not under git control" %% root)
+        raise NotThisMethod("'git rev-parse --git-dir' returned error")
+
+    # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
+    # if there isn't one, this yields HEX[-dirty] (no NUM)
+    describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty",
+                                          "--always", "--long",
+                                          "--match", "%%s*" %% tag_prefix],
+                                   cwd=root)
+    # --long was added in git-1.5.5
+    if describe_out is None:
+        raise NotThisMethod("'git describe' failed")
+    describe_out = describe_out.strip()
+    full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root)
+    if full_out is None:
+        raise NotThisMethod("'git rev-parse' failed")
+    full_out = full_out.strip()
+
+    pieces = {}
+    pieces["long"] = full_out
+    pieces["short"] = full_out[:7]  # maybe improved later
+    pieces["error"] = None
+
+    # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
+    # TAG might have hyphens.
+    git_describe = describe_out
+
+    # look for -dirty suffix
+    dirty = git_describe.endswith("-dirty")
+    pieces["dirty"] = dirty
+    if dirty:
+        git_describe = git_describe[:git_describe.rindex("-dirty")]
+
+    # now we have TAG-NUM-gHEX or HEX
+
+    if "-" in git_describe:
+        # TAG-NUM-gHEX
+        mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
+        if not mo:
+            # unparseable. Maybe git-describe is misbehaving?
+            pieces["error"] = ("unable to parse git-describe output: '%%s'"
+                               %% describe_out)
+            return pieces
+
+        # tag
+        full_tag = mo.group(1)
+        if not full_tag.startswith(tag_prefix):
+            if verbose:
+                fmt = "tag '%%s' doesn't start with prefix '%%s'"
+                print(fmt %% (full_tag, tag_prefix))
+            pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'"
+                               %% (full_tag, tag_prefix))
+            return pieces
+        pieces["closest-tag"] = full_tag[len(tag_prefix):]
+
+        # distance: number of commits since tag
+        pieces["distance"] = int(mo.group(2))
+
+        # commit: short hex revision ID
+        pieces["short"] = mo.group(3)
+
+    else:
+        # HEX: no tags
+        pieces["closest-tag"] = None
+        count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"],
+                                    cwd=root)
+        pieces["distance"] = int(count_out)  # total number of commits
+
+    # commit date: see ISO-8601 comment in git_versions_from_keywords()
+    date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"],
+                       cwd=root)[0].strip()
+    pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
+
+    return pieces
+
+
+def plus_or_dot(pieces):
+    """Return a + if we don't already have one, else return a ."""
+    if "+" in pieces.get("closest-tag", ""):
+        return "."
+    return "+"
+
+
+def render_pep440(pieces):
+    """Build up version string, with post-release "local version identifier".
+
+    Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
+    get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty
+
+    Exceptions:
+    1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"] or pieces["dirty"]:
+            rendered += plus_or_dot(pieces)
+            rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"])
+            if pieces["dirty"]:
+                rendered += ".dirty"
+    else:
+        # exception #1
+        rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"],
+                                          pieces["short"])
+        if pieces["dirty"]:
+            rendered += ".dirty"
+    return rendered
+
+
+def render_pep440_pre(pieces):
+    """TAG[.post.devDISTANCE] -- No -dirty.
+
+    Exceptions:
+    1: no tags. 0.post.devDISTANCE
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"]:
+            rendered += ".post.dev%%d" %% pieces["distance"]
+    else:
+        # exception #1
+        rendered = "0.post.dev%%d" %% pieces["distance"]
+    return rendered
+
+
+def render_pep440_post(pieces):
+    """TAG[.postDISTANCE[.dev0]+gHEX] .
+
+    The ".dev0" means dirty. Note that .dev0 sorts backwards
+    (a dirty tree will appear "older" than the corresponding clean one),
+    but you shouldn't be releasing software with -dirty anyways.
+
+    Exceptions:
+    1: no tags. 0.postDISTANCE[.dev0]
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"] or pieces["dirty"]:
+            rendered += ".post%%d" %% pieces["distance"]
+            if pieces["dirty"]:
+                rendered += ".dev0"
+            rendered += plus_or_dot(pieces)
+            rendered += "g%%s" %% pieces["short"]
+    else:
+        # exception #1
+        rendered = "0.post%%d" %% pieces["distance"]
+        if pieces["dirty"]:
+            rendered += ".dev0"
+        rendered += "+g%%s" %% pieces["short"]
+    return rendered
+
+
+def render_pep440_old(pieces):
+    """TAG[.postDISTANCE[.dev0]] .
+
+    The ".dev0" means dirty.
+
+    Eexceptions:
+    1: no tags. 0.postDISTANCE[.dev0]
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"] or pieces["dirty"]:
+            rendered += ".post%%d" %% pieces["distance"]
+            if pieces["dirty"]:
+                rendered += ".dev0"
+    else:
+        # exception #1
+        rendered = "0.post%%d" %% pieces["distance"]
+        if pieces["dirty"]:
+            rendered += ".dev0"
+    return rendered
+
+
+def render_git_describe(pieces):
+    """TAG[-DISTANCE-gHEX][-dirty].
+
+    Like 'git describe --tags --dirty --always'.
+
+    Exceptions:
+    1: no tags. HEX[-dirty]  (note: no 'g' prefix)
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"]:
+            rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"])
+    else:
+        # exception #1
+        rendered = pieces["short"]
+    if pieces["dirty"]:
+        rendered += "-dirty"
+    return rendered
+
+
+def render_git_describe_long(pieces):
+    """TAG-DISTANCE-gHEX[-dirty].
+
+    Like 'git describe --tags --dirty --always -long'.
+    The distance/hash is unconditional.
+
+    Exceptions:
+    1: no tags. HEX[-dirty]  (note: no 'g' prefix)
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"])
+    else:
+        # exception #1
+        rendered = pieces["short"]
+    if pieces["dirty"]:
+        rendered += "-dirty"
+    return rendered
+
+
+def render(pieces, style):
+    """Render the given version pieces into the requested style."""
+    if pieces["error"]:
+        return {"version": "unknown",
+                "full-revisionid": pieces.get("long"),
+                "dirty": None,
+                "error": pieces["error"],
+                "date": None}
+
+    if not style or style == "default":
+        style = "pep440"  # the default
+
+    if style == "pep440":
+        rendered = render_pep440(pieces)
+    elif style == "pep440-pre":
+        rendered = render_pep440_pre(pieces)
+    elif style == "pep440-post":
+        rendered = render_pep440_post(pieces)
+    elif style == "pep440-old":
+        rendered = render_pep440_old(pieces)
+    elif style == "git-describe":
+        rendered = render_git_describe(pieces)
+    elif style == "git-describe-long":
+        rendered = render_git_describe_long(pieces)
+    else:
+        raise ValueError("unknown style '%%s'" %% style)
+
+    return {"version": rendered, "full-revisionid": pieces["long"],
+            "dirty": pieces["dirty"], "error": None,
+            "date": pieces.get("date")}
+
+
+def get_versions():
+    """Get version information or return default if unable to do so."""
+    # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
+    # __file__, we can work backwards from there to the root. Some
+    # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which
+    # case we can only use expanded keywords.
+
+    cfg = get_config()
+    verbose = cfg.verbose
+
+    try:
+        return git_versions_from_keywords(get_keywords(), cfg.tag_prefix,
+                                          verbose)
+    except NotThisMethod:
+        pass
+
+    try:
+        root = os.path.realpath(__file__)
+        # versionfile_source is the relative path from the top of the source
+        # tree (where the .git directory might live) to this file. Invert
+        # this to find the root from __file__.
+        for i in cfg.versionfile_source.split('/'):
+            root = os.path.dirname(root)
+    except NameError:
+        return {"version": "0+unknown", "full-revisionid": None,
+                "dirty": None,
+                "error": "unable to find root of source tree",
+                "date": None}
+
+    try:
+        pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose)
+        return render(pieces, cfg.style)
+    except NotThisMethod:
+        pass
+
+    try:
+        if cfg.parentdir_prefix:
+            return versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
+    except NotThisMethod:
+        pass
+
+    return {"version": "0+unknown", "full-revisionid": None,
+            "dirty": None,
+            "error": "unable to compute version", "date": None}
+'''
+
+
+@register_vcs_handler("git", "get_keywords")
+def git_get_keywords(versionfile_abs):
+    """Extract the git-archive keyword values from the given _version.py.
+
+    Returns a dict that may hold "refnames", "full" and "date"; it is empty
+    when the file is unreadable or assigns none of these keywords.
+    """
+    # the code embedded in _version.py can just fetch the value of these
+    # keywords. When used from setup.py, we don't want to import _version.py,
+    # so we do it with a regexp instead. This function is not used from
+    # _version.py.
+    wanted = (("git_refnames =", "refnames"),
+              ("git_full =", "full"),
+              ("git_date =", "date"))
+    keywords = {}
+    try:
+        # 'with' closes the file even if reading raises part-way through
+        with open(versionfile_abs, "r") as f:
+            for line in f:
+                for prefix, key in wanted:
+                    if line.strip().startswith(prefix):
+                        mo = re.search(r'=\s*"(.*)"', line)
+                        if mo:
+                            keywords[key] = mo.group(1)
+    except EnvironmentError:
+        pass
+    return keywords
+
+
+@register_vcs_handler("git", "keywords")
+def git_versions_from_keywords(keywords, tag_prefix, verbose):
+    """Get version information from expanded git-archive keywords.
+
+    Returns a version dict; raises NotThisMethod when the keywords are
+    absent, incomplete or unexpanded so the caller can try another method.
+    """
+    if not keywords:
+        raise NotThisMethod("no keywords at all, weird")
+    if "refnames" not in keywords:
+        # partial keywords must fall through cleanly, not die on KeyError
+        raise NotThisMethod("no refnames keyword found")
+    date = keywords.get("date")
+    if date is not None:
+        # git-2.2.0 added the strict ISO-8601 "%cI"; we keep "%ci" (around
+        # since git-1.5.3) and edit its "ISO-8601-like" output to comply.
+        date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
+    refnames = keywords["refnames"].strip()
+    if refnames.startswith("$Format"):
+        if verbose:
+            print("keywords are unexpanded, not using")
+        raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
+    refs = set([r.strip() for r in refnames.strip("()").split(",")])
+    # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
+    # just "foo-1.0". If we see a "tag: " prefix, prefer those.
+    TAG = "tag: "
+    tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)])
+    if not tags:
+        # Either we're using git < 1.8.3, or there really are no tags. Old
+        # git %d expansions strip the refs/heads/ and refs/tags/ prefixes,
+        # so branches and tags look alike. Heuristic: assume every version
+        # tag has a digit, which filters out common branch names such as
+        # "release" and "stabilization", as well as "HEAD" and "master".
+        tags = set([r for r in refs if re.search(r'\d', r)])
+        if verbose:
+            print("discarding '%s', no digits" % ",".join(refs - tags))
+    if verbose:
+        print("likely tags: %s" % ",".join(sorted(tags)))
+    for ref in sorted(tags):
+        # sorting will prefer e.g. "2.0" over "2.0rc1"
+        if ref.startswith(tag_prefix):
+            r = ref[len(tag_prefix):]
+            if verbose:
+                print("picking %s" % r)
+            return {"version": r,
+                    "full-revisionid": keywords["full"].strip(),
+                    "dirty": False, "error": None, "date": date}
+    # no suitable tags, so version is "0+unknown", but full hex is still there
+    if verbose:
+        print("no suitable tags, using unknown + full revision id")
+    return {"version": "0+unknown",
+            "full-revisionid": keywords["full"].strip(),
+            "dirty": False, "error": "no suitable tags", "date": None}
+
+
+@register_vcs_handler("git", "pieces_from_vcs")
+def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
+    """Get version pieces from 'git describe' in the root of the source tree.
+
+    Returns a dict with "long", "short", "error", "dirty" and, on success,
+    "closest-tag", "distance" and "date". Raises NotThisMethod when root is
+    not a git checkout or a required git command produces no output.
+    """
+    GITS = ["git"]
+    if sys.platform == "win32":
+        GITS = ["git.cmd", "git.exe"]
+
+    out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root,
+                          hide_stderr=True)
+    if rc != 0:
+        if verbose:
+            print("Directory %s not under git control" % root)
+        raise NotThisMethod("'git rev-parse --git-dir' returned error")
+
+    # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
+    # if there isn't one, this yields HEX[-dirty] (no NUM)
+    describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty",
+                                          "--always", "--long",
+                                          "--match", "%s*" % tag_prefix],
+                                   cwd=root)
+    # --long was added in git-1.5.5
+    if describe_out is None:
+        raise NotThisMethod("'git describe' failed")
+    describe_out = describe_out.strip()
+    full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root)
+    if full_out is None:
+        raise NotThisMethod("'git rev-parse' failed")
+    full_out = full_out.strip()
+
+    pieces = {}
+    pieces["long"] = full_out
+    pieces["short"] = full_out[:7]  # maybe improved later
+    pieces["error"] = None
+
+    # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
+    # TAG might have hyphens.
+    git_describe = describe_out
+
+    # look for -dirty suffix
+    dirty = git_describe.endswith("-dirty")
+    pieces["dirty"] = dirty
+    if dirty:
+        git_describe = git_describe[:git_describe.rindex("-dirty")]
+
+    # now we have TAG-NUM-gHEX or HEX
+    if "-" in git_describe:
+        # TAG-NUM-gHEX
+        mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
+        if not mo:
+            # unparseable. Maybe git-describe is misbehaving?
+            pieces["error"] = ("unable to parse git-describe output: '%s'"
+                               % describe_out)
+            return pieces
+
+        # tag
+        full_tag = mo.group(1)
+        if not full_tag.startswith(tag_prefix):
+            if verbose:
+                fmt = "tag '%s' doesn't start with prefix '%s'"
+                print(fmt % (full_tag, tag_prefix))
+            pieces["error"] = ("tag '%s' doesn't start with prefix '%s'"
+                               % (full_tag, tag_prefix))
+            return pieces
+        pieces["closest-tag"] = full_tag[len(tag_prefix):]
+        # distance: number of commits since tag
+        pieces["distance"] = int(mo.group(2))
+        # commit: short hex revision ID
+        pieces["short"] = mo.group(3)
+    else:
+        # HEX: no tags
+        pieces["closest-tag"] = None
+        count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"],
+                                    cwd=root)
+        if count_out is None:
+            raise NotThisMethod("'git rev-list' failed")
+        pieces["distance"] = int(count_out)  # total number of commits
+
+    # commit date: see ISO-8601 comment in git_versions_from_keywords()
+    date_out, rc = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"],
+                               cwd=root)
+    if date_out is None:
+        raise NotThisMethod("'git show' failed")
+    pieces["date"] = date_out.strip().replace(" ", "T", 1).replace(" ", "", 1)
+
+    return pieces
+
+
+def do_vcs_install(manifest_in, versionfile_source, ipy):
+    """Git-specific installation logic for Versioneer.
+
+    For Git, this means creating/changing .gitattributes to mark _version.py
+    for export-subst keyword substitution, then 'git add'-ing the files.
+    """
+    GITS = ["git"]
+    if sys.platform == "win32":
+        GITS = ["git.cmd", "git.exe"]
+    files = [manifest_in, versionfile_source]
+    if ipy:
+        files.append(ipy)
+    try:
+        me = __file__
+        if me.endswith(".pyc") or me.endswith(".pyo"):
+            me = os.path.splitext(me)[0] + ".py"
+        versioneer_file = os.path.relpath(me)
+    except NameError:
+        versioneer_file = "versioneer.py"
+    files.append(versioneer_file)
+    present = False
+    try:
+        # context managers close the handle even if reading/writing fails
+        with open(".gitattributes", "r") as f:
+            for line in f:
+                if line.strip().startswith(versionfile_source):
+                    if "export-subst" in line.strip().split()[1:]:
+                        present = True
+    except EnvironmentError:
+        pass
+    if not present:
+        # append, so any existing attributes are preserved
+        with open(".gitattributes", "a+") as f:
+            f.write("%s export-subst\n" % versionfile_source)
+        files.append(".gitattributes")
+    run_command(GITS, ["add", "--"] + files)
+
+
+def versions_from_parentdir(parentdir_prefix, root, verbose):
+    """Derive the version from the name of a parent directory.
+
+    Source tarballs conventionally unpack into a directory named after the
+    project and its version. The root directory is checked first, then up to
+    two levels above it; NotThisMethod is raised when none of them matches.
+    """
+    tried = []
+    candidate = root
+
+    for _ in range(3):
+        name = os.path.basename(candidate)
+        if name.startswith(parentdir_prefix):
+            return {"version": name[len(parentdir_prefix):],
+                    "full-revisionid": None,
+                    "dirty": False, "error": None, "date": None}
+        tried.append(candidate)
+        candidate = os.path.dirname(candidate)  # up a level
+
+    if verbose:
+        print("Tried directories %s but none started with prefix %s" %
+              (str(tried), parentdir_prefix))
+    raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
+
+# Template for the static _version.py; %s receives the JSON version dict.
+SHORT_VERSION_PY = """
+# This file was generated by 'versioneer.py' (0.18) from
+# revision-control system data, or from the parent directory name of an
+# unpacked source archive. Distribution tarballs contain a pre-generated copy
+# of this file.
+
+import json
+
+version_json = '''
+%s
+'''  # END VERSION_JSON
+
+
+def get_versions():
+    return json.loads(version_json)
+"""
+
+
+def versions_from_file(filename):
+    """Read the version dict out of a generated _version.py, if present."""
+    try:
+        with open(filename) as fh:
+            text = fh.read()
+    except EnvironmentError:
+        raise NotThisMethod("unable to read _version.py")
+
+    # the JSON payload may follow the opening quotes after LF or CRLF
+    for eol in (r"\n", r"\r\n"):
+        pattern = r"version_json = '''" + eol + r"(.*)'''  # END VERSION_JSON"
+        mo = re.search(pattern, text, re.M | re.S)
+        if mo:
+            return json.loads(mo.group(1))
+    raise NotThisMethod("no version_json in _version.py")
+
+
+def write_to_version_file(filename, versions):
+    """Replace the _version.py at filename with a static copy of versions."""
+    # drop the old file first so a fresh file is created by the open() below
+    os.unlink(filename)
+    payload = json.dumps(versions, sort_keys=True,
+                         indent=1, separators=(",", ": "))
+    with open(filename, "w") as out:
+        out.write(SHORT_VERSION_PY % payload)
+    print("set %s to '%s'" % (filename, versions["version"]))
+
+
+def plus_or_dot(pieces):
+    """Return "." if the closest tag already contains a "+", else "+"."""
+    # "closest-tag" may be present but None (no tag found): treat it as ""
+    closest_tag = pieces.get("closest-tag") or ""
+    return "." if "+" in closest_tag else "+"
+
+
+def render_pep440(pieces):
+    """Render a PEP 440 version with a "local version identifier".
+
+    Format: TAG[+DISTANCE.gHEX[.dirty]] . A tagged build that is later
+    dirtied renders as TAG+0.gHEX.dirty
+
+    Exceptions:
+    1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
+    """
+    tag = pieces["closest-tag"]
+    distance, dirty = pieces["distance"], pieces["dirty"]
+    dirty_suffix = ".dirty" if dirty else ""
+
+    if not tag:
+        # exception #1
+        local = "0+untagged.%d.g%s" % (distance, pieces["short"])
+        return local + dirty_suffix
+    if not (distance or dirty):
+        # clean build of the tagged commit: the bare tag is the version
+        return tag
+    # commits past the tag and/or local edits go into the local segment
+    local = "%d.g%s" % (distance, pieces["short"])
+    return tag + plus_or_dot(pieces) + local + dirty_suffix
+
+
+def render_pep440_pre(pieces):
+    """Render TAG[.post.devDISTANCE] -- dirtiness is not reported.
+
+    Exceptions:
+    1: no tags. 0.post.devDISTANCE
+    """
+    tag = pieces["closest-tag"]
+    distance = pieces["distance"]
+    if not tag:
+        # exception #1
+        return "0.post.dev%d" % distance
+    if not distance:
+        return tag
+    return tag + ".post.dev%d" % distance
+
+
+def render_pep440_post(pieces):
+    """Render TAG[.postDISTANCE[.dev0]+gHEX] .
+
+    A ".dev0" segment marks a dirty tree. Note that .dev0 sorts backwards
+    (a dirty tree will appear "older" than the corresponding clean one),
+    but you shouldn't be releasing software with -dirty anyways.
+
+    Exceptions:
+    1: no tags. 0.postDISTANCE[.dev0]+gHEX
+    """
+    tag = pieces["closest-tag"]
+    distance, dirty = pieces["distance"], pieces["dirty"]
+    dev_marker = ".dev0" if dirty else ""
+
+    if not tag:
+        # exception #1
+        post = "0.post%d" % distance
+        return post + dev_marker + "+g%s" % pieces["short"]
+    if not (distance or dirty):
+        # clean build of the tagged commit: the bare tag is the version
+        return tag
+    # the hash lands in the local segment, introduced by "+" unless the
+    # tag already carries one, in which case "." is used
+    post = tag + ".post%d" % distance + dev_marker
+    return post + plus_or_dot(pieces) + "g%s" % pieces["short"]
+
+
+def render_pep440_old(pieces):
+    """Render TAG[.postDISTANCE[.dev0]] .
+
+    A ".dev0" segment marks a dirty tree.
+
+    Exceptions:
+    1: no tags. 0.postDISTANCE[.dev0]
+    """
+    tag = pieces["closest-tag"]
+    distance, dirty = pieces["distance"], pieces["dirty"]
+    dev_marker = ".dev0" if dirty else ""
+
+    if not tag:
+        # exception #1
+        return "0.post%d" % distance + dev_marker
+    if not (distance or dirty):
+        # clean build of the tagged commit
+        return tag
+    # past the tag and/or dirty: add the post-release segment
+    return tag + ".post%d" % distance + dev_marker
+
+
+def render_git_describe(pieces):
+    """Render TAG[-DISTANCE-gHEX][-dirty].
+
+    Mirrors 'git describe --tags --dirty --always'.
+
+    Exceptions:
+    1: no tags. HEX[-dirty]  (note: no 'g' prefix)
+    """
+    tag = pieces["closest-tag"]
+    if not tag:
+        # exception #1
+        described = pieces["short"]
+    elif pieces["distance"]:
+        described = tag + "-%d-g%s" % (pieces["distance"], pieces["short"])
+    else:
+        described = tag
+    suffix = "-dirty" if pieces["dirty"] else ""
+    return described + suffix
+
+
+def render_git_describe_long(pieces):
+    """Render TAG-DISTANCE-gHEX[-dirty].
+
+    Mirrors 'git describe --tags --dirty --always --long': the distance and
+    hash are emitted unconditionally.
+
+    Exceptions:
+    1: no tags. HEX[-dirty]  (note: no 'g' prefix)
+    """
+    tag = pieces["closest-tag"]
+    if tag:
+        described = tag + "-%d-g%s" % (pieces["distance"], pieces["short"])
+    else:
+        # exception #1
+        described = pieces["short"]
+    # the -dirty marker is appended in both cases
+    suffix = "-dirty" if pieces["dirty"] else ""
+    return described + suffix
+
+
+def render(pieces, style):
+    """Render the given version pieces into the requested style.
+
+    Returns a version dict; pieces carrying an "error" yield version
+    "unknown". An unrecognized style raises ValueError.
+    """
+    if pieces["error"]:
+        return {"version": "unknown",
+                "full-revisionid": pieces.get("long"),
+                "dirty": None,
+                "error": pieces["error"],
+                "date": None}
+
+    renderers = {
+        "pep440": render_pep440,
+        "pep440-pre": render_pep440_pre,
+        "pep440-post": render_pep440_post,
+        "pep440-old": render_pep440_old,
+        "git-describe": render_git_describe,
+        "git-describe-long": render_git_describe_long,
+    }
+    if not style or style == "default":
+        style = "pep440"  # the default
+    if style not in renderers:
+        raise ValueError("unknown style '%s'" % style)
+    rendered = renderers[style](pieces)
+
+    return {"version": rendered, "full-revisionid": pieces["long"],
+            "dirty": pieces["dirty"], "error": None,
+            "date": pieces.get("date")}
+
+
+class VersioneerBadRootError(Exception):
+    """Error: the project root directory is unknown or missing key files."""
+
+
+def get_versions(verbose=False):
+    """Get the project version from whatever source is available.
+
+    Returns a dict: 'version', 'full-revisionid', 'dirty', 'error', 'date'.
+    """
+    if "versioneer" in sys.modules:
+        # see the discussion in cmdclass.py:get_cmdclass()
+        del sys.modules["versioneer"]
+
+    root = get_root()
+    cfg = get_config_from_root(root)
+
+    assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg"
+    handlers = HANDLERS.get(cfg.VCS)
+    assert handlers, "unrecognized VCS '%s'" % cfg.VCS
+    verbose = verbose or cfg.verbose
+    assert cfg.versionfile_source is not None, \
+        "please set versioneer.versionfile_source"
+    assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix"
+
+    versionfile_abs = os.path.join(root, cfg.versionfile_source)
+
+    # extract version from first of: expanded keywords, _version.py, VCS
+    # command (e.g. 'git describe'), parentdir. This is meant to work for
+    # developers using a source checkout, for users of a tarball created by
+    # 'setup.py sdist', and for users of a tarball/zipball created by 'git
+    # archive' or github's download-from-tag feature or other VCS equivalent.
+
+    get_keywords_f = handlers.get("get_keywords")
+    from_keywords_f = handlers.get("keywords")
+    if get_keywords_f and from_keywords_f:
+        try:
+            keywords = get_keywords_f(versionfile_abs)
+            ver = from_keywords_f(keywords, cfg.tag_prefix, verbose)
+            if verbose:
+                print("got version from expanded keyword %s" % ver)
+            return ver
+        except NotThisMethod:
+            pass
+
+    try:
+        ver = versions_from_file(versionfile_abs)
+        if verbose:
+            print("got version from file %s %s" % (versionfile_abs, ver))
+        return ver
+    except NotThisMethod:
+        pass
+
+    from_vcs_f = handlers.get("pieces_from_vcs")
+    if from_vcs_f:
+        try:
+            pieces = from_vcs_f(cfg.tag_prefix, root, verbose)
+            ver = render(pieces, cfg.style)
+            if verbose:
+                print("got version from VCS %s" % ver)
+            return ver
+        except NotThisMethod:
+            pass
+
+    try:
+        if cfg.parentdir_prefix:
+            ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
+            if verbose:
+                print("got version from parentdir %s" % ver)
+            return ver
+    except NotThisMethod:
+        pass
+
+    if verbose:
+        print("unable to compute version")
+
+    return {"version": "0+unknown", "full-revisionid": None,
+            "dirty": None, "error": "unable to compute version",
+            "date": None}
+
+
+def get_version():
+    """Return only the "version" string from the get_versions() dict."""
+    return get_versions()["version"]
+
+
+def get_cmdclass():
+    """Get the custom setuptools/distutils subclasses used by Versioneer."""
+    if "versioneer" in sys.modules:
+        del sys.modules["versioneer"]
+        # this fixes the "python setup.py develop" case (also 'install' and
+        # 'easy_install .'), in which subdependencies of the main project are
+        # built (using setup.py bdist_egg) in the same python process. Assume
+        # a main project A and a dependency B, which use different versions
+        # of Versioneer. A's setup.py imports A's Versioneer, leaving it in
+        # sys.modules by the time B's setup.py is executed, causing B to run
+        # with the wrong versioneer. Setuptools wraps the sub-dep builds in a
+        # sandbox that restores sys.modules to it's pre-build state, so the
+        # parent is protected against the child's "import versioneer". By
+        # removing ourselves from sys.modules here, before the child build
+        # happens, we protect the child from the parent's versioneer too.
+        # Also see https://github.com/warner/python-versioneer/issues/52
+
+    cmds = {}
+
+    # we add "version" to both distutils and setuptools
+    from distutils.core import Command
+
+    class cmd_version(Command):
+        description = "report generated version string"
+        user_options = []
+        boolean_options = []
+
+        def initialize_options(self):
+            pass
+
+        def finalize_options(self):
+            pass
+
+        def run(self):
+            vers = get_versions(verbose=True)
+            print("Version: %s" % vers["version"])
+            print(" full-revisionid: %s" % vers.get("full-revisionid"))
+            print(" dirty: %s" % vers.get("dirty"))
+            print(" date: %s" % vers.get("date"))
+            if vers["error"]:
+                print(" error: %s" % vers["error"])
+    cmds["version"] = cmd_version
+
+    # we override "build_py" in both distutils and setuptools
+    #
+    # most invocation pathways end up running build_py:
+    #  distutils/build -> build_py
+    #  distutils/install -> distutils/build ->..
+    #  setuptools/bdist_wheel -> distutils/install ->..
+    #  setuptools/bdist_egg -> distutils/install_lib -> build_py
+    #  setuptools/install -> bdist_egg ->..
+    #  setuptools/develop -> ?
+    #  pip install:
+    #   copies source tree to a tempdir before running egg_info/etc
+    #   if .git isn't copied too, 'git describe' will fail
+    #   then does setup.py bdist_wheel, or sometimes setup.py install
+    #  setup.py egg_info -> ?
+
+    # we override different "build_py" commands for both environments
+    if "setuptools" in sys.modules:
+        from setuptools.command.build_py import build_py as _build_py
+    else:
+        from distutils.command.build_py import build_py as _build_py
+
+    class cmd_build_py(_build_py):
+        def run(self):
+            root = get_root()
+            cfg = get_config_from_root(root)
+            versions = get_versions()
+            _build_py.run(self)
+            # now locate _version.py in the new build/ directory and replace
+            # it with an updated value
+            if cfg.versionfile_build:
+                target_versionfile = os.path.join(self.build_lib,
+                                                  cfg.versionfile_build)
+                print("UPDATING %s" % target_versionfile)
+                write_to_version_file(target_versionfile, versions)
+    cmds["build_py"] = cmd_build_py
+
+    if "cx_Freeze" in sys.modules:  # cx_freeze enabled?
+        from cx_Freeze.dist import build_exe as _build_exe
+        # nczeczulin reports that py2exe won't like the pep440-style string
+        # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g.
+        # setup(console=[{
+        #   "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION
+        #   "product_version": versioneer.get_version(),
+        #   ...
+
+        class cmd_build_exe(_build_exe):
+            """cx_Freeze build_exe that builds against a frozen version file.
+
+            For the duration of the build, ``versionfile_source`` is
+            replaced by a file written by write_to_version_file(); once the
+            build is over the file is regenerated from the LONG_VERSION_PY
+            template for the configured VCS.
+            """
+
+            def run(self):
+                """Freeze the version file, run build_exe, restore the file.
+
+                The restore step sits in a ``finally`` clause so that a
+                failing build does not leave the frozen version file behind
+                in the source tree.  Any exception raised by the base
+                command still propagates to the caller.
+                """
+                root = get_root()
+                cfg = get_config_from_root(root)
+                versions = get_versions()
+                target_versionfile = cfg.versionfile_source
+                print("UPDATING %s" % target_versionfile)
+                write_to_version_file(target_versionfile, versions)
+
+                try:
+                    _build_exe.run(self)
+                finally:
+                    # put the template-based _version.py back whether or
+                    # not the build succeeded
+                    os.unlink(target_versionfile)
+                    with open(cfg.versionfile_source, "w") as f:
+                        LONG = LONG_VERSION_PY[cfg.VCS]
+                        f.write(LONG %
+                                {"DOLLAR": "$",
+                                 "STYLE": cfg.style,
+                                 "TAG_PREFIX": cfg.tag_prefix,
+                                 "PARENTDIR_PREFIX": cfg.parentdir_prefix,
+                                 "VERSIONFILE_SOURCE": cfg.versionfile_source,
+                                 })
+        cmds["build_exe"] = cmd_build_exe
+        del cmds["build_py"]
+
+    if 'py2exe' in sys.modules:  # py2exe enabled?
+        try:
+            from py2exe.distutils_buildexe import py2exe as _py2exe  # py3
+        except ImportError:
+            from py2exe.build_exe import py2exe as _py2exe  # py2
+
+        class cmd_py2exe(_py2exe):
+            """py2exe command that builds against a frozen version file.
+
+            For the duration of the build, ``versionfile_source`` is
+            replaced by a file written by write_to_version_file(); once the
+            build is over the file is regenerated from the LONG_VERSION_PY
+            template for the configured VCS.
+            """
+
+            def run(self):
+                """Freeze the version file, run py2exe, restore the file.
+
+                The restore step sits in a ``finally`` clause so that a
+                failing build does not leave the frozen version file behind
+                in the source tree.  Any exception raised by the base
+                command still propagates to the caller.
+                """
+                root = get_root()
+                cfg = get_config_from_root(root)
+                versions = get_versions()
+                target_versionfile = cfg.versionfile_source
+                print("UPDATING %s" % target_versionfile)
+                write_to_version_file(target_versionfile, versions)
+
+                try:
+                    _py2exe.run(self)
+                finally:
+                    # put the template-based _version.py back whether or
+                    # not the build succeeded
+                    os.unlink(target_versionfile)
+                    with open(cfg.versionfile_source, "w") as f:
+                        LONG = LONG_VERSION_PY[cfg.VCS]
+                        f.write(LONG %
+                                {"DOLLAR": "$",
+                                 "STYLE": cfg.style,
+                                 "TAG_PREFIX": cfg.tag_prefix,
+                                 "PARENTDIR_PREFIX": cfg.parentdir_prefix,
+                                 "VERSIONFILE_SOURCE": cfg.versionfile_source,
+                                 })
+        cmds["py2exe"] = cmd_py2exe
+
+    # we override different "sdist" commands for both environments
+    if "setuptools" in sys.modules:
+        from setuptools.command.sdist import sdist as _sdist
+    else:
+        from distutils.command.sdist import sdist as _sdist
+
+    class cmd_sdist(_sdist):
+        """sdist variant that rewrites the version file in the release tree."""
+
+        def run(self):
+            """Compute the version, record it on the metadata, run sdist."""
+            computed = get_versions()
+            # kept on the instance so make_release_tree() can read it back
+            self._versioneer_generated_versions = computed
+            # the distribution metadata has to be refreshed too, otherwise
+            # the command keeps using the old version
+            self.distribution.metadata.version = computed["version"]
+            return _sdist.run(self)
+
+        def make_release_tree(self, base_dir, files):
+            """Create the release tree, then replace its version file.
+
+            ``base_dir`` and ``files`` are passed through unchanged to the
+            base class implementation.
+            """
+            config = get_config_from_root(get_root())
+            _sdist.make_release_tree(self, base_dir, files)
+            # the _version.py now sitting under base_dir may be a hardlink
+            # to the source file; swap in a file with the updated values
+            release_versionfile = os.path.join(base_dir,
+                                               config.versionfile_source)
+            print("UPDATING %s" % release_versionfile)
+            write_to_version_file(release_versionfile,
+                                  self._versioneer_generated_versions)
+    cmds["sdist"] = cmd_sdist
+    return cmds
+
+
+# Help text that do_setup() prints to stderr when the [versioneer]
+# configuration cannot be loaded from setup.cfg.
+CONFIG_ERROR = """
+setup.cfg is missing the necessary Versioneer configuration. You need
+a section like:
+
+ [versioneer]
+ VCS = git
+ style = pep440
+ versionfile_source = src/myproject/_version.py
+ versionfile_build = myproject/_version.py
+ tag_prefix =
+ parentdir_prefix = myproject-
+
+You will also need to edit your setup.py to use the results:
+
+ import versioneer
+ setup(version=versioneer.get_version(),
+       cmdclass=versioneer.get_cmdclass(), ...)
+
+Please read the docstring in ./versioneer.py for configuration instructions,
+edit setup.cfg, and re-run the installer or 'python versioneer.py setup'.
+"""
+
+# Commented-out [versioneer] section skeleton that do_setup() appends to
+# setup.cfg when loading the configuration fails with an EnvironmentError
+# or a missing section.
+SAMPLE_CONFIG = """
+# See the docstring in versioneer.py for instructions. Note that you must
+# re-run 'versioneer.py setup' after changing this section, and commit the
+# resulting files.
+
+[versioneer]
+#VCS = git
+#style = pep440
+#versionfile_source =
+#versionfile_build =
+#tag_prefix =
+#parentdir_prefix =
+
+"""
+
+# Code that do_setup() appends to the __init__.py located next to
+# versionfile_source, unless that file already contains this exact text.
+INIT_PY_SNIPPET = """
+from ._version import get_versions
+__version__ = get_versions()['version']
+del get_versions
+"""
+
+
+def do_setup():
+    """Install Versioneer into the project tree (VCS-independent part).
+
+    Loads the configuration with get_config_from_root().  If that raises
+    EnvironmentError, NoSectionError or NoOptionError, CONFIG_ERROR is
+    printed to stderr and 1 is returned; for the first two exception types
+    SAMPLE_CONFIG is appended to setup.cfg beforehand.
+
+    Otherwise the function writes the ``versionfile_source`` file from the
+    LONG_VERSION_PY template, appends INIT_PY_SNIPPET to the neighbouring
+    __init__.py when that file exists and lacks the snippet, makes sure
+    MANIFEST.in has 'include' entries for versioneer.py and the version
+    file, calls do_vcs_install(), and returns 0.
+    """
+    project_root = get_root()
+    try:
+        cfg = get_config_from_root(project_root)
+    except (EnvironmentError, configparser.NoSectionError,
+            configparser.NoOptionError) as err:
+        wants_sample = isinstance(
+            err, (EnvironmentError, configparser.NoSectionError))
+        if wants_sample:
+            print("Adding sample versioneer config to setup.cfg",
+                  file=sys.stderr)
+            setup_cfg = os.path.join(project_root, "setup.cfg")
+            with open(setup_cfg, "a") as cfg_file:
+                cfg_file.write(SAMPLE_CONFIG)
+        print(CONFIG_ERROR, file=sys.stderr)
+        return 1
+
+    version_path = cfg.versionfile_source
+    print(" creating %s" % version_path)
+    with open(version_path, "w") as out:
+        template = LONG_VERSION_PY[cfg.VCS]
+        substitutions = {
+            "DOLLAR": "$",
+            "STYLE": cfg.style,
+            "TAG_PREFIX": cfg.tag_prefix,
+            "PARENTDIR_PREFIX": cfg.parentdir_prefix,
+            "VERSIONFILE_SOURCE": cfg.versionfile_source,
+        }
+        out.write(template % substitutions)
+
+    # The __init__.py that lives beside the version file gets the snippet
+    # appended, but only if the file is already there.
+    init_py = os.path.join(os.path.dirname(version_path), "__init__.py")
+    if not os.path.exists(init_py):
+        print(" %s doesn't exist, ok" % init_py)
+        init_py = None
+    else:
+        try:
+            with open(init_py, "r") as existing:
+                current_text = existing.read()
+        except EnvironmentError:
+            # unreadable: treat it as not containing the snippet
+            current_text = ""
+        if INIT_PY_SNIPPET in current_text:
+            print(" %s unmodified" % init_py)
+        else:
+            print(" appending to %s" % init_py)
+            with open(init_py, "a") as existing:
+                existing.write(INIT_PY_SNIPPET)
+
+    # Both the top-level "versioneer.py" and versionfile_source
+    # (PKG/_version.py, used by runtime code) have to be listed in
+    # MANIFEST.in so that they end up in source distributions; pip cannot
+    # install the package otherwise.
+    manifest_in = os.path.join(project_root, "MANIFEST.in")
+    listed = set()
+    try:
+        with open(manifest_in, "r") as manifest:
+            for entry in manifest:
+                if not entry.startswith("include "):
+                    continue
+                listed.update(entry.split()[1:])
+    except EnvironmentError:
+        pass
+    # Only plain 'include' lines are recognised here, which is far from
+    # everything MANIFEST.in supports
+    # (http://docs.python.org/2/distutils/sourcedist.html#commands), so
+    # false negatives are possible. A redundant 'include' line does no
+    # harm, though.
+    if "versioneer.py" in listed:
+        print(" 'versioneer.py' already in MANIFEST.in")
+    else:
+        print(" appending 'versioneer.py' to MANIFEST.in")
+        with open(manifest_in, "a") as manifest:
+            manifest.write("include versioneer.py\n")
+    if version_path in listed:
+        print(" versionfile_source already in MANIFEST.in")
+    else:
+        print(" appending versionfile_source ('%s') to MANIFEST.in" %
+              version_path)
+        with open(manifest_in, "a") as manifest:
+            manifest.write("include %s\n" % version_path)
+
+    # Finish with the VCS-specific changes. For git that means
+    # creating/changing .gitattributes to mark _version.py for
+    # export-subst keyword substitution.
+    do_vcs_install(manifest_in, version_path, init_py)
+    return 0
+
+
+def scan_setup_py():
+    """Check ./setup.py for the pieces Versioneer expects to find in it.
+
+    Each line of "setup.py" (opened relative to the current directory) is
+    searched for three required snippets and for two obsolete
+    'versioneer.<name>' setter references.  A notice is printed for each
+    kind of problem found.
+
+    Returns the number of problem kinds detected: 0, 1 or 2.
+    """
+    required_markers = (
+        ("import versioneer", "import"),
+        ("versioneer.get_cmdclass()", "cmdclass"),
+        ("versioneer.get_version()", "get_version"),
+    )
+    obsolete_markers = ("versioneer.VCS", "versioneer.versionfile_source")
+
+    seen = set()
+    has_setters = False
+    with open("setup.py", "r") as setup_py:
+        for text in setup_py:
+            for needle, tag in required_markers:
+                if needle in text:
+                    seen.add(tag)
+            for needle in obsolete_markers:
+                if needle in text:
+                    has_setters = True
+
+    problems = 0
+    if len(seen) != len(required_markers):
+        for message in (
+                "",
+                "Your setup.py appears to be missing some important items",
+                "(but I might be wrong). Please make sure it has something",
+                "roughly like the following:",
+                "",
+                " import versioneer",
+                " setup( version=versioneer.get_version(),",
+                "        cmdclass=versioneer.get_cmdclass(),  ...)",
+                ""):
+            print(message)
+        problems += 1
+    if has_setters:
+        for message in (
+                "You should remove lines like 'versioneer.VCS = ' and",
+                "'versioneer.versionfile_source = ' . This configuration",
+                "now lives in setup.cfg, and should be removed from setup.py",
+                ""):
+            print(message)
+        problems += 1
+    return problems
+
+
+if __name__ == "__main__":
+    # Command-line entry point.  The only command acted upon is "setup",
+    # which runs do_setup() followed by scan_setup_py() and exits with
+    # status 1 if either reported a problem.
+    if len(sys.argv) < 2:
+        # No command given: show a usage hint instead of failing with an
+        # IndexError traceback.  sys.exit() with a string prints it to
+        # stderr and exits with status 1, so the exit status stays
+        # non-zero as before.
+        sys.exit("usage: python versioneer.py setup")
+    cmd = sys.argv[1]
+    if cmd == "setup":
+        errors = do_setup()
+        errors += scan_setup_py()
+        if errors:
+            sys.exit(1)