From 92357d1691a776d3c9887c34f83ce8efcb150ff6 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 9 Aug 2016 14:14:14 -0400 Subject: [PATCH 001/597] Add a check for a recent addition in libcheck to filter out old versions that can cause link failures in the tests. --- tests/CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 62e27f58bf..207db2f7aa 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -15,6 +15,14 @@ if(NOT CHECK_FOUND) endif() +if(CHECK_FOUND) + set(CMAKE_REQUIRED_INCLUDE ${CHECK_INCLUDE_DIRS}) + CHECK_SYMBOL_EXISTS(ck_assert_ptr_ne "check.h" CHECK_FUNCS) + if (NOT CHECK_FUNCS) + set(CHECK_FOUND 0) + endif() +endif() + if(CHECK_FOUND) enable_testing() From 466e22c6efd3e4567b4d816a120bdd71c36943dd Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 9 Aug 2016 18:33:56 -0400 Subject: [PATCH 002/597] Add extensions for ipc memory handles. --- src/gpuarray_buffer_cuda.c | 29 +++++++++++++++++++++++++++++ src/gpuarray_extension.c | 4 ++++ src/private_cuda.h | 5 +++-- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index cfbc1e672f..14c68ee926 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -554,6 +554,32 @@ static gpudata *cuda_alloc(gpucontext *c, size_t size, void *data, int flags, return res; } +CUipcMemHandle cuda_get_ipc_handle(gpudata *d) { + CUipcMemHandle h = NULL; + + ASSERT_BUF(d); + cuda_enter(d->ctx); + d->ctx->err = cuIpcGetMemHandle(&h, d->ptr); + cuda_exit(d->ctx); + return h; +} + +gpudata *cuda_open_ipc_handle(gpucontext *c, CUipcMemHandle h, size_t sz) { + CUdeviceptr p; + cuda_context *ctx = (cuda_context *)c; + gpudata *d = NULL; + + cuda_enter(ctx); + ctx->err = cuIpcOpenMemHandle(&p, h, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + if (ctx->err == CUDA_SUCCESS) { + d = cuda_make_buf(ctx, p, sz); + if (d != NULL) + d->flags |= CUDA_IPC_MEMORY; + } + 
cuda_exit(ctx); + return d; +} + static void cuda_retain(gpudata *d) { ASSERT_BUF(d); d->refcnt++; @@ -579,6 +605,9 @@ static void cuda_free(gpudata *d) { if (d->flags & DONTFREE) { /* This is the path for "external" buffers */ deallocate(d); + } else if (d->flags & CUDA_IPC_MEMORY) { + cuIpcCloseMemHandle(d->ptr); + deallocate(d); } else if (ctx->flags & GA_CTX_DISABLE_ALLOCATION_CACHE) { /* Just free the pointer */ cuMemFree(d->ptr); diff --git a/src/gpuarray_extension.c b/src/gpuarray_extension.c index 73d63ab656..1904c3a8f4 100644 --- a/src/gpuarray_extension.c +++ b/src/gpuarray_extension.c @@ -16,6 +16,8 @@ extern void *cuda_make_buf(void); extern void *cuda_get_sz(void); extern void *cuda_wait(void); extern void *cuda_record(void); +extern void *cuda_get_ipc_handle(void); +extern void *cuda_open_ipc_handle(void); #endif #ifdef WITH_OPENCL extern void *cl_make_ctx(void); @@ -34,6 +36,8 @@ static ext ext_list[] = { {"cuda_get_sz", cuda_get_sz}, {"cuda_wait", cuda_wait}, {"cuda_record", cuda_record}, + {"cuda_get_ipc_handle", cuda_get_ipc_handle}, + {"cuda_open_ipc_handle", cuda_open_ipc_handle}, #endif #ifdef WITH_OPENCL {"cl_make_ctx", cl_make_ctx}, diff --git a/src/private_cuda.h b/src/private_cuda.h index 642a9991a4..79990b4dfd 100644 --- a/src/private_cuda.h +++ b/src/private_cuda.h @@ -135,8 +135,9 @@ GPUARRAY_LOCAL int cuda_record(gpudata *, int); #define CUDA_WAIT_ALL (CUDA_WAIT_READ|CUDA_WAIT_WRITE) -#define CUDA_HEAD_ALLOC 0x40000 -#define CUDA_MAPPED_PTR 0x80000 +#define CUDA_IPC_MEMORY 0x100000 +#define CUDA_HEAD_ALLOC 0x200000 +#define CUDA_MAPPED_PTR 0x400000 struct _gpukernel { cuda_context *ctx; /* Keep the context first */ From c68375622fddf64279c7bda37bd21d958e93ee14 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 9 Aug 2016 18:34:18 -0400 Subject: [PATCH 003/597] Update ext_cuda.h. 
--- src/gpuarray/ext_cuda.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/gpuarray/ext_cuda.h b/src/gpuarray/ext_cuda.h index 2d6a9814cd..689d0aa60a 100644 --- a/src/gpuarray/ext_cuda.h +++ b/src/gpuarray/ext_cuda.h @@ -16,10 +16,12 @@ static void (*cuda_exit)(gpucontext *); static gpucontext *(*cuda_make_ctx)(CUcontext, int); static CUstream (*cuda_get_stream)(void *); static gpudata *(*cuda_make_buf)(void *, CUdeviceptr, size_t); -static CUdeviceptr (*cuda_get_ptr)(gpudata *); static size_t (*cuda_get_sz)(gpudata *); static int (*cuda_wait)(gpudata *, int); static int (*cuda_record)(gpudata *, int); +static CUipcMemHandle (*cuda_get_ipc_handle)(gpudata *d); +static gpudata *(*cuda_open_ipc_handle)(gpucontext *c, CUipcMemHandle h, + size_t sz); static void setup_ext_cuda(void) { // The casts are necessary to reassure C++ compilers @@ -28,10 +30,11 @@ static void setup_ext_cuda(void) { cuda_make_ctx = (gpucontext *(*)(CUcontext, int))gpuarray_get_extension("cuda_make_ctx"); cuda_get_stream = (CUstream (*)(void *))gpuarray_get_extension("cuda_get_stream"); cuda_make_buf = (gpudata *(*)(void *, CUdeviceptr, size_t))gpuarray_get_extension("cuda_make_buf"); - cuda_get_ptr = (CUdeviceptr (*)(gpudata *))gpuarray_get_extension("cuda_get_ptr"); cuda_get_sz = (size_t (*)(gpudata *))gpuarray_get_extension("cuda_get_sz"); cuda_wait = (int (*)(gpudata *, int))gpuarray_get_extension("cuda_wait"); cuda_record = (int (*)(gpudata *, int))gpuarray_get_extension("cuda_record"); + cuda_get_ipc_handle = (CUipcMemHandle (*)(gpudata *))gpuarray_get_extension("cuda_get_ipc_handle"); + cuda_open_ipc_handle = (gpudata *(*)(gpucontext *c, CUipcMemHandle h, size_t sz))gpuarray_get_extension("cuda_open_ipc_handle"); } #ifdef __cplusplus From 72b26fd014a87843e3d3307e774401db49a12fff Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 10 Aug 2016 12:39:03 -0400 Subject: [PATCH 004/597] Make a fake type to avoid dependency on cuda.h when using the IPC api. 
--- src/gpuarray/extension.h | 4 ++++ src/gpuarray_buffer_cuda.c | 23 +++++++++++++++++------ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/gpuarray/extension.h b/src/gpuarray/extension.h index b26b5231e5..ec92098c3f 100644 --- a/src/gpuarray/extension.h +++ b/src/gpuarray/extension.h @@ -19,6 +19,10 @@ extern "C" { #define GPUARRAY_CUDA_WAIT_READ 0x10000 /* CUDA_WAIT_READ */ #define GPUARRAY_CUDA_WAIT_WRITE 0x20000 /* CUDA_WAIT_WRITE */ +typedef struct _GpuArrayIpcMemHandle { + char private[64]; +} GpuArrayIpcMemHandle; + /** * Obtain a function pointer for an extension. * diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 14c68ee926..29209ab952 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -16,9 +16,15 @@ #include "gpuarray/buffer.h" #include "gpuarray/util.h" #include "gpuarray/error.h" -#include "gpuarray/extension.h" #include "gpuarray/buffer_blas.h" +#include "gpuarray/extension.h" + +STATIC_ASSERT(DONTFREE == GPUARRAY_CUDA_CTX_NOFREE, cuda_nofree_eq); +STATIC_ASSERT(CUDA_WAIT_READ == GPUARRAY_CUDA_WAIT_READ, cuda_wait_read_eq); +STATIC_ASSERT(CUDA_WAIT_WRITE == GPUARRAY_CUDA_WAIT_WRITE, cuda_wait_write_eq); +STATIC_ASSERT(sizeof(GpuArrayIpcMemHandle) == sizeof(CUipcMemHandle), cuda_ipcmem_eq); + /* Allocations will be made in blocks of at least this size */ #define BLOCK_SIZE (4 * 1024 * 1024) @@ -554,23 +560,28 @@ static gpudata *cuda_alloc(gpucontext *c, size_t size, void *data, int flags, return res; } -CUipcMemHandle cuda_get_ipc_handle(gpudata *d) { - CUipcMemHandle h = NULL; +GpuArrayIpcMemHandle cuda_get_ipc_handle(gpudata *d) { + CUipcMemHandle h = {{0}}; ASSERT_BUF(d); cuda_enter(d->ctx); d->ctx->err = cuIpcGetMemHandle(&h, d->ptr); cuda_exit(d->ctx); - return h; + /* You need to do this stupid dance because direct casting of + * structures is not allowed */ + return *((GpuArrayIpcMemHandle *)&h); } -gpudata *cuda_open_ipc_handle(gpucontext *c, CUipcMemHandle h, size_t sz) { 
+gpudata *cuda_open_ipc_handle(gpucontext *c, GpuArrayIpcMemHandle h, size_t sz) { CUdeviceptr p; cuda_context *ctx = (cuda_context *)c; gpudata *d = NULL; cuda_enter(ctx); - ctx->err = cuIpcOpenMemHandle(&p, h, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + /* You need to do this stupid dance because direct casting of + * structures is not allowed */ + ctx->err = cuIpcOpenMemHandle(&p, *((CUipcMemHandle *)&h), + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); if (ctx->err == CUDA_SUCCESS) { d = cuda_make_buf(ctx, p, sz); if (d != NULL) From ca1761d15ec25606da7055d8514aadf768b848bf Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 10 Aug 2016 16:01:45 -0400 Subject: [PATCH 005/597] Add python interface for ipc memory handles. --- pygpu/gpuarray.pxd | 3 +++ pygpu/gpuarray.pyx | 35 +++++++++++++++++++++++++++++++++++ src/gpuarray_buffer_cuda.c | 17 ++++++----------- 3 files changed, 44 insertions(+), 11 deletions(-) diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd index 1fcc5a0068..dd09afcbb2 100644 --- a/pygpu/gpuarray.pxd +++ b/pygpu/gpuarray.pxd @@ -196,6 +196,9 @@ cdef extern from "gpuarray/array.h": cdef extern from "gpuarray/extension.h": void *gpuarray_get_extension(const char *) + ctypedef struct GpuArrayIpcMemHandle: + pass + cdef int GPUARRAY_CUDA_CTX_NOFREE cdef type get_exc(int errcode) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 42f6f6d3b5..7a5098ad50 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -1424,6 +1424,28 @@ def _concatenate(list al, unsigned int axis, int restype, object cls, finally: PyMem_Free(als) +cdef int (*cuda_get_ipc_handle)(gpudata *, GpuArrayIpcMemHandle *) +cdef gpudata *(*cuda_open_ipc_handle)(gpucontext *, GpuArrayIpcMemHandle *, size_t) + +cuda_get_ipc_handle = gpuarray_get_extension("cuda_get_ipc_handle") +cuda_open_ipc_handle = gpuarray_get_extension("cuda_open_ipc_handle") + +def open_ipc_handle(GpuContext c, bytes hpy, size_t l): + """ + Open an IPC handle to get a new GpuArray from it. 
+ """ + cdef char *b + cdef GpuArrayIpcMemHandle h + cdef gpudata *d + + b = hpy + memcpy(&h, b, sizeof(h)) + + d = cuda_open_ipc_handle(c.ctx, &h, l) + if d is NULL: + raise GpuArrayException, "could not open handle" + return d + cdef class GpuArray: """ Device array @@ -1561,6 +1583,19 @@ cdef class GpuArray: raise ValueError, "GpuArray and Numpy array do not have the same size in bytes" array_read(np.PyArray_DATA(dst), sz, self) + def get_ipc_handle(self): + cdef GpuArrayIpcMemHandle h + cdef int err + if cuda_get_ipc_handle is NULL: + raise SystemError, "Could not get necessary extension" + if self.context.kind == 'cuda': + raise ValueError, "Only works for cuda contexts" + err = cuda_get_ipc_handle(self.ga.data, &h) + if err != GA_NO_ERROR: + raise get_exc(err), GpuArray_error(&self.ga, err) + res = (&h)[:sizeof(h)] + return res + def __array__(self): """ __array__() diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 29209ab952..67b931c894 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -560,27 +560,22 @@ static gpudata *cuda_alloc(gpucontext *c, size_t size, void *data, int flags, return res; } -GpuArrayIpcMemHandle cuda_get_ipc_handle(gpudata *d) { - CUipcMemHandle h = {{0}}; - +int cuda_get_ipc_handle(gpudata *d, GpuArrayIpcMemHandle *h) { ASSERT_BUF(d); cuda_enter(d->ctx); - d->ctx->err = cuIpcGetMemHandle(&h, d->ptr); + CUDA_EXIT_ON_ERROR(d->ctx, + cuIpcGetMemHandle((CUipcMemHandle *)h, d->ptr)); cuda_exit(d->ctx); - /* You need to do this stupid dance because direct casting of - * structures is not allowed */ - return *((GpuArrayIpcMemHandle *)&h); + return GA_NO_ERROR; } -gpudata *cuda_open_ipc_handle(gpucontext *c, GpuArrayIpcMemHandle h, size_t sz) { +gpudata *cuda_open_ipc_handle(gpucontext *c, GpuArrayIpcMemHandle *h, size_t sz) { CUdeviceptr p; cuda_context *ctx = (cuda_context *)c; gpudata *d = NULL; cuda_enter(ctx); - /* You need to do this stupid dance because direct casting of - * structures 
is not allowed */ - ctx->err = cuIpcOpenMemHandle(&p, *((CUipcMemHandle *)&h), + ctx->err = cuIpcOpenMemHandle(&p, *((CUipcMemHandle *)h), CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); if (ctx->err == CUDA_SUCCESS) { d = cuda_make_buf(ctx, p, sz); From 8de886278b1970f00db9c85ff492d05a08138107 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 11 Aug 2016 15:37:32 -0400 Subject: [PATCH 006/597] Don't use c++ keywords (grrrrr!) --- src/gpuarray/extension.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray/extension.h b/src/gpuarray/extension.h index ec92098c3f..6302cb3e33 100644 --- a/src/gpuarray/extension.h +++ b/src/gpuarray/extension.h @@ -20,7 +20,7 @@ extern "C" { #define GPUARRAY_CUDA_WAIT_WRITE 0x20000 /* CUDA_WAIT_WRITE */ typedef struct _GpuArrayIpcMemHandle { - char private[64]; + char priv[64]; } GpuArrayIpcMemHandle; /** From a67ed82be557176e97efbfe1bd682b23ee63ba6a Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 11 Aug 2016 15:53:01 -0400 Subject: [PATCH 007/597] Compare properly against 'cuda' --- pygpu/gpuarray.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 7a5098ad50..e9858996a5 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -1588,7 +1588,7 @@ cdef class GpuArray: cdef int err if cuda_get_ipc_handle is NULL: raise SystemError, "Could not get necessary extension" - if self.context.kind == 'cuda': + if self.context.kind != b'cuda': raise ValueError, "Only works for cuda contexts" err = cuda_get_ipc_handle(self.ga.data, &h) if err != GA_NO_ERROR: From d1b4705eecfb49ef546e7bd2074bb6ab9c859616 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 11 Aug 2016 16:01:00 -0400 Subject: [PATCH 008/597] Fix from_gpudata without strides. 
--- pygpu/gpuarray.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index e9858996a5..194cab35e0 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -839,7 +839,7 @@ def from_gpudata(size_t data, offset, dtype, shape, GpuContext context=None, else: size = gpuarray_get_elsize(typecode) for i in range(nd-1, -1, -1): - strides[i] = size + cstrides[i] = size size *= cdims[i] return pygpu_fromgpudata(data, offset, typecode, nd, cdims, From 18e0ef1b750028e9051ab22d93773fb2b13a0c7c Mon Sep 17 00:00:00 2001 From: Christos Tsirigotis Date: Sun, 14 Aug 2016 04:28:08 +0300 Subject: [PATCH 009/597] Add "mul" string as a collective reduce op --- pygpu/collectives.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pygpu/collectives.pyx b/pygpu/collectives.pyx index e75049a1c4..d41d4cebeb 100644 --- a/pygpu/collectives.pyx +++ b/pygpu/collectives.pyx @@ -289,6 +289,7 @@ cdef dict TO_RED_OP = { '*': GA_PROD, "prod": GA_PROD, "product": GA_PROD, + "mul": GA_PROD, "max": GA_MAX, "maximum": GA_MAX, "min": GA_MIN, From b63304a548e73bd6a9921d72ba31e3d362fead1e Mon Sep 17 00:00:00 2001 From: Christos Tsirigotis Date: Sun, 14 Aug 2016 04:46:02 +0300 Subject: [PATCH 010/597] Update docs to show collectives --- doc/installation.rst | 8 ++++++++ doc/pyapi.rst | 3 +++ 2 files changed, 11 insertions(+) diff --git a/doc/installation.rst b/doc/installation.rst index 10cfb682f5..3d948aa57d 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -19,6 +19,7 @@ Requirements - cmake >= 3.0 (cmake_). - a c99-compliant compiler (or MSVC if on windows). - (optional) CUDA >= 6.5 (cuda_). + - (optional) NVIDIA NCCL (nccl_). - (optional) OpenCL runtime. - (optional) clBLAS (clblas_). - (optional) libcheck (check_) to run the C tests. @@ -35,6 +36,11 @@ Requirements We support CUDA GPUs with `compute capability 2.0 (Fermi) `_ and up. +.. 
note:: + In the case you want to build with collective operation support for CUDA, + you will need CUDA GPUs with `compute capability 3.0 (Kepler) + `_ and up plus CUDA >= 7. + Download -------- @@ -211,6 +217,8 @@ you can confirm which device it is running on. .. _cuda: https://developer.nvidia.com/category/zone/cuda-zone +.. _nccl: https://github.com/NVIDIA/nccl + .. _check: http://check.sourceforge.net/ .. _python: http://python.org/ diff --git a/doc/pyapi.rst b/doc/pyapi.rst index 8a5b94837a..8e3f5b3c44 100644 --- a/doc/pyapi.rst +++ b/doc/pyapi.rst @@ -15,3 +15,6 @@ Python module reference .. automodule:: pygpu._array :members: + + .. automodule:: pygpu.collectives + :members: From f3004b59391e766a0c3cb97e37116d0a74c1081f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 15 Aug 2016 12:09:02 -0400 Subject: [PATCH 011/597] Skip the xsplit test when numpy is older than 1.11. --- pygpu/tests/test_operations.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pygpu/tests/test_operations.py b/pygpu/tests/test_operations.py index abf5e65d90..d043a73e96 100644 --- a/pygpu/tests/test_operations.py +++ b/pygpu/tests/test_operations.py @@ -1,7 +1,7 @@ import numpy import pygpu -from .support import (gen_gpuarray, context) +from .support import (gen_gpuarray, context, SkipTest) def test_array_split(): @@ -21,11 +21,15 @@ def test_array_split(): for pc, pg in zip(rc, rg): numpy.testing.assert_allclose(pc, numpy.asarray(pg)) + def test_split(): for spl in (3, [3, 5, 6, 10]): yield xsplit, '', (9,), spl + def test_xsplit(): + if numpy.version.version < '1.11': + raise SkipTest("Numpy version too old") for l in ('h', 'v'): for spl in (2, [3, 6]): yield xsplit, l, (4, 4), spl @@ -33,6 +37,7 @@ def test_xsplit(): for spl in (2, [3, 6]): yield xsplit, 'd', (2, 2, 4), spl + def xsplit(l, shp, spl): xc, xg = gen_gpuarray(shp, 'float32', ctx=context) n = l + 'split' From 1a6d7048c9a84573e600199417021f5825b880d0 Mon Sep 17 00:00:00 2001 From: Arnaud 
Bergeron Date: Mon, 15 Aug 2016 13:03:33 -0400 Subject: [PATCH 012/597] Better check for version. --- pygpu/tests/test_operations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygpu/tests/test_operations.py b/pygpu/tests/test_operations.py index d043a73e96..751450f638 100644 --- a/pygpu/tests/test_operations.py +++ b/pygpu/tests/test_operations.py @@ -28,7 +28,7 @@ def test_split(): def test_xsplit(): - if numpy.version.version < '1.11': + if tuple(int(v) for v in numpy.version.version.split('.')[:2]) < (1, 11): raise SkipTest("Numpy version too old") for l in ('h', 'v'): for spl in (2, [3, 6]): From 70afa2e92e00227a3ec321a276ab08865355d176 Mon Sep 17 00:00:00 2001 From: davidweichiang Date: Mon, 29 Aug 2016 15:45:37 -0400 Subject: [PATCH 013/597] make take1 kernel use actual typecode of ind --- src/gpuarray_array.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index 73a6f6f4f6..dd29487fae 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -370,8 +370,9 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str, atypes[apos++] = GA_SSIZE; atypes[apos++] = GA_SIZE; } - strb_appends(&sb, " GLOBAL_MEM const ga_ssize *ind, ga_size n0, ga_size n1," - " GLOBAL_MEM int* err) {\n"); + strb_appendf(&sb, " GLOBAL_MEM const %s *ind, ga_size n0, ga_size n1," + " GLOBAL_MEM int* err) {\n", + gpuarray_get_type(ind->typecode)->cluda_name); atypes[apos++] = GA_BUFFER; atypes[apos++] = GA_SIZE; atypes[apos++] = GA_SIZE; From 27e892c56a95de6eeee60d30d37c7428e0c00761 Mon Sep 17 00:00:00 2001 From: slefrancois Date: Wed, 7 Sep 2016 16:15:09 -0400 Subject: [PATCH 014/597] add jenkins PR test script --- .jenkins.sh | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 .jenkins.sh diff --git a/.jenkins.sh b/.jenkins.sh new file mode 100644 index 0000000000..e9428d4765 --- /dev/null +++ b/.jenkins.sh @@ -0,0 
+1,82 @@ +#!/bin/bash + +# Script for Jenkins continuous integration testing of libgpuarray + +# Print commands as they are executed +set -x + +# Anaconda python +export PATH=/usr/local/miniconda2/bin:$PATH + +# CUDA +export PATH=/usr/local/cuda/bin:$PATH +export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH +export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH + +GPUARRAY=none + +# Set some default values +: ${BUILDBOT_DIR:="$WORKSPACE/nightly_build"} # Jenkins workspace path +# Can also set to "Debug", "Release" to go faster +: ${GPUARRAY_CONFIG:="Release"} +# Set these to " " to disable (empty doesn't work) +: ${DEVICES_CUDA:="cuda"} # for multiple devices use "cuda0 cuda1" +: ${DEVICES_OPENCL:=" "} +# Parameters for nosetests +: ${NOSE_PARAM="-v --with-xunit --xunit-file="} + +date +hostname + +mkdir -p ${BUILDBOT_DIR} +cd ${BUILDBOT_DIR} + +# Make fresh clone (with no history since we don't need it) +rm -rf libgpuarray +git clone --depth 1 "https://github.com/Theano/libgpuarray.git" + +(cd libgpuarray && echo "libgpuarray commit" && git rev-parse HEAD) + +# Clean up previous installs (to make sure no old files are left) +rm -rf local +mkdir local + +# Build libgpuarray and run C tests +mkdir libgpuarray/build +(cd libgpuarray/build && cmake .. 
-DCMAKE_BUILD_TYPE=${GPUARRAY_CONFIG} -DCMAKE_INSTALL_PREFIX=${BUILDBOT_DIR}/local && make) + +# Test on different devices +for dev in ${DEVICES_CUDA}; do + echo "Testing libgpuarray for DEVICE=${dev}" + (cd libgpuarray/build && CK_DEFAULT_TIMEOUT=16 DEVICE=${dev} make test) +done +for dev in ${DEVICES_OPENCL}; do + echo "Testing libgpuarray for DEVICE=${dev}" + (cd libgpuarray/build && DEVICE=${dev} make test) +done + +# Finally install +(cd libgpuarray/build && make install) +export LD_LIBRARY_PATH=${BUILDBOT_DIR}/local/lib:${LD_LIBRARY_PATH} +export LIBRARY_PATH=${BUILDBOT_DIR}/local/lib:${LIBRARY_PATH} +export CPATH=${BUILDBOT_DIR}/local/include:${CPATH} + +# Build the pygpu modules +(cd libgpuarray && python setup.py build_ext --inplace -I${BUILDBOT_DIR}/local/include -L${BUILDBOT_DIR}/local/lib) + + +echo -n > ${BUILDBOT_DIR}/pygpu.log +# Test it +for dev in ${DEVICES_CUDA}; do + echo "Testing pygpu for DEVICE=${dev}" + test=${BUILDBOT_DIR}/pygpu + DEVICE=${dev} time nosetests --with-xunit --xunit-file=${test}${dev}tests.xml libgpuarray/pygpu/tests +done +for dev in ${DEVICES_OPENCL}; do + echo "Testing pygpu for DEVICE=${dev}" + DEVICE=${dev} time nosetests --with-xunit --xunit-file=${test}${dev}tests.xml libgpuarray/pygpu/tests -e test_blas.py +done + +env +date +hostname From 5e160de1233a4e262a531aa240ce2322f158680b Mon Sep 17 00:00:00 2001 From: slefrancois Date: Wed, 7 Sep 2016 16:44:32 -0400 Subject: [PATCH 015/597] chmod +x .jenkins.sh --- .jenkins.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 .jenkins.sh diff --git a/.jenkins.sh b/.jenkins.sh old mode 100644 new mode 100755 From 19ce58711d0d567502930bef0a810440e604935e Mon Sep 17 00:00:00 2001 From: slefrancois Date: Wed, 7 Sep 2016 18:01:05 -0400 Subject: [PATCH 016/597] cleanup buildbot prints --- .jenkins.sh | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/.jenkins.sh b/.jenkins.sh index e9428d4765..f3b27238ba 100755 --- a/.jenkins.sh +++ 
b/.jenkins.sh @@ -13,8 +13,6 @@ export PATH=/usr/local/cuda/bin:$PATH export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH -GPUARRAY=none - # Set some default values : ${BUILDBOT_DIR:="$WORKSPACE/nightly_build"} # Jenkins workspace path # Can also set to "Debug", "Release" to go faster @@ -25,9 +23,6 @@ GPUARRAY=none # Parameters for nosetests : ${NOSE_PARAM="-v --with-xunit --xunit-file="} -date -hostname - mkdir -p ${BUILDBOT_DIR} cd ${BUILDBOT_DIR} @@ -64,8 +59,6 @@ export CPATH=${BUILDBOT_DIR}/local/include:${CPATH} # Build the pygpu modules (cd libgpuarray && python setup.py build_ext --inplace -I${BUILDBOT_DIR}/local/include -L${BUILDBOT_DIR}/local/lib) - -echo -n > ${BUILDBOT_DIR}/pygpu.log # Test it for dev in ${DEVICES_CUDA}; do echo "Testing pygpu for DEVICE=${dev}" @@ -76,7 +69,3 @@ for dev in ${DEVICES_OPENCL}; do echo "Testing pygpu for DEVICE=${dev}" DEVICE=${dev} time nosetests --with-xunit --xunit-file=${test}${dev}tests.xml libgpuarray/pygpu/tests -e test_blas.py done - -env -date -hostname From 4ac162563f495a3b56a4c1c27a3d9991ecc6ee8a Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 7 Sep 2016 19:02:47 -0400 Subject: [PATCH 017/597] Check that arguments have some flags that apply otherwise the behaviour is strange. 
--- pygpu/_elemwise.pyx | 2 ++ pygpu/elemwise.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pygpu/_elemwise.pyx b/pygpu/_elemwise.pyx index 4fb80f6399..586fb50bc9 100644 --- a/pygpu/_elemwise.pyx +++ b/pygpu/_elemwise.pyx @@ -60,6 +60,8 @@ cdef class arg: self.a.flags |= GE_WRITE if scalar: self.a.flags |= GE_SCALAR + if self.a.flags == 0: + raise ValueError('no flags specified for arg %s' % (name,)) property name: def __get__(self): diff --git a/pygpu/elemwise.py b/pygpu/elemwise.py index 906c55b33d..348ee3fdd4 100644 --- a/pygpu/elemwise.py +++ b/pygpu/elemwise.py @@ -14,7 +14,7 @@ def _dtype(o): def as_argument(o, name, read=False, write=False): - if not read and not write: + if (not read) and (not write): raise ValueError('argument is neither read not write') return arg(name, _dtype(o), scalar=not isinstance(o, gpuarray.GpuArray), read=read, write=write) From 466a3461b0ea95c9e1196450133b2073bd907e06 Mon Sep 17 00:00:00 2001 From: Jonas Adler Date: Thu, 29 Sep 2016 18:01:26 +0200 Subject: [PATCH 018/597] Remove string definition with no length in gpuarray_buffer_opencl, see #262 --- src/gpuarray_buffer_opencl.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index d073880112..1f913aed2c 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -37,7 +37,8 @@ static gpukernel *cl_newkernel(gpucontext *ctx, unsigned int count, const char *fname, unsigned int argcount, const int *types, int flags, int *ret, char **err_str); -static const char CL_CONTEXT_PREAMBLE[]; +static const char CL_CONTEXT_PREAMBLE[] = +"#define GA_WARP_SIZE %lu\n"; // to be filled by cl_make_ctx() static inline int cl_get_platform_count(unsigned int* platcount) { cl_uint nump; @@ -302,9 +303,6 @@ static const char CL_PREAMBLE[] = /* XXX: add complex types, quad types, and longlong */ /* XXX: add vector types */ -static const char CL_CONTEXT_PREAMBLE[] = - "#define 
GA_WARP_SIZE %lu\n"; // to be filled by cl_make_ctx() - static const char *get_error_string(cl_int err) { /* OpenCL 1.0 error codes */ switch (err) { From c0a944f046ddbee0baebce9babfbfac46da7d181 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 4 Oct 2016 19:00:12 -0400 Subject: [PATCH 019/597] Rename the script to be more obviously about PRs and fix it so it doesn't test master all the time. --- .jenkins-pr.sh | 54 ++++++++++++++++++++++++++++++++++++++ .jenkins.sh | 71 -------------------------------------------------- 2 files changed, 54 insertions(+), 71 deletions(-) create mode 100755 .jenkins-pr.sh delete mode 100755 .jenkins.sh diff --git a/.jenkins-pr.sh b/.jenkins-pr.sh new file mode 100755 index 0000000000..b4b2d16e04 --- /dev/null +++ b/.jenkins-pr.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# Script for Jenkins continuous integration testing of libgpuarray + +# Print commands as they are executed +set -x + +# Anaconda python +export PATH=/usr/local/miniconda2/bin:$PATH + +# CUDA +export PATH=/usr/local/cuda/bin:$PATH +export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH +export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH + +# Can also set to "Debug", "Release" to go faster +: ${GPUARRAY_CONFIG:="Release"} +# Set these to " " to disable (empty doesn't work) +: ${DEVICES_CUDA:="cuda"} # for multiple devices use "cuda0 cuda1" +: ${DEVICES_OPENCL:=" "} + +git rev-parse HEAD + +# Build libgpuarray and run C tests +mkdir build +(cd build && cmake .. 
-DCMAKE_BUILD_TYPE=${GPUARRAY_CONFIG} && make) + +# Test on different devices +for dev in ${DEVICES_CUDA}; do + echo "Testing libgpuarray for DEVICE=${dev}" + (cd build && DEVICE=${dev} make test) +done +for dev in ${DEVICES_OPENCL}; do + echo "Testing libgpuarray for DEVICE=${dev}" + (cd build && DEVICE=${dev} make test) +done + +export LD_LIBRARY_PATH=`pwd`/lib:${LD_LIBRARY_PATH} +export LIBRARY_PATH=`pwd`/lib:${LIBRARY_PATH} +export CPATH=`pwd`/src:${CPATH} + +# Build the pygpu modules +python setup.py build_ext --inplace + +# Test it +for dev in ${DEVICES_CUDA}; do + echo "Testing pygpu for DEVICE=${dev}" + test=${BUILDBOT_DIR}/pygpu + DEVICE=${dev} time nosetests --with-xunit --xunit-file=${test}${dev}tests.xml pygpu/tests +done +for dev in ${DEVICES_OPENCL}; do + echo "Testing pygpu for DEVICE=${dev}" + DEVICE=${dev} time nosetests --with-xunit --xunit-file=${test}${dev}tests.xml pygpu/tests -e test_blas.py +done diff --git a/.jenkins.sh b/.jenkins.sh deleted file mode 100755 index f3b27238ba..0000000000 --- a/.jenkins.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash - -# Script for Jenkins continuous integration testing of libgpuarray - -# Print commands as they are executed -set -x - -# Anaconda python -export PATH=/usr/local/miniconda2/bin:$PATH - -# CUDA -export PATH=/usr/local/cuda/bin:$PATH -export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH -export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH - -# Set some default values -: ${BUILDBOT_DIR:="$WORKSPACE/nightly_build"} # Jenkins workspace path -# Can also set to "Debug", "Release" to go faster -: ${GPUARRAY_CONFIG:="Release"} -# Set these to " " to disable (empty doesn't work) -: ${DEVICES_CUDA:="cuda"} # for multiple devices use "cuda0 cuda1" -: ${DEVICES_OPENCL:=" "} -# Parameters for nosetests -: ${NOSE_PARAM="-v --with-xunit --xunit-file="} - -mkdir -p ${BUILDBOT_DIR} -cd ${BUILDBOT_DIR} - -# Make fresh clone (with no history since we don't need it) -rm -rf libgpuarray -git clone --depth 
1 "https://github.com/Theano/libgpuarray.git" - -(cd libgpuarray && echo "libgpuarray commit" && git rev-parse HEAD) - -# Clean up previous installs (to make sure no old files are left) -rm -rf local -mkdir local - -# Build libgpuarray and run C tests -mkdir libgpuarray/build -(cd libgpuarray/build && cmake .. -DCMAKE_BUILD_TYPE=${GPUARRAY_CONFIG} -DCMAKE_INSTALL_PREFIX=${BUILDBOT_DIR}/local && make) - -# Test on different devices -for dev in ${DEVICES_CUDA}; do - echo "Testing libgpuarray for DEVICE=${dev}" - (cd libgpuarray/build && CK_DEFAULT_TIMEOUT=16 DEVICE=${dev} make test) -done -for dev in ${DEVICES_OPENCL}; do - echo "Testing libgpuarray for DEVICE=${dev}" - (cd libgpuarray/build && DEVICE=${dev} make test) -done - -# Finally install -(cd libgpuarray/build && make install) -export LD_LIBRARY_PATH=${BUILDBOT_DIR}/local/lib:${LD_LIBRARY_PATH} -export LIBRARY_PATH=${BUILDBOT_DIR}/local/lib:${LIBRARY_PATH} -export CPATH=${BUILDBOT_DIR}/local/include:${CPATH} - -# Build the pygpu modules -(cd libgpuarray && python setup.py build_ext --inplace -I${BUILDBOT_DIR}/local/include -L${BUILDBOT_DIR}/local/lib) - -# Test it -for dev in ${DEVICES_CUDA}; do - echo "Testing pygpu for DEVICE=${dev}" - test=${BUILDBOT_DIR}/pygpu - DEVICE=${dev} time nosetests --with-xunit --xunit-file=${test}${dev}tests.xml libgpuarray/pygpu/tests -done -for dev in ${DEVICES_OPENCL}; do - echo "Testing pygpu for DEVICE=${dev}" - DEVICE=${dev} time nosetests --with-xunit --xunit-file=${test}${dev}tests.xml libgpuarray/pygpu/tests -e test_blas.py -done From 5becc2685df4eb55e1a682f057617ad5889ff38d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 4 Oct 2016 19:18:55 -0400 Subject: [PATCH 020/597] Fix xunit output. 
--- .jenkins-pr.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins-pr.sh b/.jenkins-pr.sh index b4b2d16e04..4a09da6ed4 100755 --- a/.jenkins-pr.sh +++ b/.jenkins-pr.sh @@ -43,9 +43,9 @@ export CPATH=`pwd`/src:${CPATH} python setup.py build_ext --inplace # Test it +test=pygpu for dev in ${DEVICES_CUDA}; do echo "Testing pygpu for DEVICE=${dev}" - test=${BUILDBOT_DIR}/pygpu DEVICE=${dev} time nosetests --with-xunit --xunit-file=${test}${dev}tests.xml pygpu/tests done for dev in ${DEVICES_OPENCL}; do From a077382f4aae44e5c597cba3cd8ca6e5f4feac01 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sat, 4 Jun 2016 00:53:56 -0400 Subject: [PATCH 021/597] Initial support for MaxAndArgmax and GpuArray_maxandargmax(). What remains incomplete is a scheduler for the kernel, as noted in the source code. --- src/CMakeLists.txt | 1 + src/gpuarray_reduction.c | 548 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 549 insertions(+) create mode 100644 src/gpuarray_reduction.c diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3bdf2b6be2..d70726061c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -56,6 +56,7 @@ gpuarray_array_collectives.c gpuarray_kernel.c gpuarray_extension.c gpuarray_elemwise.c +gpuarray_reduction.c ) check_function_exists(strlcat HAVE_STRL) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c new file mode 100644 index 0000000000..550dda4921 --- /dev/null +++ b/src/gpuarray_reduction.c @@ -0,0 +1,548 @@ +/* Includes */ +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#endif + +#include +#include +#include +#if !defined(_MSC_VER) || _MSC_VER < 1600 +#include +#endif +#include +#include +#include + +#include "private.h" +#include "gpuarray/array.h" +#include "gpuarray/error.h" +#include "gpuarray/kernel.h" +#include "gpuarray/util.h" + +#include "util/strb.h" + + +/* Datatypes */ +struct GEN_KERNEL_CTX{ + int numIdx; + const int* isReduced; + int numRedIdx; + int numFreeIdx; + const char* 
dstMaxType; + const char* dstArgmaxType; +}; +typedef struct GEN_KERNEL_CTX GEN_KERNEL_CTX; + + + +/* Function prototypes */ +static int getRdxIdx (const int numIdx, + const int* isReduced); +static char* genkernel_maxandargmax (const int numIdx, + const int* isReduced, + const char* dstMaxType, + const char* dstArgmaxType); +static void appendKernel (strb* s, + GEN_KERNEL_CTX* ctx); +static void appendTypedefs (strb* s, + GEN_KERNEL_CTX* ctx); +static void appendPrototype (strb* s, + GEN_KERNEL_CTX* ctx); +static void appendIndexDeclarations(strb* s, + GEN_KERNEL_CTX* ctx); +static void appendIdxes (strb* s, + const char* prologue, + const char* prefix, + int startIdx, + int endIdx, + const char* suffix, + const char* epilogue); +static void appendRangeCalculations(strb* s, + GEN_KERNEL_CTX* ctx); +static void appendLoops (strb* s, + GEN_KERNEL_CTX* ctx); +static void appendLoopMacroDefs (strb* s, + GEN_KERNEL_CTX* ctx); +static void appendLoopOuter (strb* s, + GEN_KERNEL_CTX* ctx); +static void appendLoopInner (strb* s, + GEN_KERNEL_CTX* ctx); +static void appendLoopMacroUndefs (strb* s, + GEN_KERNEL_CTX* ctx); +static void scheduleMaxAndArgmax (size_t* blockSize, + size_t* gridSize, + const GpuArray* src, + const int* isReduced); +static void invokeMaxAndArgmax (GpuKernel* kernel, + const GpuArray* src, + const int* isReduced); + + +/* Function implementation */ + +/** + * @brief Computes simultaneously the maxima and the arguments of maxima over + * specified axes of the tensor. + * + * Returns two tensors of identical shape. Both tensors' axes are a subset of + * the axes of the original tensor. The axes to be reduced are specified by + * the caller, and the maxima and arguments of maxima are computed over them. + * + * @param [out] dstMax The resulting tensor of maxima + * @param [out] dstArgmax the resulting tensor of arguments at maxima + * @param [in] src The source tensor. 
+ * @param [in] isReduced Either NULL, or an array of booleans of the same + * size as the dimensionality of the source tensor. + * Axis k is reduced if isReduced[k] is non-zero, + * and is preserved otherwise. + * @return GA_NO_ERROR if the operation was successful, or a non-zero error + * code otherwise. + */ + +GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dstMax, + GpuArray* dstArgmax, + const GpuArray* src, + const int* isReduced){ + /** + * Generate kernel source code + */ + + const char* dstMaxType = gpuarray_get_type(src->typecode) -> cluda_name; + const char* dstArgmaxType = gpuarray_get_type(GA_LONG) -> cluda_name; + const char* s = genkernel_maxandargmax(src->nd, + isReduced, + dstMaxType, + dstArgmaxType); + if(!s){return GA_MEMORY_ERROR;} + + /* Compile it */ + const int ARG_TYPECODE[8] = { + GA_POINTER, /* src */ + GA_POINTER, /* srcSteps */ + GA_POINTER, /* srcSize */ + GA_POINTER, /* numBlk */ + GA_POINTER, /* dstMax */ + GA_POINTER, /* dstMaxSteps */ + GA_POINTER, /* dstArgmax */ + GA_POINTER /* dstArgmaxSteps */ + }; + const size_t l = strlen(s); + GpuKernel kernel; + GpuKernel_init(&kernel, 0, 1, &s, &l, "maxandargmax", + 8, ARG_TYPECODE, 0, (char**)0); + + /* Invoke it. */ + invokeMaxAndArgmax(&kernel, src, isReduced); + + /* Return error code */ + return GA_NO_ERROR; +} + +/** + * Count the number of dimensions to be reduced. + */ + +static int getRdxIdx(const int numIdx, const int* isReduced){ + int i, countReduced; + for(i=0, countReduced = 0;idstMaxType); + strb_appendf(s, "typedef %s X;/* Index type: signed 32/64-bit. 
*/\n", ctx->dstArgmaxType); + strb_appends(s, "\n"); + strb_appends(s, "\n"); + strb_appends(s, "\n"); +} +static void appendPrototype (strb* s, + GEN_KERNEL_CTX* ctx){ + strb_appends(s, "__global__ void maxandargmax(const T* src,\n"); + strb_appends(s, " const X* srcSteps,\n"); + strb_appends(s, " const X* srcSize,\n"); + strb_appends(s, " const X* blkNum,\n"); + strb_appends(s, " T* dstMax,\n"); + strb_appends(s, " const X* dstMaxSteps,\n"); + strb_appends(s, " X* dstArgmax,\n"); + strb_appends(s, " const X* dstArgmaxSteps)"); +} +static void appendIndexDeclarations(strb* s, + GEN_KERNEL_CTX* ctx){ + strb_appends(s, "\t/* GPU kernel coordinates. Always 3D. */\n"); + + strb_appends(s, "\tX bi0 = blockIdx.x, bi1 = blockIdx.y, bi2 = blockIdx.z;\n"); + strb_appends(s, "\tX bd0 = blockDim.x, bd1 = blockDim.y, bd2 = blockDim.z;\n"); + strb_appends(s, "\tX ti0 = threadIdx.x, ti1 = threadIdx.y, ti2 = threadIdx.z;\n"); + + strb_appends(s, "\t\n"); + strb_appends(s, "\t\n"); + strb_appends(s, "\t/* Free indices & Reduction indices */\n"); + + appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "Blk", ";\n"); + appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "", ";\n"); + appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "Dim", ";\n"); + appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "Start", ";\n"); + appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "End", ";\n"); + appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "SStep", ";\n"); + appendIdxes (s, "\tX ", "i", 0, ctx->numFreeIdx, "MStep", ";\n"); + appendIdxes (s, "\tX ", "i", 0, ctx->numFreeIdx, "AStep", ";\n"); + appendIdxes (s, "\tX ", "i", ctx->numFreeIdx, ctx->numIdx, "PDim", ";\n"); + + strb_appends(s, "\t\n"); + strb_appends(s, "\t\n"); +} +static void appendIdxes (strb* s, + const char* prologue, + const char* prefix, + int startIdx, + int endIdx, + const char* suffix, + const char* epilogue){ + int i; + + prologue = prologue ? prologue : ""; + prefix = prefix ? prefix : ""; + suffix = suffix ? suffix : ""; + epilogue = epilogue ? 
epilogue : ""; + + strb_appends(s, prologue); + for(i=startIdx;inumIdx ;i++){/* i*Dim = srcSize[*]; */ + strb_appendf(s, "\ti%dDim = srcSize[%d];\n", i, i); + } + for(i=0;inumIdx ;i++){/* i*SStep = srcSteps[*]; */ + strb_appendf(s, "\ti%dSStep = srcSteps[%d];\n", i, i); + } + for(i=0;inumFreeIdx;i++){/* i*MStep = dstMaxSteps[*]; */ + strb_appendf(s, "\ti%dMStep = dstMaxSteps[%d];\n", i, i); + } + for(i=0;inumFreeIdx;i++){/* i*AStep = dstArgmaxSteps[*]; */ + strb_appendf(s, "\ti%dMStep = dstArgmaxSteps[%d];\n", i, i); + } + for(i=0;inumIdx ;i++){/* i*Blk = numBlk[*]; */ + strb_appendf(s, "\ti%dBlk = numBlk[%d];\n", i, i); + } + for(i=ctx->numIdx-1;i>=ctx->numFreeIdx;i--){/* i*PDim = ...; */ + /** + * If this is the last index, it's the first cumulative dimension + * product we generate, and thus we initialize to 1. + */ + + if(i == ctx->numIdx-1){ + strb_appendf(s, "\ti%dPDim = 1;\n", i); + }else{ + strb_appendf(s, "\ti%dPDim = i%dPDim * i%dDim;\n", i, i+1, i); + } + } + for(i=0;inumIdx ;i++){/* i*Start = ...; */ + /** + * The first 3 dimensions get to rely on hardware loops. + * The others, if any, have to use software looping beginning at 0. + */ + + if(i < 3){ + strb_appendf(s, "\ti%dStart = ((bi%d * bd%d) + ti%d) * i%dBlk;\n", i, i, i, i, i); + }else{ + strb_appendf(s, "\ti%dStart = 0;\n", i); + } + } + for(i=0;inumIdx ;i++){/* i*End = ...; */ + /** + * The first 3 dimensions get to rely on hardware loops. + * The others, if any, have to use software looping beginning at 0. 
+ */ + + if(i < 3){ + strb_appendf(s, "\ti%dEnd = i%dStart + bd%d * i%dBlk;\n", i, i, i, i); + }else{ + strb_appendf(s, "\ti%dEnd = i%dStart + i%dDim;\n", i, i, i); + } + } + + strb_appends(s, "\t\n"); + strb_appends(s, "\t\n"); +} +static void appendLoops (strb* s, + GEN_KERNEL_CTX* ctx){ + strb_appends(s, "\t/**\n"); + strb_appends(s, "\t * FREE LOOPS.\n"); + strb_appends(s, "\t */\n"); + strb_appends(s, "\t\n"); + + appendLoopMacroDefs (s, ctx); + appendLoopOuter (s, ctx); + appendLoopMacroUndefs(s, ctx); +} +static void appendLoopMacroDefs (strb* s, + GEN_KERNEL_CTX* ctx){ + int i; + + /** + * FOROVER Macro + */ + + strb_appends(s, "#define FOROVER(idx) for(i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n"); + + /** + * ESCAPE Macro + */ + + strb_appends(s, "#define ESCAPE(idx) if(i##idx >= i##idx##Dim){continue;}\n"); + + /** + * SRCINDEXER Macro + */ + + appendIdxes (s, "#define SRCINDEXER(", "i", 0, ctx->numIdx, "", ") src["); + for(i=0;inumIdx;i++){ + strb_appendf(s, "i%d*i%dSStep + \\\n ", i, i); + } + strb_appends(s, "0]\n"); + + /** + * RDXINDEXER Macro + */ + + appendIdxes (s, "#define RDXINDEXER(", "i", ctx->numFreeIdx, ctx->numIdx, "", ") ("); + for(i=ctx->numFreeIdx;inumIdx;i++){ + strb_appendf(s, "i%d*i%dPDim + \\\n ", i, i); + } + strb_appends(s, "0)\n"); + + /** + * DSTMINDEXER Macro + */ + + appendIdxes (s, "#define DSTMINDEXER(", "i", 0, ctx->numFreeIdx, "", ") dstMax["); + for(i=0;inumFreeIdx;i++){ + strb_appendf(s, "i%d*i%dMStep + \\\n ", i, i); + } + strb_appends(s, "0]\n"); + + /** + * DSTAINDEXER Macro + */ + + appendIdxes (s, "#define DSTAINDEXER(", "i", 0, ctx->numFreeIdx, "", ") dstArgmax["); + for(i=0;inumFreeIdx;i++){ + strb_appendf(s, "i%d*i%dAStep + \\\n ", i, i); + } + strb_appends(s, "0]\n"); +} +static void appendLoopOuter (strb* s, + GEN_KERNEL_CTX* ctx){ + int i; + + /** + * Outer Loop Header Generation + */ + + for(i=0;inumFreeIdx;i++){ + strb_appendf(s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); + } + + /** + * Inner Loop 
Generation + */ + + appendLoopInner(s, ctx); + + /** + * Outer Loop Trailer Generation + */ + + for(i=0;inumFreeIdx;i++){ + strb_appends(s, "\t}\n"); + } +} +static void appendLoopInner (strb* s, + GEN_KERNEL_CTX* ctx){ + int i; + + /** + * Inner Loop Prologue + */ + + strb_appends(s, "\t/**\n"); + strb_appends(s, "\t * Reduction initialization.\n"); + strb_appends(s, "\t */\n"); + strb_appends(s, "\t\n"); + + appendIdxes (s, "\tT maxV = SRCINDEXER(", "i", 0, ctx->numFreeIdx, "", ""); + if(ctx->numFreeIdx && ctx->numRedIdx){strb_appends(s, ",");} + appendIdxes (s, "", "i", ctx->numFreeIdx, ctx->numIdx, "Start", ");\n"); + + appendIdxes (s, "\tX maxI = RDXINDEXER(", "i", ctx->numFreeIdx, ctx->numIdx, "Start", ");\n"); + + strb_appends(s, "\t\n"); + strb_appends(s, "\t/**\n"); + strb_appends(s, "\t * REDUCTION LOOPS.\n"); + strb_appends(s, "\t */\n"); + strb_appends(s, "\t\n"); + + /** + * Inner Loop Header Generation + */ + + for(i=ctx->numFreeIdx;inumIdx;i++){ + strb_appendf(s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); + } + + /** + * Inner Loop Body Generation + */ + + appendIdxes (s, "\tT V = SRCINDEXER(", "i", 0, ctx->numIdx, "", ");\n"); + strb_appends(s, "\t\n"); + strb_appends(s, "\tif(V > maxV){\n"); + strb_appends(s, "\t\tmaxV = V;\n"); + appendIdxes (s, "\t\tmaxI = RDXINDEXER(", "i", ctx->numFreeIdx, ctx->numIdx, "", ");\n"); + strb_appends(s, "\t}\n"); + + /** + * Inner Loop Trailer Generation + */ + + for(i=ctx->numFreeIdx;inumIdx;i++){ + strb_appends(s, "\t}\n"); + } + strb_appends(s, "\t\n"); + + /** + * Inner Loop Epilogue Generation + */ + + strb_appends(s, "\t/**\n"); + strb_appends(s, "\t * Destination writeback.\n"); + strb_appends(s, "\t */\n"); + strb_appends(s, "\t\n"); + appendIdxes (s, "\tDSTMINDEXER(", "i", 0, ctx->numFreeIdx, "", ") = maxV;\n"); + appendIdxes (s, "\tDSTAINDEXER(", "i", 0, ctx->numFreeIdx, "", ") = maxI;\n"); +} +static void appendLoopMacroUndefs (strb* s, + GEN_KERNEL_CTX* ctx){ + strb_appends(s, "\t#undef FOROVER\n"); + 
strb_appends(s, "\t#undef ESCAPE\n"); + strb_appends(s, "\t#undef SRCINDEXER\n"); + strb_appends(s, "\t#undef RDXINDEXER\n"); + strb_appends(s, "\t#undef DSTMINDEXER\n"); + strb_appends(s, "\t#undef DSTAINDEXER\n"); +} + + +/** + * FIXME: Implement a working scheduler and invoker. + * + * To schedule effectively the work across several dimensions of possibly + * ugly numbers, taking into account the limitations on thread & block + * scheduling, will require some library capable of providing a "factoring" + * of the tensor dimensions into "nice" prime numbers. Their product may + * be allowed to be larger than the original number, provided it remains + * within bounds. + */ + +/** + * Compute a good thread block size / grid size for Nvidia. + */ + +static void scheduleMaxAndArgmax (size_t* blockSize, + size_t* gridSize, + const GpuArray* src, + const int* isReduced){ + //int maxThreadPerBlock = 1024; + //int numFreeIdx = src->nd - getRdxIdx(src->nd, isReduced); + + /* Naive solution. Optimization is a tough problem. */ + blockSize[0] = blockSize[1] = blockSize[2] = 1; + gridSize [0] = gridSize [1] = gridSize [2] = 1; +} + +/** + * Invoke the kernel. + */ + +static void invokeMaxAndArgmax (GpuKernel* kernel, + const GpuArray* src, + const int* isReduced){ + size_t blockSize[3]; + size_t gridSize[3]; + + scheduleMaxAndArgmax(blockSize, gridSize, src, isReduced); + GpuKernel_call(kernel, + getRdxIdx(src->nd, isReduced), + blockSize, + gridSize, + 0, + NULL); +} + From 50428676a6140441b35a992a3b19293cff031e1c Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Mon, 6 Jun 2016 15:25:01 -0400 Subject: [PATCH 022/597] First batch of patches in response to review Easy fixes that were identified were patched. 
--- src/gpuarray_reduction.c | 108 ++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 63 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 550dda4921..3cf9691677 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -6,9 +6,7 @@ #include #include #include -#if !defined(_MSC_VER) || _MSC_VER < 1600 -#include -#endif +#include "gpuarray/config.h" #include #include #include @@ -23,7 +21,7 @@ /* Datatypes */ -struct GEN_KERNEL_CTX{ +struct gen_kernel_ctx{ int numIdx; const int* isReduced; int numRedIdx; @@ -31,7 +29,7 @@ struct GEN_KERNEL_CTX{ const char* dstMaxType; const char* dstArgmaxType; }; -typedef struct GEN_KERNEL_CTX GEN_KERNEL_CTX; +typedef struct gen_kernel_ctx gen_kernel_ctx; @@ -43,13 +41,13 @@ static char* genkernel_maxandargmax (const int numIdx, const char* dstMaxType, const char* dstArgmaxType); static void appendKernel (strb* s, - GEN_KERNEL_CTX* ctx); + gen_kernel_ctx* ctx); static void appendTypedefs (strb* s, - GEN_KERNEL_CTX* ctx); + gen_kernel_ctx* ctx); static void appendPrototype (strb* s, - GEN_KERNEL_CTX* ctx); + gen_kernel_ctx* ctx); static void appendIndexDeclarations(strb* s, - GEN_KERNEL_CTX* ctx); + gen_kernel_ctx* ctx); static void appendIdxes (strb* s, const char* prologue, const char* prefix, @@ -58,17 +56,17 @@ static void appendIdxes (strb* s, const char* suffix, const char* epilogue); static void appendRangeCalculations(strb* s, - GEN_KERNEL_CTX* ctx); + gen_kernel_ctx* ctx); static void appendLoops (strb* s, - GEN_KERNEL_CTX* ctx); + gen_kernel_ctx* ctx); static void appendLoopMacroDefs (strb* s, - GEN_KERNEL_CTX* ctx); + gen_kernel_ctx* ctx); static void appendLoopOuter (strb* s, - GEN_KERNEL_CTX* ctx); + gen_kernel_ctx* ctx); static void appendLoopInner (strb* s, - GEN_KERNEL_CTX* ctx); + gen_kernel_ctx* ctx); static void appendLoopMacroUndefs (strb* s, - GEN_KERNEL_CTX* ctx); + gen_kernel_ctx* ctx); static void scheduleMaxAndArgmax (size_t* blockSize, 
size_t* gridSize, const GpuArray* src, @@ -108,7 +106,7 @@ GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dstMax, */ const char* dstMaxType = gpuarray_get_type(src->typecode) -> cluda_name; - const char* dstArgmaxType = gpuarray_get_type(GA_LONG) -> cluda_name; + const char* dstArgmaxType = gpuarray_get_type(GA_SIZE) -> cluda_name; const char* s = genkernel_maxandargmax(src->nd, isReduced, dstMaxType, @@ -117,14 +115,14 @@ GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dstMax, /* Compile it */ const int ARG_TYPECODE[8] = { - GA_POINTER, /* src */ - GA_POINTER, /* srcSteps */ - GA_POINTER, /* srcSize */ - GA_POINTER, /* numBlk */ - GA_POINTER, /* dstMax */ - GA_POINTER, /* dstMaxSteps */ - GA_POINTER, /* dstArgmax */ - GA_POINTER /* dstArgmaxSteps */ + GA_BUFFER, /* src */ + GA_BUFFER, /* srcSteps */ + GA_BUFFER, /* srcSize */ + GA_BUFFER, /* numBlk */ + GA_BUFFER, /* dstMax */ + GA_BUFFER, /* dstMaxSteps */ + GA_BUFFER, /* dstArgmax */ + GA_BUFFER /* dstArgmaxSteps */ }; const size_t l = strlen(s); GpuKernel kernel; @@ -166,7 +164,7 @@ static char* genkernel_maxandargmax(const int numIdx, const char* dstMaxType, const char* dstArgmaxType){ /* Obtain the parameters of the reduction. */ - GEN_KERNEL_CTX ctx; + gen_kernel_ctx ctx; ctx.numIdx = numIdx; ctx.isReduced = isReduced; ctx.numRedIdx = getRdxIdx(ctx.numIdx, ctx.isReduced); @@ -174,30 +172,14 @@ static char* genkernel_maxandargmax(const int numIdx, ctx.dstMaxType = dstMaxType; ctx.dstArgmaxType = dstArgmaxType; - /** - * Allocate string buffer. - */ - - strb* s = strb_alloc(5*1024); - if(!s){return NULL;} - - /** - * Generate kernel - */ - - appendKernel(s, &ctx); - - /** - * Return the kernel. 
- */ - - char* kernelSource = strb_cstr(s); - strb_free(s); - return kernelSource; + strb s = STRB_STATIC_INIT; + strb_ensure(&s, 5*1024); + appendKernel(&s, &ctx); + return strb_cstr(&s); } static void appendKernel (strb* s, - GEN_KERNEL_CTX* ctx){ + gen_kernel_ctx* ctx){ appendTypedefs (s, ctx); appendPrototype (s, ctx); strb_appends (s, "{\n"); @@ -207,7 +189,7 @@ static void appendKernel (strb* s, strb_appends (s, "}\n"); } static void appendTypedefs (strb* s, - GEN_KERNEL_CTX* ctx){ + gen_kernel_ctx* ctx){ strb_appends(s, "/* Typedefs */\n"); strb_appendf(s, "typedef %s T;/* The type of the array being processed. */\n", ctx->dstMaxType); strb_appendf(s, "typedef %s X;/* Index type: signed 32/64-bit. */\n", ctx->dstArgmaxType); @@ -216,18 +198,18 @@ static void appendTypedefs (strb* s, strb_appends(s, "\n"); } static void appendPrototype (strb* s, - GEN_KERNEL_CTX* ctx){ - strb_appends(s, "__global__ void maxandargmax(const T* src,\n"); - strb_appends(s, " const X* srcSteps,\n"); - strb_appends(s, " const X* srcSize,\n"); - strb_appends(s, " const X* blkNum,\n"); - strb_appends(s, " T* dstMax,\n"); - strb_appends(s, " const X* dstMaxSteps,\n"); - strb_appends(s, " X* dstArgmax,\n"); - strb_appends(s, " const X* dstArgmaxSteps)"); + gen_kernel_ctx* ctx){ + strb_appends(s, "KERNEL void maxandargmax(const T* src,\n"); + strb_appends(s, " const X* srcSteps,\n"); + strb_appends(s, " const X* srcSize,\n"); + strb_appends(s, " const X* blkNum,\n"); + strb_appends(s, " T* dstMax,\n"); + strb_appends(s, " const X* dstMaxSteps,\n"); + strb_appends(s, " X* dstArgmax,\n"); + strb_appends(s, " const X* dstArgmaxSteps)"); } static void appendIndexDeclarations(strb* s, - GEN_KERNEL_CTX* ctx){ + gen_kernel_ctx* ctx){ strb_appends(s, "\t/* GPU kernel coordinates. Always 3D. 
*/\n"); strb_appends(s, "\tX bi0 = blockIdx.x, bi1 = blockIdx.y, bi2 = blockIdx.z;\n"); @@ -272,7 +254,7 @@ static void appendIdxes (strb* s, strb_appends(s, epilogue); } static void appendRangeCalculations(strb* s, - GEN_KERNEL_CTX* ctx){ + gen_kernel_ctx* ctx){ int i; strb_appends(s, "\t/* Compute ranges for this thread. */\n"); @@ -333,7 +315,7 @@ static void appendRangeCalculations(strb* s, strb_appends(s, "\t\n"); } static void appendLoops (strb* s, - GEN_KERNEL_CTX* ctx){ + gen_kernel_ctx* ctx){ strb_appends(s, "\t/**\n"); strb_appends(s, "\t * FREE LOOPS.\n"); strb_appends(s, "\t */\n"); @@ -344,7 +326,7 @@ static void appendLoops (strb* s, appendLoopMacroUndefs(s, ctx); } static void appendLoopMacroDefs (strb* s, - GEN_KERNEL_CTX* ctx){ + gen_kernel_ctx* ctx){ int i; /** @@ -400,7 +382,7 @@ static void appendLoopMacroDefs (strb* s, strb_appends(s, "0]\n"); } static void appendLoopOuter (strb* s, - GEN_KERNEL_CTX* ctx){ + gen_kernel_ctx* ctx){ int i; /** @@ -426,7 +408,7 @@ static void appendLoopOuter (strb* s, } } static void appendLoopInner (strb* s, - GEN_KERNEL_CTX* ctx){ + gen_kernel_ctx* ctx){ int i; /** @@ -490,7 +472,7 @@ static void appendLoopInner (strb* s, appendIdxes (s, "\tDSTAINDEXER(", "i", 0, ctx->numFreeIdx, "", ") = maxI;\n"); } static void appendLoopMacroUndefs (strb* s, - GEN_KERNEL_CTX* ctx){ + gen_kernel_ctx* ctx){ strb_appends(s, "\t#undef FOROVER\n"); strb_appends(s, "\t#undef ESCAPE\n"); strb_appends(s, "\t#undef SRCINDEXER\n"); From 808695fa3ef1046e8b6eae01bdb05d77729fc308 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Thu, 9 Jun 2016 12:56:24 -0400 Subject: [PATCH 023/597] Add cluda aliases. 
--- src/gpuarray_reduction.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 3cf9691677..647e9cd56f 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -212,9 +212,9 @@ static void appendIndexDeclarations(strb* s, gen_kernel_ctx* ctx){ strb_appends(s, "\t/* GPU kernel coordinates. Always 3D. */\n"); - strb_appends(s, "\tX bi0 = blockIdx.x, bi1 = blockIdx.y, bi2 = blockIdx.z;\n"); - strb_appends(s, "\tX bd0 = blockDim.x, bd1 = blockDim.y, bd2 = blockDim.z;\n"); - strb_appends(s, "\tX ti0 = threadIdx.x, ti1 = threadIdx.y, ti2 = threadIdx.z;\n"); + strb_appends(s, "\tX bi0 = GID_0, bi1 = GID_1, bi2 = GID_2;\n"); + strb_appends(s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n"); + strb_appends(s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n"); strb_appends(s, "\t\n"); strb_appends(s, "\t\n"); From caa22e1bba6fda3fac84812168569212ff53a7b2 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Mon, 15 Aug 2016 14:03:20 -0400 Subject: [PATCH 024/597] Add Integer Factoring Support to libgpuarray utils. Provides decently fast approximate integer primality checking and factoring library. The two most important APIs are: gaIIsPrime(): Constant-time primality checker giving correct answer for all uint64_t. Uses Miller-Rabin primality test with specially-selected witnesses. gaIFLInit(), gaIFactorize(): Integer factorizer. For any number `n` (uint64_t), attempts to produce a factor list `fl` (struct GA_FACTOR_LIST). This factor list can be constrained in two ways: - The product of the factors in fl can be permitted to grow beyond n, up to a maximum value maxN. This permits some slack in the factorization. - The factor list can be constrained to contain no factors greater than k; This is called a k-smoothness constraint. 
The runtime of gaIFactorize() is dependent on a complicated relationship between integer size, slack given and tightness of k-smoothness bound, but the test-cases in integerfactoring.c execute quickly. Rules of thumb: - When maxN >= 2*n, integer factoring terminates immediately and reports the power of two that lies in range [n, maxN]. - As maxN approaches n from above, running time explodes exponentially, although in relative terms maxN can be very close (1% above n) and still run quickly. - As k approaches 0, running time explodes exponentially, with the worst effects seen for large n, very small k and maxN not much greater than n. Extensive Doxygen documentation is present in integerfactoring.h. --- src/util/integerfactoring.c | 610 ++++++++++++++++++++++++++++++++++++ src/util/integerfactoring.h | 241 ++++++++++++++ 2 files changed, 851 insertions(+) create mode 100644 src/util/integerfactoring.c create mode 100644 src/util/integerfactoring.h diff --git a/src/util/integerfactoring.c b/src/util/integerfactoring.c new file mode 100644 index 0000000000..b80b988b92 --- /dev/null +++ b/src/util/integerfactoring.c @@ -0,0 +1,610 @@ +/* Includes */ +#include +#include +#include +#include "integerfactoring.h" + + + +/** + * Static Function Prototypes + */ + +/** + * @brief Round up positive n to next power-of-2 and report its factorization. + */ + +static int gaIFactorizeNextPow2(uint64_t n, GA_FACTOR_LIST* fl); + + +/** + * Function Definitions + */ + +int gaICtz (uint64_t n){ +#if __GNUC__ >= 4 + return n ? __builtin_ctzll(n) : 64; +#else + int z; + + for(z=0;z<64;z++){ + if((n>>z) & 1){break;} + } + + return z; +#endif +} + +int gaIClz (uint64_t n){ +#if __GNUC__ >= 4 + return n ? 
__builtin_clzll(n) : 64; +#else + int z; + + for(z=63;z>=0;z--){ + if((n>>z) & 1){break;} + } + + return 63-z; +#endif +} + +uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m){ +#if (__GNUC__ >= 4) && defined(__x86_64__) + uint64_t r; + + asm( + "mul %2\n\t" + "div %3\n\t" + : "=&d"(r) /* Outputs */ + : "a"(a), "r"(b), "r"(m) /* Inputs */ + : "cc" + ); + + return r; +#elif (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) + /* Hardcore GCC 4.6+ optimization jazz */ + return ((unsigned __int128)a * (unsigned __int128)b) % m; +#else + const uint64_t TWOPOW32 = (uint64_t)1<<32; + int i; + + a %= m; + b %= m; + + if(m <= TWOPOW32){ + /** + * Fast path: When performing modulo arithmetic on values <= 2^32, + * (a*b) % m gives the correct answer. + */ + + return (a*b) % m; + }else{ + /** + * Slow path: Have to simulate 128-bit arithmetic long division. + */ + + uint64_t ah = a>>32; + uint64_t al = (uint32_t)a; + uint64_t bh = b>>32; + uint64_t bl = (uint32_t)b; + + uint64_t ahbh = ah*bh; + uint64_t ahbl = ah*bl; + uint64_t albh = al*bh; + uint64_t albl = al*bl; + + uint64_t md = ahbl+albh; + + uint64_t lo = albl + (md<<32); + uint64_t hi = ahbh + (md>>32); + + /* Propagate carry-outs from `md` and `lo` into `hi` */ + if(lo < albl){hi++;} + if(md < ahbl){hi+=TWOPOW32;} + + /** + * Begin 128-bit-by-64-bit remainder. + * + * 1) Cut down `hi` mod `m`. This implements the first few iterations + * of a shift-and-subtract loop, leaving only 64 iterations to go. + * 2) Iterate 64 times: + * 2.1) Shift left [hi:lo] by 1 bit, into [newHi:newLo]. + * 2.2) If: + * 2.2.1) newHi < hi, then there was an overflow into bit 128. + * The value [1:newHi:newLo] is definitely larger than + * m, so we subtract. This situation can only occur if + * m > 2^63. + * 2.2.2) newHi > m, then we must subtract m out of newHi in + * order to bring back newHi within the range [0, m). + * 3) The modulo is in hi. 
+ */ + + hi %= m; + for(i=0;i<64;i++){ + uint64_t newLo = (lo<<1); + uint64_t newHi = (hi<<1) + (newLo m){newHi -= m;} + + hi = newHi; + lo = newLo; + } + + return hi; + } +#endif +} + +uint64_t gaIPowMod (uint64_t x, uint64_t a, uint64_t m){ + /** + * Special cases (order matters!): + * - A modulo of 0 makes no sense and a modulo of 1 implies a return value + * of 0, since the result must be integer. + * - An exponent of 0 requires a return value of 1. + * - A base of 0 or 1 requires a return value of 0 or 1. + * - An exponent of 1 requires a return value of x. + * - An exponent of 2 can be handled by the modulo multiplication directly. + */ + + if(m<=1){ + return 0; + } + + x %= m; + + if(a==0){ + return 1; + }else if(x<=1){ + return x; + }else if(a==1){ + return x; + }else if(a==2){ + return gaIMulMod(x,x,m); + } + + /** + * Otherwise, perform modular exponentiation by squaring. + */ + + uint64_t r = 1; + while(a){ + if(a&1){ + r = gaIMulMod(r, x, m); + } + + x = gaIMulMod(x, x, m); + a >>= 1; + } + + return r; +} + +int gaIIsPrime (uint64_t n){ + /** + * Check if it is 2, the oddest prime. + */ + + if(n==2){return 1;} + + /** + * Check if it is an even integer. + */ + + if((n&1) == 0){return 0;} + + /** + * For small integers, read directly the answer in a table. + */ + + if(n<256){ + return "nnyynynynnnynynnnynynnnynnnnnyny" + "nnnnnynnnynynnnynnnnnynnnnnynynn" + "nnnynnnynynnnnnynnnynnnnnynnnnnn" + "nynnnynynnnynynnnynnnnnnnnnnnnny" + "nnnynnnnnynynnnnnnnnnynynnnnnynn" + "nnnynnnynnnnnynnnnnynynnnnnnnnny" + "nynnnynynnnnnnnnnnnynnnnnnnnnnny" + "nnnynynnnynnnnnynynnnnnnnnnynnnn"[n] == 'y'; + } + + /** + * Test small prime factors. + */ + + int hasNoSmallFactors = n%3 && n%5 && n%7 && n%11 && n%13; + int hasSmallFactors = !hasNoSmallFactors; + if(hasSmallFactors){ + return 0; + } + + /** + * Otherwise proceed to the Miller-Rabin test. + * + * The Miller-Rabin test uses integer "witnesses" in an attempt at + * proving the number composite. 
Should it fail to prove an integer + * composite, it reports the number as "probably prime". However, if + * the witnesses are chosen carefully, the Miller-Rabin test can be made + * deterministic below a chosen threshold. In our case, we use the primes + * 2 to 37 in order to ensure the correctness of the identifications for + * integers under 2^64. + */ + + const uint64_t WITNESSES[] = {2,3,5,7,11,13,17,19,23,29,31,37}; + const int NUMWITNESSES = sizeof(WITNESSES)/sizeof(WITNESSES[0]); + size_t i, j; + + uint64_t r = gaICtz(n-1); + uint64_t d = (n-1)>>r; + + /* For each witness... */ + for(i=0;i 0 && maxN < n)){ + return 0; + } + + /** + * Handle special cases of n = 0,1,2. + */ + + if(n<=2){ + gaIFLInit(fl); + gaIFLAddFactors(fl, n, 1); + return 1; + } + + /** + * Magic-value arguments interpreted and canonicalized. + */ + + if(maxN == (uint64_t)-1 || gaIClz(maxN) < gaIClz(n)){ + /** + * Either we are allowed unlimited growth of n, or the slack space + * [n, maxN] is big enough to contain a power of 2. We identify, round + * up to and factorize the next higher power of 2 greater than or equal + * to n trivially. Since powers of 2 are by definition 2-smooth, we + * automatically satisfy the most stringent possible smoothness + * constraint. + */ + + return gaIFactorizeNextPow2(n, fl); + }else if(maxN == 0){ + /** + * We are asked for a strict factoring. + */ + + maxN = n; + } + + if(k == 0 || k >= n){ + /** + * We want no k-smoothness constraint. + */ + + k = n; + } + + + /** + * Master loop. + */ + + for(i=n; i <= maxN; i++){ + /** + * Do not manipulate the loop index! + * Initial subfactor to cut down is x=i. + */ + + x = i; + gaIFLInit(fl); + + /** + * Subfactorization always begins with an attempt at an initial + * cut-down by factors of 2. Should this result in a 1 (which isn't + * technically prime, but indicates a complete factorization), we + * report success. 
+ */ + + subfactorize: + gaIFLAddFactors(fl, 2, gaICtz(x)); + x >>= gaICtz(x); + f = 3; + + /** + * Primality test. + * + * If the remaining factor x is a prime number, it's decision time. One + * of two things is true: + * + * 1) We have a smoothness constraint k and x is <= than it, or we + * don't have a smoothness constraint at all (k==n). Both cases are + * covered by checking x<=k. + * + * In this case we add x as the last factor to the factor list and + * return affirmatively. + * + * 2) We have a smoothness constraint and x>k. + * + * In this case we have to increment x and begin anew the + * sub-factorization. This may cause us to fail out of factorizing + * the current i, by exceeding our slack limit. If this happens we + * abort the factorization rooted at i and move to the next i. + */ + + primetest: + if(x==1 || gaIIsPrime(x)){ + if(x<=k){ + gaIFLAddFactors(fl, x, 1); + return 1; + }else{ + p = gaIFLGetProduct(fl); + if((maxN - p*x) < p){/* Overflow-free check maxN >= p*(x+1) */ + goto nextI; + }else{ + x++; + goto subfactorize; + } + } + } + + /** + * Composite number handler. + * + * We continue by trying to cut down x by factors of 3+. Should a trial + * division by a factor f succeed, all powers of f are factored out of + * x and once f no longer divides x evenly, a new primality test is + * run. The primality test will be invoked at most 15 times from this loop. + */ + + for(;f<=k && f*f<=x && f<=0xFFFFFFFFU;f+=2){/* Overflow-safe f*f */ + if(x%f == 0){ + c = 0; + do{ + x /= f; + c++; + }while(x%f == 0); + + gaIFLAddFactors(fl, f, c); + + goto primetest; + } + } + + /* Check before next iteration for 64-bit integer overflow. */ + nextI: if(i == 0xFFFFFFFFFFFFFFFF){break;} + } + + /* Failed to factorize. 
*/ + return 0; +} + +static int gaIFactorizeNextPow2(uint64_t n, GA_FACTOR_LIST* fl){ + n--; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; + n |= n >> 32; + n++; + + gaIFLInit(fl); + gaIFLAddFactors(fl, 2, gaICtz(n)); + + return 1; +} + +void gaIFLInit(GA_FACTOR_LIST* fl){ + memset(fl, 0, sizeof(*fl)); +} + +int gaIFLAddFactors(GA_FACTOR_LIST* fl, uint64_t f, uint8_t p){ + int i; + + /* Fast case: We're adding 0 powers of f. */ + if(p == 0){ + return 1; + } + + for(i=0;i<15;i++){ + if(fl->f[i] == f){ + /* Fast case: Factor already in list. */ + fl->p[i] += p; + return 1; + }else if(fl->f[i] > f){ + /* Inject the factor at this place in order to keep list sorted, + if we have the capacity. */ + + if(fl->f[14] != 0){ + /* We can't bump the list rightwards, it's full already! */ + return 0; + } + + memmove(&fl->f[i+1], &fl->f[i], sizeof(fl->f[i])*(14-i)); + memmove(&fl->p[i+1], &fl->p[i], sizeof(fl->p[i])*(14-i)); + fl->f[i] = f; + fl->p[i] = p; + return 1; + }else if(fl->f[i] == 0){ + /* This is the biggest factor so far, and a slot still remains. */ + fl->f[i] = f; + fl->p[i] = p; + return 1; + } + } + + return 0; +} + +int gaIFLGetFactorPower(GA_FACTOR_LIST* fl, uint64_t f){ + int i; + + for(i=0;i<15;i++){ + if(fl->f[i] == f){ + return fl->p[i]; + } + } + + return 0; +} + +uint64_t gaIFLGetProduct(const GA_FACTOR_LIST* fl){ + uint64_t p = 1; + int i, j; + + for(i=0;i<15;i++){ + for(j=0;jp[i];j++){ + p *= fl->f[i]; + } + } + + return p; +} + +uint64_t gaIFLGetGreatestFactor(const GA_FACTOR_LIST* fl){ + uint64_t f = 1; + int i; + + for(i=0;i<15;i++){ + if(f < fl->f[i]){ + f = fl->f[i]; + } + } + + return f; +} + +int gaIFLsnprintf(char* str, size_t size, const GA_FACTOR_LIST* fl){ + int i, j; + + int total = 0; + size_t left = size; + char* ptr = size ? str : NULL; + + /* Loop over all factors and spit them out. 
*/ + for(i=0;i<15;i++){ + for(j=0;jp[i];j++){ + total += snprintf(ptr, left, "%llu*", (unsigned long long)fl->f[i]); + if(ptr){ + left -= strlen(ptr); + ptr += strlen(ptr); + } + } + } + + /* If no factors were printed, print 1. */ + if(total == 0){ + total += snprintf(ptr, left, "1*"); + if(ptr){ + left -= strlen(ptr); + ptr += strlen(ptr); + } + } + + /* Terminate buffer ('*' -> '\0') and deduct one character. */ + total--; + if(str && size > 0){ + if(total >= size){ + str[size-1] = '\0'; + }else{ + str[total] = '\0'; + } + } + + return total; +} + + +#if 0 +void runTest(uint64_t n, uint64_t maxN, uint64_t k){ + char buf[128]; + GA_FACTOR_LIST fl; + + int isPrime = gaIIsPrime(n); + printf("%llu %s prime.\n", (unsigned long long)n, isPrime ? "is" : "is not"); + + if(k==0 || k>=n || maxN==0 || maxN==n){ + printf("Attempting exact factorization of %llu.\n", (unsigned long long)n); + } + if(k>0 && k +#include +#include + + +/* Defines */ + + + +/* C++ Extern "C" Guard */ +#ifdef __cplusplus +extern "C" { +#endif + + + +/* Data Structure Prototypes & Typedefs */ +struct GA_FACTOR_LIST; +typedef struct GA_FACTOR_LIST GA_FACTOR_LIST; + + + +/* Data Structures */ + +/** + * @brief The GA_FACTOR_LIST struct. + * + * Contains the list of distinct prime factors of a 64-bit unsigned integer, as + * well as the powers of those factors. + * + * There can be at most 15 such distinct factors, since the product of the + * first 16 primes (2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53) exceeds + * the maximum unsigned number of 2^64-1. Moreover, there can be at most 63 + * factors all together, since 2^64 exceeds 2^64-1, so only an 8-bit number is + * required to store the powers. + * + * The 15th (last) element of the factor list is always 0 and has power 0, + * and serves as a sort of sentinel. + */ + +struct GA_FACTOR_LIST{ + uint64_t f[16];/* Factors */ + uint8_t p[16];/* Powers of factors */ +}; + + + +/* Functions */ + +/** + * @brief Count trailing zeros of a 64-bit integer. 
+ * + * @param [in] n The integer whose trailing zero count is to be computed. + * @return If n != 0, returns trailing zero count; Else returns 64. + */ + +int gaICtz(uint64_t n); + +/** + * @brief Count leading zeros of a 64-bit integer. + * + * @param [in] n The integer whose leading zero count is to be computed. + * @return If n != 0, returns leading zero count; Else returns 64. + */ + +int gaIClz(uint64_t n); + +/** + * @brief Integer Modular Multiplication. + * + * Computes + * + * $$a*b \pmod m$$ + * + * efficiently for 64-bit unsigned integers a, b, m. + */ + +uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m); + +/** + * @brief Integer Modular Exponentiation. + * + * Computes + * + * $$x^a \pmod m$$ + * + * efficiently for 64-bit unsigned integers x, a, m. + */ + +uint64_t gaIPowMod (uint64_t x, uint64_t a, uint64_t m); + +/** + * @brief Checks whether an integer is prime. + * + * @param [in] n The integer whose primality is to be checked. + * @return 1 if prime; 0 if not prime. + * + * NB: This is *not* a probabilistic primality checker. For all integers it can + * be given as input, it will correctly report "prime" or "composite". + * NB: Internally, this function uses the Miller-Rabin test, which *is* + * probabilistic, and may falsely report a number as prime when in fact it + * is composite. However, this function uses a deterministic set of + * Miller-Rabin "witnesses", which ensures that there are no strong + * probable primes equal to or below 2^64-1 (the size of the input + * argument). This set of witnesses is + * + * $$a = 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, and 37$$ + * + * See https://oeis.org/A014233 + */ + +int gaIIsPrime(uint64_t n); + +/** + * @brief Factorize a positive integer into a list of factors satisfying + * certain properties. + * + * The function factorizes a 64-bit, positive integer into a list of factors. 
+ * This factorization can be made "approximate"; That is, the product of the + * factors returned can be slightly greater than the input number. The + * maximum increase is controlled by a "slack" parameter maxN, as follows: + * + * $$\texttt{n} \le \prod(\mathrm{fact}(\texttt{n})) \le \texttt{maxN}$$ + * + * The advantage of offering some slack to the factorizer is that in return, + * the factorizer may succeed in outputting a factorization with smaller + * factors. The maxN slack parameter must be 0 or be greater than or equal to + * n, but it is useless to set it beyond twice the value of n. + * + * When maxN is equal to -1 (2^64 - 1), or is greater than or equal to 2n, + * there is a guarantee that there exists a power of two that lies between n + * and 2n. Since this factorization involves only powers of the smallest prime + * (2), it is a valid factorization under any valid k-smoothness constraint, + * and so will be returned. + * + * When maxN is equal to 0 or n (no increase in value allowed), this implies + * that an exact factoring is requested. + * + * The factorization can also be constrained by a (k)-smoothness constraint. + * A k-smooth number n has no prime factors greater than k. If the factorizer + * is asked to factor with k-smoothness a number with prime factors greater + * than k, it will search, within the slack space, for a slightly larger + * number that is k-smooth and return that number's factoring. With maxN == n + * and a k-smoothness constraint, this function reports whether or not the + * number is k-smooth. + * + * When k is equal to 0, equal to -1 (2^64 - 1), or is greater than or equal + * to n, no k-smoothness constraints are imposed. An exact factoring is + * required. + * + * @param [in] n The integer to be factorized. Must be >0. + * @param [in] maxN The "slack" parameter. The factor list returned will not + * have a product that exceeds this number. + * @param [in] k The k-smoothness constraint.
k is the largest + * acceptable factor in the output factor list. The + * factorizer will, effectively, treat any number all of + * whose prime factors exceed k as a prime. + * @param [out] fl The output factor list. + * @return Non-zero if a factorization is found that satisfies both slack and + * smoothness constraints; Zero if no such factorization is found. + * If this function returns zero, the last factor in the factor + * list is not guaranteed to be prime. + */ + +int gaIFactorize(uint64_t n, uint64_t maxN, uint64_t k, GA_FACTOR_LIST* fl); + +/** + * @brief Initialize a factors list to all-factors- and all-powers-zero. + * + * Such a factors list represents 1, since 0^0 = 1. + */ + +void gaIFLInit(GA_FACTOR_LIST* fl); + +/** + * @brief Add a factor f with power p to the factor list. + * + * If factor f was already present in the factor list, increments + * the corresponding power by p. Otherwise, adds the new factor f to + * the list, if there is still space, and sets the power to p. + * + * Maintains factor list in sorted order. + * + * @return Non-zero if factor successfully added; Zero otherwise. + */ + +int gaIFLAddFactors(GA_FACTOR_LIST* fl, uint64_t f, uint8_t p); + +/** + * @brief Get the power of a given factor within a factor list. + * + * @return The number of times a factor occurs within the current + * factorization. If it does not occur, return 0. + */ + +int gaIFLGetFactorPower(GA_FACTOR_LIST* fl, uint64_t f); + +/** + * @brief Compute the product of the factors stored in the factors list. + */ + +uint64_t gaIFLGetProduct(const GA_FACTOR_LIST* fl); + +/** + * @brief Get the greatest factor in the factors list. + */ + +uint64_t gaIFLGetGreatestFactor(const GA_FACTOR_LIST* fl); + +/** + * @brief Print out the factor list in a human-readable form, snprintf()-style. + * + * @param [out] str A string into which to print out the factor list. 
If the + * factor list is a result of gaIFactorize(), then the + * maximum length of buffer required is 128 bytes. + * If str is NULL, nothing is printed. + * @param [in] size The maximum number of bytes written, including the + * terminating NUL (\0) character. + * @param [in] fl The factor list to be printed. + * @return The number of characters that would have been printed + * out, assuming an unbounded, non-NULL buffer. + */ + +int gaIFLsnprintf(char* str, size_t size, const GA_FACTOR_LIST* fl); + + +/* End C++ Extern "C" Guard */ +#ifdef __cplusplus +} +#endif + + +/* End Include Guards */ +#endif + From fd50ce5cfb2355baa52ad24ce5ec90937dff4ec5 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Mon, 15 Aug 2016 15:07:32 -0400 Subject: [PATCH 025/597] Bug fixes + Partial scheduler code. - The GA_SIZE arguments were missing in the array of arguments declared to libgpuarray. - Half-completed scheduler using integer factoring utilities, which aims to select a # of threads within the range [64, 256]. Still needs work and isn't always guaranteed to work. E.g. If factor list is [2,2,2,2,2,17], currently the algorithm will refuse to either give 32 or 544 and instead fail. --- src/gpuarray_reduction.c | 161 ++++++++++++++++++++++++++++++++------- 1 file changed, 133 insertions(+), 28 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 647e9cd56f..e09eba5bed 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -18,6 +18,7 @@ #include "gpuarray/util.h" #include "util/strb.h" +#include "util/integerfactoring.h" /* Datatypes */ @@ -102,7 +103,7 @@ GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dstMax, const GpuArray* src, const int* isReduced){ /** - * Generate kernel source code + * Generate kernel source code. 
*/ const char* dstMaxType = gpuarray_get_type(src->typecode) -> cluda_name; @@ -113,23 +114,36 @@ GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dstMax, dstArgmaxType); if(!s){return GA_MEMORY_ERROR;} - /* Compile it */ - const int ARG_TYPECODE[8] = { + + /** + * Compile it. + */ + + const int ARG_TYPECODE[11] = { GA_BUFFER, /* src */ + GA_SIZE, /* srcOff */ GA_BUFFER, /* srcSteps */ GA_BUFFER, /* srcSize */ GA_BUFFER, /* numBlk */ GA_BUFFER, /* dstMax */ + GA_SIZE, /* dstMaxOff */ GA_BUFFER, /* dstMaxSteps */ GA_BUFFER, /* dstArgmax */ + GA_SIZE, /* dstArgmaxOff */ GA_BUFFER /* dstArgmaxSteps */ }; + const size_t l = strlen(s); + GpuKernel kernel; GpuKernel_init(&kernel, 0, 1, &s, &l, "maxandargmax", 8, ARG_TYPECODE, 0, (char**)0); - /* Invoke it. */ + + /** + * Invoke it. + */ + invokeMaxAndArgmax(&kernel, src, isReduced); /* Return error code */ @@ -208,6 +222,15 @@ static void appendPrototype (strb* s, strb_appends(s, " X* dstArgmax,\n"); strb_appends(s, " const X* dstArgmaxSteps)"); } +static void appendOffsets (strb* s, + gen_kernel_ctx* ctx){ + strb_appends(s, "/* Add offsets */\n"); + strb_appends(s, "src = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src + srcOff);\n"); + strb_appends(s, "dstMax = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dstMax + dstMaxOff);\n"); + strb_appends(s, "dstArgmax = (GLOBAL_MEM X*) ((GLOBAL_MEM char*) dstArgmax + dstArgmaxOff);\n"); + strb_appends(s, "\n"); + strb_appends(s, "\n"); +} static void appendIndexDeclarations(strb* s, gen_kernel_ctx* ctx){ strb_appends(s, "\t/* GPU kernel coordinates. Always 3D. */\n"); @@ -481,46 +504,128 @@ static void appendLoopMacroUndefs (strb* s, strb_appends(s, "\t#undef DSTAINDEXER\n"); } - -/** - * FIXME: Implement a working scheduler and invoker. 
- * - * To schedule effectively the work across several dimensions of possibly - * ugly numbers, taking into account the limitations on thread & block - * scheduling, will require some library capable of providing a "factoring" - * of the tensor dimensions into "nice" prime numbers. Their product may - * be allowed to be larger than the original number, provided it remains - * within bounds. - */ - /** * Compute a good thread block size / grid size for Nvidia. */ -static void scheduleMaxAndArgmax (size_t* blockSize, - size_t* gridSize, - const GpuArray* src, - const int* isReduced){ - //int maxThreadPerBlock = 1024; - //int numFreeIdx = src->nd - getRdxIdx(src->nd, isReduced); +static void scheduleMaxAndArgmax (const GpuKernel* kernel, + const GpuArray* src, + const int* isReduced, + size_t* blockSize, + size_t* gridSize){ + int i, j; + + /* Obtain the constraints of our problem. */ + size_t warpSize, + maxL, maxL0, maxL1, maxL2, /* Maximum total and per-dimension thread/block sizes */ + maxG, maxG0, maxG1, maxG2; /* Maximum total and per-dimension block /grid sizes */ + gpukernel_property(kernel->k, GA_KERNEL_PROP_PREFLSIZE, &warpSize); + gpukernel_property(kernel->k, GA_KERNEL_PROP_MAXLSIZE, &maxL); + gpudata_property (src->data, GA_CTX_PROP_MAXLSIZE0, &maxL0); + gpudata_property (src->data, GA_CTX_PROP_MAXLSIZE1, &maxL1); + gpudata_property (src->data, GA_CTX_PROP_MAXLSIZE2, &maxL2); + gpudata_property (src->data, GA_CTX_PROP_MAXGSIZE, &maxG); + gpudata_property (src->data, GA_CTX_PROP_MAXGSIZE0, &maxG0); + gpudata_property (src->data, GA_CTX_PROP_MAXGSIZE1, &maxG1); + gpudata_property (src->data, GA_CTX_PROP_MAXGSIZE2, &maxG2); + + int numRdxIdx = getRdxIdx(src->nd, isReduced); + int numFreeIdx = src->nd - numRdxIdx; + + /** + * Select which reduction dimensions will be associated with which hardware + * x, y and z dimensions. 
+ */ + + int dims [3]; + uint64_t dimSize [3] = { 1, 1, 1}; + double slack [3] = {1.1, 1.1, 1.1}; + uint64_t kSmooth [3]; + GA_FACTOR_LIST factDims[3]; + GA_FACTOR_LIST factTBS [3]; + uint64_t tBS = 1; + uint64_t minThrd = 64; + uint64_t maxThrd = 256; + + /************************************************************************ + * FIXME: Need logic to select up to 3 dimensions and plug them in dimSize! + * But what's the best dimension selection strategy to maximize + * memory bandwidth? + * Also need to fill out kSmooth[] based on all the GPU properties. + ************************************************************************/ + kSmooth[0] = maxL0; + kSmooth[1] = maxL1; + kSmooth[2] = maxL2; + + /** + * Factorization job. We'll steadily increase the slack in case of failure + * in order to ensure we do get a factorization. + */ - /* Naive solution. Optimization is a tough problem. */ - blockSize[0] = blockSize[1] = blockSize[2] = 1; - gridSize [0] = gridSize [1] = gridSize [2] = 1; + for(i=0;i 0){ + factDims[i].p[j]--; + gaIFLAddFactors(&factTBS[i], factDims[i].f[j], 1); + tBS *= factDims[i].f[j]; + + if(tBS >= minThrd && tBS <= maxThrd){ + goto computeBS; + } + } + } + } + + computeBS: + blockSize[0] = gaIFLGetProduct(&factTBS[0]); + blockSize[1] = gaIFLGetProduct(&factTBS[1]); + blockSize[2] = gaIFLGetProduct(&factTBS[2]); + gridSize [0] = gaIFLGetProduct(&factDims[0]) / blockSize[0]; + gridSize [1] = gaIFLGetProduct(&factDims[1]) / blockSize[1]; + gridSize [2] = gaIFLGetProduct(&factDims[2]) / blockSize[2]; } /** * Invoke the kernel. 
*/ -static void invokeMaxAndArgmax (GpuKernel* kernel, +static void invokeMaxAndArgmax (GpuKernel* k, const GpuArray* src, const int* isReduced){ size_t blockSize[3]; size_t gridSize[3]; - scheduleMaxAndArgmax(blockSize, gridSize, src, isReduced); - GpuKernel_call(kernel, + scheduleMaxAndArgmax(k, src, isReduced, blockSize, gridSize); + GpuKernel_call(k, getRdxIdx(src->nd, isReduced), blockSize, gridSize, From b508433d44ca7d868a0ed9c4cfdeeb0d3d6c77c1 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Mon, 22 Aug 2016 14:31:03 -0400 Subject: [PATCH 026/597] Apply feedback on Integer Factoring Library. - Made modular arithmetic APIs in integerfactoring.h private. - Added test_integerfactorization. Testsuite check_util_integerfactoring passes on my machine. - Deleted random stdint-gcc.h include. - GA_FACTOR_LIST -> ga_factor_list_. - Added integer factoring library and tests to CMakeLists. --- src/util/CMakeLists.txt | 1 + src/util/integerfactoring.c | 111 ++++++++++++++-------------- src/util/integerfactoring.h | 63 +++------------- tests/CMakeLists.txt | 4 + tests/check_util_integerfactoring.c | 74 +++++++++++++++++++ 5 files changed, 145 insertions(+), 108 deletions(-) create mode 100644 tests/check_util_integerfactoring.c diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 7ae772cb67..61a603b44a 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -1,4 +1,5 @@ set_rel(UTIL_SRC strb.c xxhash.c +integerfactoring.c ) diff --git a/src/util/integerfactoring.c b/src/util/integerfactoring.c index b80b988b92..88f344f084 100644 --- a/src/util/integerfactoring.c +++ b/src/util/integerfactoring.c @@ -10,18 +10,61 @@ * Static Function Prototypes */ +/** + * @brief Count trailing zeros of a 64-bit integer. + * + * @param [in] n The integer whose trailing zero count is to be computed. + * @return If n != 0, returns trailing zero count; Else returns 64. 
+ */ + +static int gaICtz(uint64_t n); + +/** + * @brief Count leading zeros of a 64-bit integer. + * + * @param [in] n The integer whose leading zero count is to be computed. + * @return If n != 0, returns leading zero count; Else returns 64. + */ + +static int gaIClz(uint64_t n); + +/** + * @brief Integer Modular Multiplication. + * + * Computes + * + * $$a*b \pmod m$$ + * + * efficiently for 64-bit unsigned integers a, b, m. + */ + +static uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m); + +/** + * @brief Integer Modular Exponentiation. + * + * Computes + * + * $$x^a \pmod m$$ + * + * efficiently for 64-bit unsigned integers x, a, m. + */ + +static uint64_t gaIPowMod (uint64_t x, uint64_t a, uint64_t m); + /** * @brief Round up positive n to next power-of-2 and report its factorization. */ -static int gaIFactorizeNextPow2(uint64_t n, GA_FACTOR_LIST* fl); +static int gaIFactorizeNextPow2(uint64_t n, ga_factor_list* fl); + /** * Function Definitions */ -int gaICtz (uint64_t n){ +static int gaICtz (uint64_t n){ #if __GNUC__ >= 4 return n ? __builtin_ctzll(n) : 64; #else @@ -35,7 +78,7 @@ int gaICtz (uint64_t n){ #endif } -int gaIClz (uint64_t n){ +static int gaIClz (uint64_t n){ #if __GNUC__ >= 4 return n ? 
__builtin_clzll(n) : 64; #else @@ -49,7 +92,7 @@ int gaIClz (uint64_t n){ #endif } -uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m){ +static uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m){ #if (__GNUC__ >= 4) && defined(__x86_64__) uint64_t r; @@ -136,7 +179,7 @@ uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m){ #endif } -uint64_t gaIPowMod (uint64_t x, uint64_t a, uint64_t m){ +static uint64_t gaIPowMod (uint64_t x, uint64_t a, uint64_t m){ /** * Special cases (order matters!): * - A modulo of 0 makes no sense and a modulo of 1 implies a return value @@ -283,7 +326,7 @@ int gaIIsPrime (uint64_t n){ return 1; } -int gaIFactorize (uint64_t n, uint64_t maxN, uint64_t k, GA_FACTOR_LIST* fl){ +int gaIFactorize (uint64_t n, uint64_t maxN, uint64_t k, ga_factor_list* fl){ uint64_t i, x, p, f, c; /** @@ -429,7 +472,7 @@ int gaIFactorize (uint64_t n, uint64_t maxN, uint64_t k, GA_FACTOR_LIST* fl return 0; } -static int gaIFactorizeNextPow2(uint64_t n, GA_FACTOR_LIST* fl){ +static int gaIFactorizeNextPow2(uint64_t n, ga_factor_list* fl){ n--; n |= n >> 1; n |= n >> 2; @@ -445,11 +488,11 @@ static int gaIFactorizeNextPow2(uint64_t n, GA_FACTOR_LIST* fl){ return 1; } -void gaIFLInit(GA_FACTOR_LIST* fl){ +void gaIFLInit(ga_factor_list* fl){ memset(fl, 0, sizeof(*fl)); } -int gaIFLAddFactors(GA_FACTOR_LIST* fl, uint64_t f, uint8_t p){ +int gaIFLAddFactors(ga_factor_list* fl, uint64_t f, uint8_t p){ int i; /* Fast case: We're adding 0 powers of f. 
*/ @@ -487,7 +530,7 @@ int gaIFLAddFactors(GA_FACTOR_LIST* fl, uint64_t f, uint8_t p){ return 0; } -int gaIFLGetFactorPower(GA_FACTOR_LIST* fl, uint64_t f){ +int gaIFLGetFactorPower(ga_factor_list* fl, uint64_t f){ int i; for(i=0;i<15;i++){ @@ -499,7 +542,7 @@ int gaIFLGetFactorPower(GA_FACTOR_LIST* fl, uint64_t f){ return 0; } -uint64_t gaIFLGetProduct(const GA_FACTOR_LIST* fl){ +uint64_t gaIFLGetProduct(const ga_factor_list* fl){ uint64_t p = 1; int i, j; @@ -512,7 +555,7 @@ uint64_t gaIFLGetProduct(const GA_FACTOR_LIST* fl){ return p; } -uint64_t gaIFLGetGreatestFactor(const GA_FACTOR_LIST* fl){ +uint64_t gaIFLGetGreatestFactor(const ga_factor_list* fl){ uint64_t f = 1; int i; @@ -525,7 +568,7 @@ uint64_t gaIFLGetGreatestFactor(const GA_FACTOR_LIST* fl){ return f; } -int gaIFLsnprintf(char* str, size_t size, const GA_FACTOR_LIST* fl){ +int gaIFLsnprintf(char* str, size_t size, const ga_factor_list* fl){ int i, j; int total = 0; @@ -566,45 +609,3 @@ int gaIFLsnprintf(char* str, size_t size, const GA_FACTOR_LIST* fl){ } -#if 0 -void runTest(uint64_t n, uint64_t maxN, uint64_t k){ - char buf[128]; - GA_FACTOR_LIST fl; - - int isPrime = gaIIsPrime(n); - printf("%llu %s prime.\n", (unsigned long long)n, isPrime ? "is" : "is not"); - - if(k==0 || k>=n || maxN==0 || maxN==n){ - printf("Attempting exact factorization of %llu.\n", (unsigned long long)n); - } - if(k>0 && k #include -#include /* Defines */ @@ -21,8 +20,8 @@ extern "C" { /* Data Structure Prototypes & Typedefs */ -struct GA_FACTOR_LIST; -typedef struct GA_FACTOR_LIST GA_FACTOR_LIST; +struct ga_factor_list_; +typedef struct ga_factor_list_ ga_factor_list; @@ -44,7 +43,7 @@ typedef struct GA_FACTOR_LIST GA_FACTOR_LIST; * and serves as a sort of sentinel. */ -struct GA_FACTOR_LIST{ +struct ga_factor_list_{ uint64_t f[16];/* Factors */ uint8_t p[16];/* Powers of factors */ }; @@ -53,48 +52,6 @@ struct GA_FACTOR_LIST{ /* Functions */ -/** - * @brief Count trailing zeros of a 64-bit integer. 
- * - * @param [in] n The integer whose trailing zero count is to be computed. - * @return If n != 0, returns trailing zero count; Else returns 64. - */ - -int gaICtz(uint64_t n); - -/** - * @brief Count leading zeros of a 64-bit integer. - * - * @param [in] n The integer whose leading zero count is to be computed. - * @return If n != 0, returns leading zero count; Else returns 64. - */ - -int gaIClz(uint64_t n); - -/** - * @brief Integer Modular Multiplication. - * - * Computes - * - * $$a*b \pmod m$$ - * - * efficiently for 64-bit unsigned integers a, b, m. - */ - -uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m); - -/** - * @brief Integer Modular Exponentiation. - * - * Computes - * - * $$x^a \pmod m$$ - * - * efficiently for 64-bit unsigned integers x, a, m. - */ - -uint64_t gaIPowMod (uint64_t x, uint64_t a, uint64_t m); - /** * @brief Checks whether an integer is prime. * @@ -168,7 +125,7 @@ int gaIIsPrime(uint64_t n); * list is not guaranteed to be prime. */ -int gaIFactorize(uint64_t n, uint64_t maxN, uint64_t k, GA_FACTOR_LIST* fl); +int gaIFactorize(uint64_t n, uint64_t maxN, uint64_t k, ga_factor_list* fl); /** * @brief Initialize a factors list to all-factors- and all-powers-zero. @@ -176,7 +133,7 @@ int gaIFactorize(uint64_t n, uint64_t maxN, uint64_t k, GA_FACTOR_LIST* fl) * Such a factors list represents 1, since 0^0 = 1. */ -void gaIFLInit(GA_FACTOR_LIST* fl); +void gaIFLInit(ga_factor_list* fl); /** * @brief Add a factor f with power p to the factor list. @@ -190,7 +147,7 @@ void gaIFLInit(GA_FACTOR_LIST* fl); * @return Non-zero if factor successfully added; Zero otherwise. */ -int gaIFLAddFactors(GA_FACTOR_LIST* fl, uint64_t f, uint8_t p); +int gaIFLAddFactors(ga_factor_list* fl, uint64_t f, uint8_t p); /** * @brief Get the power of a given factor within a factor list. @@ -199,19 +156,19 @@ int gaIFLAddFactors(GA_FACTOR_LIST* fl, uint64_t f, uint8_t p); * factorization. If it does not occur, return 0. 
*/ -int gaIFLGetFactorPower(GA_FACTOR_LIST* fl, uint64_t f); +int gaIFLGetFactorPower(ga_factor_list* fl, uint64_t f); /** * @brief Compute the product of the factors stored in the factors list. */ -uint64_t gaIFLGetProduct(const GA_FACTOR_LIST* fl); +uint64_t gaIFLGetProduct(const ga_factor_list* fl); /** * @brief Get the greatest factor in the factors list. */ -uint64_t gaIFLGetGreatestFactor(const GA_FACTOR_LIST* fl); +uint64_t gaIFLGetGreatestFactor(const ga_factor_list* fl); /** * @brief Print out the factor list in a human-readable form, snprintf()-style. @@ -227,7 +184,7 @@ uint64_t gaIFLGetGreatestFactor(const GA_FACTOR_LIST* fl); * out, assuming an unbounded, non-NULL buffer. */ -int gaIFLsnprintf(char* str, size_t size, const GA_FACTOR_LIST* fl); +int gaIFLsnprintf(char* str, size_t size, const ga_factor_list* fl); /* End C++ Extern "C" Guard */ diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 207db2f7aa..f7ccbae8b8 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -48,6 +48,10 @@ add_executable(check_util main.c check_util.c) target_link_libraries(check_util ${CHECK_LIBRARIES} gpuarray) add_test(test_util "${CMAKE_CURRENT_BINARY_DIR}/check_util") +add_executable(check_util_integerfactoring main.c check_util_integerfactoring.c) +target_link_libraries(check_util_integerfactoring ${LIBS} gpuarray-static) +add_test(test_util_integerfactoring ${CMAKE_CURRENT_BINARY_DIR}/check_util_integerfactoring) + add_executable(check_array main.c device.c check_array.c) target_link_libraries(check_array ${CHECK_LIBRARIES} gpuarray) add_test(test_array "${CMAKE_CURRENT_BINARY_DIR}/check_array") diff --git a/tests/check_util_integerfactoring.c b/tests/check_util_integerfactoring.c new file mode 100644 index 0000000000..0815e03740 --- /dev/null +++ b/tests/check_util_integerfactoring.c @@ -0,0 +1,74 @@ +/* Includes */ +#include +#include +#include +#include +#include +#include +#include "util/integerfactoring.h" + +/** + * Integer Factorization 
test + */ + +START_TEST(test_integerfactorization) +{ + ga_factor_list fl; + + /** + * Attempt exact factorization for 2^64-1, no k-smoothness constraint. + * Expected PASS with 3*5*17*257*641*65537*6700417 + */ + + ck_assert_int_ne(gaIFactorize(18446744073709551615ULL, 0, 0, &fl), 0); + + /** + * Attempt exact factorization for 2^64-1, 4096-smooth constraint. + * Expected FAIL, because 2^64-1 possesses prime factors in excess of 4096. + */ + + ck_assert_int_eq(gaIFactorize(18446744073709551615ULL, 0, 4096, &fl), 0); + + /** + * Attempt approximate factorization for 2^64-1, no k-smoothness constraint. + * Unlimited growth permitted. + * Expected PASS, since 2^64-1 rounds up to 2^64 and 2^64 trivially factorizes. + */ + + ck_assert_int_ne(gaIFactorize(18446744073709551615ULL, -1, 0, &fl), 0); + + /** + * Attempt exact factorization for 2196095973992233039, no k-smoothness constraint. + * 2196095973992233039 is a large, highly non-smooth number, with three enormous + * factors. + * Expected PASS *very quickly*, since it factorizes as 1299817*1299821*1299827 + */ + + ck_assert_int_ne(gaIFactorize( 2196095973992233039ULL, 0, 0, &fl), 0); + + /** + * Attempt approximate factorization for 2196095973992233039, 64-smooth constraint. + * 2196095973992233039 is a large, highly non-smooth number, with three enormous + * factors. It is not 64-smooth, so code paths that attempt approximate + * factorization within the growth limits (1%) are exercised. + * + * Expected PASS *relatively quickly*. 
+ */ + + ck_assert_int_ne(gaIFactorize( 2196095973992233039ULL, 2196095973992233039ULL*1.01, 64, &fl), 0); +} +END_TEST + + + +Suite *get_suite(void){ + Suite *s = suite_create("util_integerfactoring"); + TCase *tc = tcase_create("All"); + + tcase_add_test(tc, test_integerfactorization); + + suite_add_tcase(s, tc); + + return s; +} + From 53dda877b457f1e0123b4d086be5fff0318e7788 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Mon, 29 Aug 2016 13:51:01 -0400 Subject: [PATCH 027/597] Interface change, argument marshalling + bugfixes. - The GpuArray_maxandargmax() interface has been changed, because it was underspecified. Now, instead of giving a fixed-length vector of booleans indicating which axes are to be reduced and which not, one now gives an ordered list of integers indicating the axes to be reduced. This order matters in computing the output indices of Argmax. - Argument marshalling in invokeMaxAndArgmax() is complete. - Bugfixes all over the place. Remaining work: - Axis reordering in appendRangeCalculations(). - Make scheduleMaxAndArgmax() actually work. 
--- src/gpuarray_reduction.c | 403 +++++++++++++++++++++++++++------------ 1 file changed, 286 insertions(+), 117 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index e09eba5bed..f78c8677e9 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -23,58 +23,70 @@ /* Datatypes */ struct gen_kernel_ctx{ - int numIdx; - const int* isReduced; - int numRedIdx; - int numFreeIdx; - const char* dstMaxType; - const char* dstArgmaxType; + unsigned numIdx; + unsigned reduxLen; + const unsigned* reduxList; + unsigned numFreeIdx; + const char* dstMaxType; + const char* dstArgmaxType; }; typedef struct gen_kernel_ctx gen_kernel_ctx; /* Function prototypes */ -static int getRdxIdx (const int numIdx, - const int* isReduced); -static char* genkernel_maxandargmax (const int numIdx, - const int* isReduced, - const char* dstMaxType, - const char* dstArgmaxType); -static void appendKernel (strb* s, - gen_kernel_ctx* ctx); -static void appendTypedefs (strb* s, - gen_kernel_ctx* ctx); -static void appendPrototype (strb* s, - gen_kernel_ctx* ctx); -static void appendIndexDeclarations(strb* s, - gen_kernel_ctx* ctx); -static void appendIdxes (strb* s, - const char* prologue, - const char* prefix, - int startIdx, - int endIdx, - const char* suffix, - const char* epilogue); -static void appendRangeCalculations(strb* s, - gen_kernel_ctx* ctx); -static void appendLoops (strb* s, - gen_kernel_ctx* ctx); -static void appendLoopMacroDefs (strb* s, - gen_kernel_ctx* ctx); -static void appendLoopOuter (strb* s, - gen_kernel_ctx* ctx); -static void appendLoopInner (strb* s, - gen_kernel_ctx* ctx); -static void appendLoopMacroUndefs (strb* s, - gen_kernel_ctx* ctx); -static void scheduleMaxAndArgmax (size_t* blockSize, - size_t* gridSize, - const GpuArray* src, - const int* isReduced); -static void invokeMaxAndArgmax (GpuKernel* kernel, - const GpuArray* src, - const int* isReduced); +static int checkargsMaxAndArgmax (GpuArray* dstMax, + GpuArray* 
dstArgmax, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +static char* genkernelMaxAndArgmax (unsigned numIdx, + unsigned reduxLen, + const unsigned* reduxList, + const char* dstMaxType, + const char* dstArgmaxType); +static void appendKernel (strb* s, + gen_kernel_ctx* ctx); +static void appendTypedefs (strb* s, + gen_kernel_ctx* ctx); +static void appendPrototype (strb* s, + gen_kernel_ctx* ctx); +static void appendOffsets (strb* s, + gen_kernel_ctx* ctx); +static void appendIndexDeclarations(strb* s, + gen_kernel_ctx* ctx); +static void appendIdxes (strb* s, + const char* prologue, + const char* prefix, + int startIdx, + int endIdx, + const char* suffix, + const char* epilogue); +static void appendRangeCalculations(strb* s, + gen_kernel_ctx* ctx); +static void appendLoops (strb* s, + gen_kernel_ctx* ctx); +static void appendLoopMacroDefs (strb* s, + gen_kernel_ctx* ctx); +static void appendLoopOuter (strb* s, + gen_kernel_ctx* ctx); +static void appendLoopInner (strb* s, + gen_kernel_ctx* ctx); +static void appendLoopMacroUndefs (strb* s, + gen_kernel_ctx* ctx); +static void scheduleMaxAndArgmax (const GpuKernel* kernel, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList, + size_t* blockSize, + size_t* gridSize, + size_t* chunkSize); +static int invokeMaxAndArgmax (GpuKernel* kernel, + GpuArray* dstMax, + GpuArray* dstArgmax, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); /* Function implementation */ @@ -90,10 +102,24 @@ static void invokeMaxAndArgmax (GpuKernel* kernel, * @param [out] dstMax The resulting tensor of maxima * @param [out] dstArgmax the resulting tensor of arguments at maxima * @param [in] src The source tensor. - * @param [in] isReduced Either NULL, or an array of booleans of the same - * size as the dimensionality of the source tensor. - * Axis k is reduced if isReduced[k] is non-zero, - * and is preserved otherwise. + * @param [in] reduxLen The number of axes reduced. 
Must be >= 1 and + * <= src->nd. + * @param [in] reduxList A list of integers of length reduxLen, indicating + * the axes to be reduced. The order of the axes + * matters for dstArgmax index calculations. All + * entries in the list must be unique, >= 0 and + * < src->nd. + * + * For example, if a 5D-tensor is reduced with an axis + * list of [3,4,1], then reduxLen shall be 3, and the + * index calculation in every point shall take the form + * + * dstArgmax[i0,i2] = i3 * src.shape[4] * src.shape[1] + + * i4 * src.shape[1] + + * i1 + * + * where (i3,i4,i1) are the coordinates of the maximum- + * valued element within subtensor [i0,:,i2,:,:] of src. * @return GA_NO_ERROR if the operation was successful, or a non-zero error * code otherwise. */ @@ -101,17 +127,29 @@ static void invokeMaxAndArgmax (GpuKernel* kernel, GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dstMax, GpuArray* dstArgmax, const GpuArray* src, - const int* isReduced){ + unsigned reduxLen, + const unsigned* reduxList){ + /** + * Sanity check on arguments + */ + + if(!checkargsMaxAndArgmax(dstMax, dstArgmax, src, reduxLen, reduxList)){ + return GA_INVALID_ERROR; + } + + /** * Generate kernel source code. 
*/ + int ret; const char* dstMaxType = gpuarray_get_type(src->typecode) -> cluda_name; const char* dstArgmaxType = gpuarray_get_type(GA_SIZE) -> cluda_name; - const char* s = genkernel_maxandargmax(src->nd, - isReduced, - dstMaxType, - dstArgmaxType); + char* s = genkernelMaxAndArgmax(src->nd, + reduxLen, + reduxList, + dstMaxType, + dstArgmaxType); if(!s){return GA_MEMORY_ERROR;} @@ -124,7 +162,7 @@ GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dstMax, GA_SIZE, /* srcOff */ GA_BUFFER, /* srcSteps */ GA_BUFFER, /* srcSize */ - GA_BUFFER, /* numBlk */ + GA_BUFFER, /* chnkSize */ GA_BUFFER, /* dstMax */ GA_SIZE, /* dstMaxOff */ GA_BUFFER, /* dstMaxSteps */ @@ -133,56 +171,87 @@ GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dstMax, GA_BUFFER /* dstArgmaxSteps */ }; - const size_t l = strlen(s); - GpuKernel kernel; - GpuKernel_init(&kernel, 0, 1, &s, &l, "maxandargmax", - 8, ARG_TYPECODE, 0, (char**)0); + const size_t l = strlen(s); + ret = GpuKernel_init(&kernel, 0, 1, (const char**)&s, &l, "maxandargmax", + 11, ARG_TYPECODE, 0, (char**)0); + free(s); + if(ret != GA_NO_ERROR){ + return ret; + } /** * Invoke it. */ - invokeMaxAndArgmax(&kernel, src, isReduced); - - /* Return error code */ - return GA_NO_ERROR; + return invokeMaxAndArgmax(&kernel, dstMax, dstArgmax, src, reduxLen, reduxList); } /** - * Count the number of dimensions to be reduced. + * @brief Check the sanity of the arguments, in agreement with the + * documentation for GpuArray_maxandargmax(). + * + * @param [in] dstMax + * @param [in] dstArgmax + * @param [in] src + * @param [in] reduxLen + * @param [in] reduxList + * @return Zero if arguments invalid; Non-zero otherwise. */ -static int getRdxIdx(const int numIdx, const int* isReduced){ - int i, countReduced; - for(i=0, countReduced = 0;ind == 0 || reduxLen == 0 || reduxLen >= src->nd){ + return 0; + } + + for(i=0;i<(int)reduxLen;i++){ + /* Insane list entry? 
*/ + if(reduxList[i] >= src->nd){ + return 0; + } + + for(j=i-1;j>=0;j--){ + /* Duplicate list entry? */ + if(reduxList[i] == reduxList[j]){ + return 0; + } + } } - return countReduced; + + return 1; } /** * @brief Generate the kernel code for MaxAndArgmax. * * @param [in] numIdx - * @param [in] isReduced + * @param [in] reduxLen + * @param [in] reduxList * @param [in] dstMaxType * @param [in] dstArgmaxType * @return A free()'able string containing source code implementing the * kernel, or else NULL. */ -static char* genkernel_maxandargmax(const int numIdx, - const int* isReduced, +static char* genkernelMaxAndArgmax (unsigned numIdx, + const unsigned reduxLen, + const unsigned* reduxList, const char* dstMaxType, const char* dstArgmaxType){ /* Obtain the parameters of the reduction. */ gen_kernel_ctx ctx; ctx.numIdx = numIdx; - ctx.isReduced = isReduced; - ctx.numRedIdx = getRdxIdx(ctx.numIdx, ctx.isReduced); - ctx.numFreeIdx = ctx.numIdx - ctx.numRedIdx; + ctx.reduxLen = reduxLen; + ctx.reduxList = reduxList; + ctx.numFreeIdx = ctx.numIdx - ctx.reduxLen; ctx.dstMaxType = dstMaxType; ctx.dstArgmaxType = dstArgmaxType; @@ -191,12 +260,12 @@ static char* genkernel_maxandargmax(const int numIdx, appendKernel(&s, &ctx); return strb_cstr(&s); } - static void appendKernel (strb* s, gen_kernel_ctx* ctx){ appendTypedefs (s, ctx); appendPrototype (s, ctx); strb_appends (s, "{\n"); + appendOffsets (s, ctx); appendIndexDeclarations(s, ctx); appendRangeCalculations(s, ctx); appendLoops (s, ctx); @@ -214,12 +283,15 @@ static void appendTypedefs (strb* s, static void appendPrototype (strb* s, gen_kernel_ctx* ctx){ strb_appends(s, "KERNEL void maxandargmax(const T* src,\n"); + strb_appends(s, " const X srcOff,\n"); strb_appends(s, " const X* srcSteps,\n"); strb_appends(s, " const X* srcSize,\n"); - strb_appends(s, " const X* blkNum,\n"); + strb_appends(s, " const X* chunkSize,\n"); strb_appends(s, " T* dstMax,\n"); + strb_appends(s, " const X dstMaxOff,\n"); strb_appends(s, " 
const X* dstMaxSteps,\n"); strb_appends(s, " X* dstArgmax,\n"); + strb_appends(s, " const X dstArgmaxOff,\n"); strb_appends(s, " const X* dstArgmaxSteps)"); } static void appendOffsets (strb* s, @@ -235,23 +307,24 @@ static void appendIndexDeclarations(strb* s, gen_kernel_ctx* ctx){ strb_appends(s, "\t/* GPU kernel coordinates. Always 3D. */\n"); - strb_appends(s, "\tX bi0 = GID_0, bi1 = GID_1, bi2 = GID_2;\n"); - strb_appends(s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n"); - strb_appends(s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n"); + strb_appends(s, "\tX bi0 = GID_0, bi1 = GID_1, bi2 = GID_2;\n"); + strb_appends(s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n"); + strb_appends(s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n"); + strb_appends(s, "\tX gi0 = bi0*bd0+ti0, gi1 = bi1*bd1+ti1, gi2 = bi2*bd2+ti2;\n"); strb_appends(s, "\t\n"); strb_appends(s, "\t\n"); strb_appends(s, "\t/* Free indices & Reduction indices */\n"); - appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "Blk", ";\n"); - appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "", ";\n"); - appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "Dim", ";\n"); - appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "Start", ";\n"); - appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "End", ";\n"); - appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "SStep", ";\n"); - appendIdxes (s, "\tX ", "i", 0, ctx->numFreeIdx, "MStep", ";\n"); - appendIdxes (s, "\tX ", "i", 0, ctx->numFreeIdx, "AStep", ";\n"); - appendIdxes (s, "\tX ", "i", ctx->numFreeIdx, ctx->numIdx, "PDim", ";\n"); + appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "ChunkSz", ";\n"); + appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "", ";\n"); + appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "Dim", ";\n"); + appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "Start", ";\n"); + appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "End", ";\n"); + appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "SStep", ";\n"); + appendIdxes (s, "\tX ", "i", 0, ctx->numFreeIdx, "MStep", ";\n"); + appendIdxes 
(s, "\tX ", "i", 0, ctx->numFreeIdx, "AStep", ";\n"); + appendIdxes (s, "\tX ", "i", ctx->numFreeIdx, ctx->numIdx, "PDim", ";\n"); strb_appends(s, "\t\n"); strb_appends(s, "\t\n"); @@ -278,24 +351,25 @@ static void appendIdxes (strb* s, } static void appendRangeCalculations(strb* s, gen_kernel_ctx* ctx){ + /* FIXME: Reorder axes. */ int i; strb_appends(s, "\t/* Compute ranges for this thread. */\n"); for(i=0;inumIdx ;i++){/* i*Dim = srcSize[*]; */ - strb_appendf(s, "\ti%dDim = srcSize[%d];\n", i, i); + strb_appendf(s, "\ti%dDim = srcSize[%d];\n", i, i); } for(i=0;inumIdx ;i++){/* i*SStep = srcSteps[*]; */ - strb_appendf(s, "\ti%dSStep = srcSteps[%d];\n", i, i); + strb_appendf(s, "\ti%dSStep = srcSteps[%d];\n", i, i); } for(i=0;inumFreeIdx;i++){/* i*MStep = dstMaxSteps[*]; */ - strb_appendf(s, "\ti%dMStep = dstMaxSteps[%d];\n", i, i); + strb_appendf(s, "\ti%dMStep = dstMaxSteps[%d];\n", i, i); } for(i=0;inumFreeIdx;i++){/* i*AStep = dstArgmaxSteps[*]; */ - strb_appendf(s, "\ti%dMStep = dstArgmaxSteps[%d];\n", i, i); + strb_appendf(s, "\ti%dMStep = dstArgmaxSteps[%d];\n", i, i); } - for(i=0;inumIdx ;i++){/* i*Blk = numBlk[*]; */ - strb_appendf(s, "\ti%dBlk = numBlk[%d];\n", i, i); + for(i=0;inumIdx ;i++){/* i*ChunkSz = numBlk[*]; */ + strb_appendf(s, "\ti%dChunkSz = chunkSize[%d];\n", i, i); } for(i=ctx->numIdx-1;i>=ctx->numFreeIdx;i--){/* i*PDim = ...; */ /** @@ -316,7 +390,7 @@ static void appendRangeCalculations(strb* s, */ if(i < 3){ - strb_appendf(s, "\ti%dStart = ((bi%d * bd%d) + ti%d) * i%dBlk;\n", i, i, i, i, i); + strb_appendf(s, "\ti%dStart = gi%d * i%dChunkSz;\n", i, i, i); }else{ strb_appendf(s, "\ti%dStart = 0;\n", i); } @@ -328,7 +402,7 @@ static void appendRangeCalculations(strb* s, */ if(i < 3){ - strb_appendf(s, "\ti%dEnd = i%dStart + bd%d * i%dBlk;\n", i, i, i, i); + strb_appendf(s, "\ti%dEnd = i%dStart + bd%d * i%dChunkSz;\n", i, i, i, i); }else{ strb_appendf(s, "\ti%dEnd = i%dStart + i%dDim;\n", i, i, i); } @@ -444,7 +518,7 @@ static void 
appendLoopInner (strb* s, strb_appends(s, "\t\n"); appendIdxes (s, "\tT maxV = SRCINDEXER(", "i", 0, ctx->numFreeIdx, "", ""); - if(ctx->numFreeIdx && ctx->numRedIdx){strb_appends(s, ",");} + if(ctx->numFreeIdx && ctx->reduxLen){strb_appends(s, ",");} appendIdxes (s, "", "i", ctx->numFreeIdx, ctx->numIdx, "Start", ");\n"); appendIdxes (s, "\tX maxI = RDXINDEXER(", "i", ctx->numFreeIdx, ctx->numIdx, "Start", ");\n"); @@ -505,14 +579,16 @@ static void appendLoopMacroUndefs (strb* s, } /** - * Compute a good thread block size / grid size for Nvidia. + * Compute a good thread block size / grid size / software chunk size for Nvidia. */ static void scheduleMaxAndArgmax (const GpuKernel* kernel, const GpuArray* src, - const int* isReduced, + unsigned reduxLen, + const unsigned* reduxList, size_t* blockSize, - size_t* gridSize){ + size_t* gridSize, + size_t* chunkSize){ int i, j; /* Obtain the constraints of our problem. */ @@ -529,8 +605,9 @@ static void scheduleMaxAndArgmax (const GpuKernel* kernel, gpudata_property (src->data, GA_CTX_PROP_MAXGSIZE1, &maxG1); gpudata_property (src->data, GA_CTX_PROP_MAXGSIZE2, &maxG2); - int numRdxIdx = getRdxIdx(src->nd, isReduced); + int numRdxIdx = reduxLen; int numFreeIdx = src->nd - numRdxIdx; + (void)numFreeIdx; /** * Select which reduction dimensions will be associated with which hardware @@ -541,11 +618,12 @@ static void scheduleMaxAndArgmax (const GpuKernel* kernel, uint64_t dimSize [3] = { 1, 1, 1}; double slack [3] = {1.1, 1.1, 1.1}; uint64_t kSmooth [3]; - GA_FACTOR_LIST factDims[3]; - GA_FACTOR_LIST factTBS [3]; + ga_factor_list factDims[3]; + ga_factor_list factTBS [3]; uint64_t tBS = 1; uint64_t minThrd = 64; uint64_t maxThrd = 256; + (void)dims; /************************************************************************ * FIXME: Need logic to select up to 3 dimensions and plug them in dimSize! @@ -618,18 +696,109 @@ static void scheduleMaxAndArgmax (const GpuKernel* kernel, * Invoke the kernel. 
*/ -static void invokeMaxAndArgmax (GpuKernel* k, +static int invokeMaxAndArgmax (GpuKernel* k, + GpuArray* dstMax, + GpuArray* dstArgmax, const GpuArray* src, - const int* isReduced){ - size_t blockSize[3]; - size_t gridSize[3]; - - scheduleMaxAndArgmax(k, src, isReduced, blockSize, gridSize); - GpuKernel_call(k, - getRdxIdx(src->nd, isReduced), - blockSize, - gridSize, - 0, - NULL); + unsigned reduxLen, + const unsigned* reduxList){ + int ret; + size_t blockSize[3] = {1,1,1}; + size_t gridSize [3] = {1,1,1}; + size_t chunkSize[3] = {1,1,1}; + gpudata* srcStepsGD = 0, *srcSizeGD = 0, *chunkSizeGD = 0, + *dstMaxStepsGD = 0, *dstArgmaxStepsGD = 0; + gpucontext* ctx = GpuArray_context(src); + + + /** + * Schedule the kernel. + * + * This implies choosing the block, grid and chunk size appropriately. + */ + + scheduleMaxAndArgmax(k, src, reduxLen, reduxList, + blockSize, gridSize, chunkSize); + + + /** + * Argument Marshalling. This the grossest gross thing in here. + */ + + srcStepsGD = gpudata_alloc(ctx, src->nd * sizeof(size_t), + src->strides, + GA_BUFFER_READ_ONLY|GA_BUFFER_INIT, &ret); + if(ret){goto releaseGpudata;} + srcSizeGD = gpudata_alloc(ctx, src->nd * sizeof(size_t), + src->dimensions, + GA_BUFFER_READ_ONLY|GA_BUFFER_INIT, &ret); + if(ret){goto releaseGpudata;} + chunkSizeGD = gpudata_alloc(ctx, src->nd * sizeof(size_t), + chunkSize, + GA_BUFFER_READ_ONLY|GA_BUFFER_INIT, &ret); + if(ret){goto releaseGpudata;} + dstMaxStepsGD = gpudata_alloc(ctx, (src->nd - reduxLen) * sizeof(size_t), + dstMax->strides, + GA_BUFFER_READ_ONLY|GA_BUFFER_INIT, &ret); + if(ret){goto releaseGpudata;} + dstArgmaxStepsGD = gpudata_alloc(ctx, (src->nd - reduxLen) * sizeof(size_t), + dstArgmax->strides, + GA_BUFFER_READ_ONLY|GA_BUFFER_INIT, &ret); + if(ret){goto releaseGpudata;} + + + struct MaxAndArgmaxArgs{ + gpudata* src; + size_t srcOff; + gpudata* srcSteps; + gpudata* srcSize; + gpudata* chunkSize; + gpudata* dstMax; + size_t dstMaxOff; + gpudata* dstMaxSteps; + gpudata* 
dstArgmax; + size_t dstArgmaxOff; + gpudata* dstArgmaxSteps; + } argstr = { + src->data, + src->offset, + srcStepsGD, + srcSizeGD, + chunkSizeGD, + dstMax->data, + dstMax->offset, + dstMaxStepsGD, + dstArgmax->data, + dstArgmax->offset, + dstArgmaxStepsGD + }; + + void* args[] = { + (void*)&argstr.src, + (void*)&argstr.srcOff, + (void*)&argstr.srcSteps, + (void*)&argstr.srcSize, + (void*)&argstr.chunkSize, + (void*)&argstr.dstMax, + (void*)&argstr.dstMaxOff, + (void*)&argstr.dstMaxSteps, + (void*)&argstr.dstArgmax, + (void*)&argstr.dstArgmaxOff, + (void*)&argstr.dstArgmaxSteps + }; + + + /** + * Call kernel, release arguments and return error code + */ + + ret = GpuKernel_call(k, 3, blockSize, gridSize, 0, args); + releaseGpudata: + gpudata_release(srcStepsGD); + gpudata_release(srcSizeGD); + gpudata_release(chunkSizeGD); + gpudata_release(dstMaxStepsGD); + gpudata_release(dstArgmaxStepsGD); + return ret; } From 101296a241353334f3642aaaf5d59b63ccda2ea5 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Tue, 30 Aug 2016 14:31:53 -0400 Subject: [PATCH 028/597] Proper axis reordering support. The code generator now supports properly usecases in which the reduction axes were not the last k axes out of n, and also allows arbitrary ordering of the reduction indices, which can affect the calculation of the argmax index. Also, some style and alignment fixes. 
--- src/gpuarray_reduction.c | 147 +++++++++++++++++++++++++-------------- 1 file changed, 95 insertions(+), 52 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index f78c8677e9..fa2c052607 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -25,8 +25,9 @@ struct gen_kernel_ctx{ unsigned numIdx; unsigned reduxLen; - const unsigned* reduxList; + const unsigned* reduxList; unsigned numFreeIdx; + unsigned* axisList; const char* dstMaxType; const char* dstArgmaxType; }; @@ -74,6 +75,10 @@ static void appendLoopInner (strb* s, gen_kernel_ctx* ctx); static void appendLoopMacroUndefs (strb* s, gen_kernel_ctx* ctx); +static void computeAxisList (unsigned* axisList, + unsigned numAxis, + const unsigned* reduxList, + unsigned reduxLen); static void scheduleMaxAndArgmax (const GpuKernel* kernel, const GpuArray* src, unsigned reduxLen, @@ -200,15 +205,16 @@ GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dstMax, * @return Zero if arguments invalid; Non-zero otherwise. */ -static int checkargsMaxAndArgmax (GpuArray* dstMax, - GpuArray* dstArgmax, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ +static int checkargsMaxAndArgmax (GpuArray* dstMax, + GpuArray* dstArgmax, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ int i, j; /* Insane src or reduxLen? */ - if(!src || src->nd == 0 || reduxLen == 0 || reduxLen >= src->nd){ + if(!dstMax || !dstArgmax || !src || src->nd == 0 || reduxLen == 0 || + reduxLen >= src->nd){ return 0; } @@ -241,27 +247,38 @@ static int checkargsMaxAndArgmax (GpuArray* dstMax, * kernel, or else NULL. */ -static char* genkernelMaxAndArgmax (unsigned numIdx, - const unsigned reduxLen, - const unsigned* reduxList, - const char* dstMaxType, - const char* dstArgmaxType){ - /* Obtain the parameters of the reduction. 
*/ +static char* genkernelMaxAndArgmax (unsigned numIdx, + const unsigned reduxLen, + const unsigned* reduxList, + const char* dstMaxType, + const char* dstArgmaxType){ + /* Save the parameters of the reduction in a generator context. */ gen_kernel_ctx ctx; ctx.numIdx = numIdx; ctx.reduxLen = reduxLen; ctx.reduxList = reduxList; ctx.numFreeIdx = ctx.numIdx - ctx.reduxLen; + ctx.axisList = malloc(numIdx*sizeof(unsigned)); ctx.dstMaxType = dstMaxType; ctx.dstArgmaxType = dstArgmaxType; + if(!ctx.axisList){ + return NULL; + } + + /* Compute internal axis remapping. */ + computeAxisList(ctx.axisList, ctx.numIdx, ctx.reduxList, ctx.reduxLen); + /* Generate kernel proper. */ strb s = STRB_STATIC_INIT; strb_ensure(&s, 5*1024); appendKernel(&s, &ctx); + free(ctx.axisList); + + /* Return it. */ return strb_cstr(&s); } -static void appendKernel (strb* s, - gen_kernel_ctx* ctx){ +static void appendKernel (strb* s, + gen_kernel_ctx* ctx){ appendTypedefs (s, ctx); appendPrototype (s, ctx); strb_appends (s, "{\n"); @@ -271,8 +288,8 @@ static void appendKernel (strb* s, appendLoops (s, ctx); strb_appends (s, "}\n"); } -static void appendTypedefs (strb* s, - gen_kernel_ctx* ctx){ +static void appendTypedefs (strb* s, + gen_kernel_ctx* ctx){ strb_appends(s, "/* Typedefs */\n"); strb_appendf(s, "typedef %s T;/* The type of the array being processed. */\n", ctx->dstMaxType); strb_appendf(s, "typedef %s X;/* Index type: signed 32/64-bit. 
*/\n", ctx->dstArgmaxType); @@ -280,8 +297,8 @@ static void appendTypedefs (strb* s, strb_appends(s, "\n"); strb_appends(s, "\n"); } -static void appendPrototype (strb* s, - gen_kernel_ctx* ctx){ +static void appendPrototype (strb* s, + gen_kernel_ctx* ctx){ strb_appends(s, "KERNEL void maxandargmax(const T* src,\n"); strb_appends(s, " const X srcOff,\n"); strb_appends(s, " const X* srcSteps,\n"); @@ -294,8 +311,8 @@ static void appendPrototype (strb* s, strb_appends(s, " const X dstArgmaxOff,\n"); strb_appends(s, " const X* dstArgmaxSteps)"); } -static void appendOffsets (strb* s, - gen_kernel_ctx* ctx){ +static void appendOffsets (strb* s, + gen_kernel_ctx* ctx){ strb_appends(s, "/* Add offsets */\n"); strb_appends(s, "src = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src + srcOff);\n"); strb_appends(s, "dstMax = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dstMax + dstMaxOff);\n"); @@ -303,8 +320,8 @@ static void appendOffsets (strb* s, strb_appends(s, "\n"); strb_appends(s, "\n"); } -static void appendIndexDeclarations(strb* s, - gen_kernel_ctx* ctx){ +static void appendIndexDeclarations(strb* s, + gen_kernel_ctx* ctx){ strb_appends(s, "\t/* GPU kernel coordinates. Always 3D. */\n"); strb_appends(s, "\tX bi0 = GID_0, bi1 = GID_1, bi2 = GID_2;\n"); @@ -329,13 +346,13 @@ static void appendIndexDeclarations(strb* s, strb_appends(s, "\t\n"); strb_appends(s, "\t\n"); } -static void appendIdxes (strb* s, - const char* prologue, - const char* prefix, - int startIdx, - int endIdx, - const char* suffix, - const char* epilogue){ +static void appendIdxes (strb* s, + const char* prologue, + const char* prefix, + int startIdx, + int endIdx, + const char* suffix, + const char* epilogue){ int i; prologue = prologue ? prologue : ""; @@ -349,18 +366,18 @@ static void appendIdxes (strb* s, } strb_appends(s, epilogue); } -static void appendRangeCalculations(strb* s, - gen_kernel_ctx* ctx){ - /* FIXME: Reorder axes. 
*/ +static void appendRangeCalculations(strb* s, + gen_kernel_ctx* ctx){ int i; + /* Use internal remapping when computing the ranges for this thread. */ strb_appends(s, "\t/* Compute ranges for this thread. */\n"); for(i=0;inumIdx ;i++){/* i*Dim = srcSize[*]; */ - strb_appendf(s, "\ti%dDim = srcSize[%d];\n", i, i); + strb_appendf(s, "\ti%dDim = srcSize[%d];\n", i, ctx->axisList[i]); } for(i=0;inumIdx ;i++){/* i*SStep = srcSteps[*]; */ - strb_appendf(s, "\ti%dSStep = srcSteps[%d];\n", i, i); + strb_appendf(s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->axisList[i]); } for(i=0;inumFreeIdx;i++){/* i*MStep = dstMaxSteps[*]; */ strb_appendf(s, "\ti%dMStep = dstMaxSteps[%d];\n", i, i); @@ -369,7 +386,7 @@ static void appendRangeCalculations(strb* s, strb_appendf(s, "\ti%dMStep = dstArgmaxSteps[%d];\n", i, i); } for(i=0;inumIdx ;i++){/* i*ChunkSz = numBlk[*]; */ - strb_appendf(s, "\ti%dChunkSz = chunkSize[%d];\n", i, i); + strb_appendf(s, "\ti%dChunkSz = chunkSize[%d];\n", i, ctx->axisList[i]); } for(i=ctx->numIdx-1;i>=ctx->numFreeIdx;i--){/* i*PDim = ...; */ /** @@ -411,8 +428,8 @@ static void appendRangeCalculations(strb* s, strb_appends(s, "\t\n"); strb_appends(s, "\t\n"); } -static void appendLoops (strb* s, - gen_kernel_ctx* ctx){ +static void appendLoops (strb* s, + gen_kernel_ctx* ctx){ strb_appends(s, "\t/**\n"); strb_appends(s, "\t * FREE LOOPS.\n"); strb_appends(s, "\t */\n"); @@ -422,8 +439,8 @@ static void appendLoops (strb* s, appendLoopOuter (s, ctx); appendLoopMacroUndefs(s, ctx); } -static void appendLoopMacroDefs (strb* s, - gen_kernel_ctx* ctx){ +static void appendLoopMacroDefs (strb* s, + gen_kernel_ctx* ctx){ int i; /** @@ -478,8 +495,8 @@ static void appendLoopMacroDefs (strb* s, } strb_appends(s, "0]\n"); } -static void appendLoopOuter (strb* s, - gen_kernel_ctx* ctx){ +static void appendLoopOuter (strb* s, + gen_kernel_ctx* ctx){ int i; /** @@ -504,8 +521,8 @@ static void appendLoopOuter (strb* s, strb_appends(s, "\t}\n"); } } -static void 
appendLoopInner (strb* s, - gen_kernel_ctx* ctx){ +static void appendLoopInner (strb* s, + gen_kernel_ctx* ctx){ int i; /** @@ -568,8 +585,8 @@ static void appendLoopInner (strb* s, appendIdxes (s, "\tDSTMINDEXER(", "i", 0, ctx->numFreeIdx, "", ") = maxV;\n"); appendIdxes (s, "\tDSTAINDEXER(", "i", 0, ctx->numFreeIdx, "", ") = maxI;\n"); } -static void appendLoopMacroUndefs (strb* s, - gen_kernel_ctx* ctx){ +static void appendLoopMacroUndefs (strb* s, + gen_kernel_ctx* ctx){ strb_appends(s, "\t#undef FOROVER\n"); strb_appends(s, "\t#undef ESCAPE\n"); strb_appends(s, "\t#undef SRCINDEXER\n"); @@ -577,6 +594,32 @@ static void appendLoopMacroUndefs (strb* s, strb_appends(s, "\t#undef DSTMINDEXER\n"); strb_appends(s, "\t#undef DSTAINDEXER\n"); } +static void computeAxisList (unsigned* axisList, + unsigned numAxis, + const unsigned* reduxList, + unsigned reduxLen){ + unsigned i, j, f=0, r=numAxis-reduxLen; + + for(i=0;i Date: Mon, 19 Sep 2016 11:15:10 -0400 Subject: [PATCH 029/597] Added check_reduction test. Also moved the doc and prototype of GpuArray_maxandargmax() to gpuarray/ array.h. --- src/gpuarray/array.h | 39 ++++++++++++ tests/CMakeLists.txt | 4 ++ tests/check_reduction.c | 138 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 181 insertions(+) create mode 100644 tests/check_reduction.c diff --git a/src/gpuarray/array.h b/src/gpuarray/array.h index 8cbad94098..aabf14ee02 100644 --- a/src/gpuarray/array.h +++ b/src/gpuarray/array.h @@ -607,6 +607,45 @@ GPUARRAY_PUBLIC void GpuArray_fprintf(FILE *fd, const GpuArray *a); GPUARRAY_PUBLIC int GpuArray_fdump(FILE *fd, const GpuArray *a); +/** + * @brief Computes simultaneously the maxima and the arguments of maxima over + * specified axes of the tensor. + * + * Returns two tensors of identical shape. Both tensors' axes are a subset of + * the axes of the original tensor. The axes to be reduced are specified by + * the caller, and the maxima and arguments of maxima are computed over them. 
+ * + * @param [out] dstMax The resulting tensor of maxima + * @param [out] dstArgmax the resulting tensor of arguments at maxima + * @param [in] src The source tensor. + * @param [in] reduxLen The number of axes reduced. Must be >= 1 and + * <= src->nd. + * @param [in] reduxList A list of integers of length reduxLen, indicating + * the axes to be reduced. The order of the axes + * matters for dstArgmax index calculations. All + * entries in the list must be unique, >= 0 and + * < src->nd. + * + * For example, if a 5D-tensor is reduced with an axis + * list of [3,4,1], then reduxLen shall be 3, and the + * index calculation in every point shall take the form + * + * dstArgmax[i0,i2] = i3 * src.shape[4] * src.shape[1] + + * i4 * src.shape[1] + + * i1 + * + * where (i3,i4,i1) are the coordinates of the maximum- + * valued element within subtensor [i0,:,i2,:,:] of src. + * @return GA_NO_ERROR if the operation was successful, or a non-zero error + * code otherwise. + */ + +GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dstMax, + GpuArray* dstArgmax, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); + #ifdef __cplusplus } #endif diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index f7ccbae8b8..0bbf109b05 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -52,6 +52,10 @@ add_executable(check_util_integerfactoring main.c check_util_integerfactoring.c) target_link_libraries(check_util_integerfactoring ${LIBS} gpuarray-static) add_test(test_util_integerfactoring ${CMAKE_CURRENT_BINARY_DIR}/check_util_integerfactoring) +add_executable(check_reduction main.c device.c check_reduction.c) +target_link_libraries(check_reduction ${LIBS} gpuarray) +add_test(test_reduction ${CMAKE_CURRENT_BINARY_DIR}/check_reduction) + add_executable(check_array main.c device.c check_array.c) target_link_libraries(check_array ${CHECK_LIBRARIES} gpuarray) add_test(test_array "${CMAKE_CURRENT_BINARY_DIR}/check_array") diff --git 
a/tests/check_reduction.c b/tests/check_reduction.c new file mode 100644 index 0000000000..a522e27eaa --- /dev/null +++ b/tests/check_reduction.c @@ -0,0 +1,138 @@ +#include + +#include +#include +#include +#include + +#include +#include + + +extern void *ctx; + +void setup(void); +void teardown(void); + + +/* Defines */ +#define ga_assert_ok(e) ck_assert_int_eq(e, GA_NO_ERROR) + + + + +/** + * PRNG based on PCG XSH RR 64/32 (LCG) + * + * Used to generate random data for the kernel tests. + */ + +/* Forward Declarations */ +static uint32_t pcgRor32 (uint32_t x, uint32_t n); +static void pcgSeed (uint64_t seed); +static uint32_t pcgRand (void); +static double pcgRand01(void); +/* Definitions */ +static uint64_t pcgS = 1;/* State */ +static const uint64_t pcgM = 6364136223846793005;/* Multiplier */ +static const uint64_t pcgA = 1442695040888963407;/* Addend */ +static uint32_t pcgRor32 (uint32_t x, uint32_t n){ + return (n &= 0x1F) ? x>>n | x<<(32-n) : x; +} +static void pcgSeed (uint64_t seed){ + pcgS = seed; +} +static uint32_t pcgRand (void){ + pcgS = pcgS*pcgM + pcgA; + + /** + * PCG does something akin to an unbalanced Feistel round to blind the LCG + * state: + * + * The rightmost 59 bits are involved in an xorshift by 18. + * The leftmost 5 bits select a rotation of the 32 bits 58:27. + */ + + return pcgRor32((pcgS^(pcgS>>18))>>27, pcgS>>59); +} +static double pcgRand01(void){ + uint64_t u = pcgRand(), l = pcgRand(); + uint64_t x = u<<32 | l; + return x /18446744073709551616.0; +} + + + +START_TEST(test_reduction){ + pcgSeed(1); + + /** + * We test here a reduction of some random 3D tensor on the first and + * third dimensions. 
+ */ + + size_t i,j,k; + size_t dims[3] = {32,50,79}; + size_t prodDims = dims[0]*dims[1]*dims[2]; + + float* pSrc = malloc(sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); + float* pMax = malloc(sizeof(*pMax) * dims[1] ); + size_t* pArgmax = malloc(sizeof(*pArgmax) * dims[1] ); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pMax, NULL); + ck_assert_ptr_ne(pArgmax, NULL); + + for(i=0;i gtMax){ + gtMax = v; + gtArgmax = i*dims[1]*dims[2] + k; + } + } + } + + ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); + ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); + } +}END_TEST + +Suite *get_suite(void) { + Suite *s = suite_create("reduction"); + TCase *tc = tcase_create("basic"); + tcase_add_checked_fixture(tc, setup, teardown); + tcase_set_timeout(tc, 8.0); + + tcase_add_test(tc, test_reduction); + + suite_add_tcase(s, tc); + return s; +} + From 08d897f48068a358af2f33a343b896c44e15badd Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Tue, 20 Sep 2016 03:17:36 -0400 Subject: [PATCH 030/597] Bugfixes related to test. Currently one remaining segfault, in kernel execution. --- src/gpuarray_reduction.c | 41 ++++------------------------------------ tests/check_reduction.c | 1 + 2 files changed, 5 insertions(+), 37 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index fa2c052607..a818f173ad 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -95,40 +95,6 @@ static int invokeMaxAndArgmax (GpuKernel* kernel, /* Function implementation */ - -/** - * @brief Computes simultaneously the maxima and the arguments of maxima over - * specified axes of the tensor. - * - * Returns two tensors of identical shape. Both tensors' axes are a subset of - * the axes of the original tensor. The axes to be reduced are specified by - * the caller, and the maxima and arguments of maxima are computed over them. 
- * - * @param [out] dstMax The resulting tensor of maxima - * @param [out] dstArgmax the resulting tensor of arguments at maxima - * @param [in] src The source tensor. - * @param [in] reduxLen The number of axes reduced. Must be >= 1 and - * <= src->nd. - * @param [in] reduxList A list of integers of length reduxLen, indicating - * the axes to be reduced. The order of the axes - * matters for dstArgmax index calculations. All - * entries in the list must be unique, >= 0 and - * < src->nd. - * - * For example, if a 5D-tensor is reduced with an axis - * list of [3,4,1], then reduxLen shall be 3, and the - * index calculation in every point shall take the form - * - * dstArgmax[i0,i2] = i3 * src.shape[4] * src.shape[1] + - * i4 * src.shape[1] + - * i1 - * - * where (i3,i4,i1) are the coordinates of the maximum- - * valued element within subtensor [i0,:,i2,:,:] of src. - * @return GA_NO_ERROR if the operation was successful, or a non-zero error - * code otherwise. - */ - GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dstMax, GpuArray* dstArgmax, const GpuArray* src, @@ -178,8 +144,9 @@ GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dstMax, GpuKernel kernel; const size_t l = strlen(s); - ret = GpuKernel_init(&kernel, 0, 1, (const char**)&s, &l, "maxandargmax", - 11, ARG_TYPECODE, 0, (char**)0); + ret = GpuKernel_init(&kernel, gpudata_context(src->data), + 1, (const char**)&s, &l, "maxandargmax", + 11, ARG_TYPECODE, GA_USE_CLUDA, (char**)0); free(s); if(ret != GA_NO_ERROR){ return ret; @@ -745,7 +712,7 @@ static int invokeMaxAndArgmax (GpuKernel* k, const GpuArray* src, unsigned reduxLen, const unsigned* reduxList){ - int ret; + int ret = 0; size_t blockSize[3] = {1,1,1}; size_t gridSize [3] = {1,1,1}; size_t chunkSize[3] = {1,1,1}; diff --git a/tests/check_reduction.c b/tests/check_reduction.c index a522e27eaa..a26a369dac 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -7,6 +7,7 @@ #include #include +#include extern void *ctx; From 
2153b0b83cb3899a053290d4ad303d707c712123 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Tue, 27 Sep 2016 04:27:31 -0400 Subject: [PATCH 031/597] Bugfixes and refactoring all around. Test-case now runs without segfaulting and passes with any valid static scheduling. --- src/gpuarray_reduction.c | 340 ++++++++++++++++++++++++--------------- tests/check_reduction.c | 112 ++++++++++++- 2 files changed, 316 insertions(+), 136 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index a818f173ad..752a83498b 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -26,6 +26,8 @@ struct gen_kernel_ctx{ unsigned numIdx; unsigned reduxLen; const unsigned* reduxList; + unsigned hwAxisLen; + const unsigned* hwAxisList; unsigned numFreeIdx; unsigned* axisList; const char* dstMaxType; @@ -36,14 +38,25 @@ typedef struct gen_kernel_ctx gen_kernel_ctx; /* Function prototypes */ +static int axisInSet (unsigned v, + const unsigned* set, + size_t setLen, + size_t* where); static int checkargsMaxAndArgmax (GpuArray* dstMax, GpuArray* dstArgmax, const GpuArray* src, unsigned reduxLen, const unsigned* reduxList); -static char* genkernelMaxAndArgmax (unsigned numIdx, +static void selectHwAxes (unsigned* hwAxesLen, + unsigned* hwAxisList, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +static char* gensourceMaxAndArgmax (unsigned numIdx, unsigned reduxLen, const unsigned* reduxList, + const unsigned hwAxisLen, + const unsigned* hwAxisList, const char* dstMaxType, const char* dstArgmaxType); static void appendKernel (strb* s, @@ -79,6 +92,9 @@ static void computeAxisList (unsigned* axisList, unsigned numAxis, const unsigned* reduxList, unsigned reduxLen); +static int compileMaxAndArgmax (GpuKernel* kernel, + const char* src, + gpucontext* ctx); static void scheduleMaxAndArgmax (const GpuKernel* kernel, const GpuArray* src, unsigned reduxLen, @@ -91,7 +107,9 @@ static int invokeMaxAndArgmax (GpuKernel* kernel, GpuArray* 
dstArgmax, const GpuArray* src, unsigned reduxLen, - const unsigned* reduxList); + const unsigned* reduxList, + unsigned hwAxisLen, + const unsigned* hwAxisList); /* Function implementation */ @@ -100,69 +118,74 @@ GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dstMax, const GpuArray* src, unsigned reduxLen, const unsigned* reduxList){ - /** - * Sanity check on arguments - */ - + int ret = 0; + unsigned hwAxisLen = 0; + unsigned hwAxisList[3] = {0,0,0}; + const char* dstMaxType = NULL; + const char* dstArgmaxType = NULL; + char* s = NULL; + GpuKernel kernel; + + /* Sanity-check arguments */ if(!checkargsMaxAndArgmax(dstMax, dstArgmax, src, reduxLen, reduxList)){ return GA_INVALID_ERROR; } + /* Select hardware axis mapping */ + selectHwAxes(&hwAxisLen, hwAxisList, src, reduxLen, reduxList); - /** - * Generate kernel source code. - */ - - int ret; - const char* dstMaxType = gpuarray_get_type(src->typecode) -> cluda_name; - const char* dstArgmaxType = gpuarray_get_type(GA_SIZE) -> cluda_name; - char* s = genkernelMaxAndArgmax(src->nd, - reduxLen, - reduxList, - dstMaxType, - dstArgmaxType); + /* Generate kernel source code */ + dstMaxType = gpuarray_get_type(src->typecode)->cluda_name; + dstArgmaxType = gpuarray_get_type(GA_SSIZE) ->cluda_name; + s = gensourceMaxAndArgmax(src->nd, + reduxLen, reduxList, + hwAxisLen, hwAxisList, + dstMaxType, dstArgmaxType); if(!s){return GA_MEMORY_ERROR;} - - /** - * Compile it. 
- */ - - const int ARG_TYPECODE[11] = { - GA_BUFFER, /* src */ - GA_SIZE, /* srcOff */ - GA_BUFFER, /* srcSteps */ - GA_BUFFER, /* srcSize */ - GA_BUFFER, /* chnkSize */ - GA_BUFFER, /* dstMax */ - GA_SIZE, /* dstMaxOff */ - GA_BUFFER, /* dstMaxSteps */ - GA_BUFFER, /* dstArgmax */ - GA_SIZE, /* dstArgmaxOff */ - GA_BUFFER /* dstArgmaxSteps */ - }; - - GpuKernel kernel; - const size_t l = strlen(s); - ret = GpuKernel_init(&kernel, gpudata_context(src->data), - 1, (const char**)&s, &l, "maxandargmax", - 11, ARG_TYPECODE, GA_USE_CLUDA, (char**)0); + /* Compile kernel source code */ + ret = compileMaxAndArgmax(&kernel, s, gpudata_context(src->data)); free(s); if(ret != GA_NO_ERROR){ return ret; } + /* Invoke compiled kernel */ + return invokeMaxAndArgmax(&kernel, dstMax, dstArgmax, src, + reduxLen, reduxList, hwAxisLen, hwAxisList); +} + +/** + * @brief Check whether axis numbered v is already in the given set of axes. + * + * @param [in] v + * @param [in] set + * @param [in] setLen + * @param [out] where + * @return Non-zero if the set is non-empty and v is in it; Zero otherwise. + */ + +static int axisInSet (unsigned v, + const unsigned* set, + size_t setLen, + size_t* where){ + size_t i; + + for(i=0;ind == 0 || reduxLen == 0 || @@ -186,20 +209,51 @@ static int checkargsMaxAndArgmax (GpuArray* dstMax, } for(i=0;i<(int)reduxLen;i++){ - /* Insane list entry? */ - if(reduxList[i] >= src->nd){ + /* Insane or duplicate list entry? */ + if(reduxList[i] >= src->nd || + axisInSet(reduxList[i], reduxList, i, 0)){ return 0; } + } + + return 1; +} + +/** + * @brief Select which axes (up to 3) will be assigned to hardware + * dimensions. + * + * @param [out] hwAxisLen + * @param [out] hwAxisList + * @param [in] src + * @param [in] reduxLen + * @param [in] reduxList + */ + +static void selectHwAxes (unsigned* hwAxisLen, + unsigned* hwAxisList, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + unsigned i, j; + + *hwAxisLen = src->nd-reduxLen < 3 ? 
src->nd-reduxLen : 3; + + for(i=0;i<*hwAxisLen;i++){ + size_t maxV = 0; + unsigned maxI = 0; - for(j=i-1;j>=0;j--){ - /* Duplicate list entry? */ - if(reduxList[i] == reduxList[j]){ - return 0; + for(j=0;jnd;j++){ + if(!axisInSet(j, hwAxisList, i, 0) && + !axisInSet(j, reduxList, reduxLen, 0) && + src->dimensions[j] > maxV){ + maxV = src->dimensions[j]; + maxI = j; } } + + hwAxisList[i] = maxI; } - - return 1; } /** @@ -208,15 +262,19 @@ static int checkargsMaxAndArgmax (GpuArray* dstMax, * @param [in] numIdx * @param [in] reduxLen * @param [in] reduxList + * @param [in] hwAxisLen + * @param [in] hwAxisList * @param [in] dstMaxType * @param [in] dstArgmaxType * @return A free()'able string containing source code implementing the * kernel, or else NULL. */ -static char* genkernelMaxAndArgmax (unsigned numIdx, +static char* gensourceMaxAndArgmax (unsigned numIdx, const unsigned reduxLen, const unsigned* reduxList, + const unsigned hwAxisLen, + const unsigned* hwAxisList, const char* dstMaxType, const char* dstArgmaxType){ /* Save the parameters of the reduction in a generator context. 
*/ @@ -224,6 +282,8 @@ static char* genkernelMaxAndArgmax (unsigned numIdx, ctx.numIdx = numIdx; ctx.reduxLen = reduxLen; ctx.reduxList = reduxList; + ctx.hwAxisLen = hwAxisLen; + ctx.hwAxisList = hwAxisList; ctx.numFreeIdx = ctx.numIdx - ctx.reduxLen; ctx.axisList = malloc(numIdx*sizeof(unsigned)); ctx.dstMaxType = dstMaxType; @@ -280,27 +340,32 @@ static void appendPrototype (strb* s, } static void appendOffsets (strb* s, gen_kernel_ctx* ctx){ - strb_appends(s, "/* Add offsets */\n"); - strb_appends(s, "src = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src + srcOff);\n"); - strb_appends(s, "dstMax = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dstMax + dstMaxOff);\n"); - strb_appends(s, "dstArgmax = (GLOBAL_MEM X*) ((GLOBAL_MEM char*) dstArgmax + dstArgmaxOff);\n"); - strb_appends(s, "\n"); - strb_appends(s, "\n"); + strb_appends(s, "\t/* Add offsets */\n"); + strb_appends(s, "\tsrc = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src + srcOff);\n"); + strb_appends(s, "\tdstMax = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dstMax + dstMaxOff);\n"); + strb_appends(s, "\tdstArgmax = (GLOBAL_MEM X*) ((GLOBAL_MEM char*) dstArgmax + dstArgmaxOff);\n"); + strb_appends(s, "\t\n"); + strb_appends(s, "\t\n"); } static void appendIndexDeclarations(strb* s, gen_kernel_ctx* ctx){ + unsigned i; strb_appends(s, "\t/* GPU kernel coordinates. Always 3D. 
*/\n"); - strb_appends(s, "\tX bi0 = GID_0, bi1 = GID_1, bi2 = GID_2;\n"); - strb_appends(s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n"); - strb_appends(s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n"); - strb_appends(s, "\tX gi0 = bi0*bd0+ti0, gi1 = bi1*bd1+ti1, gi2 = bi2*bd2+ti2;\n"); + strb_appends(s, "\tX bi0 = GID_0, bi1 = GID_1, bi2 = GID_2;\n"); + strb_appends(s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n"); + strb_appends(s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n"); + strb_appends(s, "\tX gi0 = bi0*bd0+ti0, gi1 = bi1*bd1+ti1, gi2 = bi2*bd2+ti2;\n"); + strb_appends(s, "\tX "); + for(i=0;ihwAxisLen;i++){ + strb_appendf(s, "ci%u = chunkSize[%u]%s", + i, i, (i==ctx->hwAxisLen-1) ? ";\n" : ", "); + } strb_appends(s, "\t\n"); strb_appends(s, "\t\n"); strb_appends(s, "\t/* Free indices & Reduction indices */\n"); - appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "ChunkSz", ";\n"); appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "", ";\n"); appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "Dim", ";\n"); appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "Start", ";\n"); @@ -335,7 +400,8 @@ static void appendIdxes (strb* s, } static void appendRangeCalculations(strb* s, gen_kernel_ctx* ctx){ - int i; + size_t hwDim; + int i; /* Use internal remapping when computing the ranges for this thread. */ strb_appends(s, "\t/* Compute ranges for this thread. 
*/\n"); @@ -350,10 +416,7 @@ static void appendRangeCalculations(strb* s, strb_appendf(s, "\ti%dMStep = dstMaxSteps[%d];\n", i, i); } for(i=0;inumFreeIdx;i++){/* i*AStep = dstArgmaxSteps[*]; */ - strb_appendf(s, "\ti%dMStep = dstArgmaxSteps[%d];\n", i, i); - } - for(i=0;inumIdx ;i++){/* i*ChunkSz = numBlk[*]; */ - strb_appendf(s, "\ti%dChunkSz = chunkSize[%d];\n", i, ctx->axisList[i]); + strb_appendf(s, "\ti%dAStep = dstArgmaxSteps[%d];\n", i, i); } for(i=ctx->numIdx-1;i>=ctx->numFreeIdx;i--){/* i*PDim = ...; */ /** @@ -362,33 +425,33 @@ static void appendRangeCalculations(strb* s, */ if(i == ctx->numIdx-1){ - strb_appendf(s, "\ti%dPDim = 1;\n", i); + strb_appendf(s, "\ti%dPDim = 1;\n", i); }else{ - strb_appendf(s, "\ti%dPDim = i%dPDim * i%dDim;\n", i, i+1, i); + strb_appendf(s, "\ti%dPDim = i%dPDim * i%dDim;\n", i, i+1, i+1); } } for(i=0;inumIdx ;i++){/* i*Start = ...; */ /** - * The first 3 dimensions get to rely on hardware loops. + * Up to 3 dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. */ - if(i < 3){ - strb_appendf(s, "\ti%dStart = gi%d * i%dChunkSz;\n", i, i, i); + if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->hwAxisLen, &hwDim)){ + strb_appendf(s, "\ti%dStart = gi%d * ci%d;\n", i, hwDim, hwDim); }else{ - strb_appendf(s, "\ti%dStart = 0;\n", i); + strb_appendf(s, "\ti%dStart = 0;\n", i); } } for(i=0;inumIdx ;i++){/* i*End = ...; */ /** - * The first 3 dimensions get to rely on hardware loops. + * Up to 3 dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. 
*/ - if(i < 3){ - strb_appendf(s, "\ti%dEnd = i%dStart + bd%d * i%dChunkSz;\n", i, i, i, i); + if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->hwAxisLen, &hwDim)){ + strb_appendf(s, "\ti%dEnd = i%dStart + ci%d;\n", i, i, hwDim); }else{ - strb_appendf(s, "\ti%dEnd = i%dStart + i%dDim;\n", i, i, i); + strb_appendf(s, "\ti%dEnd = i%dStart + i%dDim;\n", i, i, i); } } @@ -426,11 +489,11 @@ static void appendLoopMacroDefs (strb* s, * SRCINDEXER Macro */ - appendIdxes (s, "#define SRCINDEXER(", "i", 0, ctx->numIdx, "", ") src["); + appendIdxes (s, "#define SRCINDEXER(", "i", 0, ctx->numIdx, "", ") (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)src + "); for(i=0;inumIdx;i++){ strb_appendf(s, "i%d*i%dSStep + \\\n ", i, i); } - strb_appends(s, "0]\n"); + strb_appends(s, "0))\n"); /** * RDXINDEXER Macro @@ -446,21 +509,21 @@ static void appendLoopMacroDefs (strb* s, * DSTMINDEXER Macro */ - appendIdxes (s, "#define DSTMINDEXER(", "i", 0, ctx->numFreeIdx, "", ") dstMax["); + appendIdxes (s, "#define DSTMINDEXER(", "i", 0, ctx->numFreeIdx, "", ") (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)dstMax + "); for(i=0;inumFreeIdx;i++){ strb_appendf(s, "i%d*i%dMStep + \\\n ", i, i); } - strb_appends(s, "0]\n"); + strb_appends(s, "0))\n"); /** * DSTAINDEXER Macro */ - appendIdxes (s, "#define DSTAINDEXER(", "i", 0, ctx->numFreeIdx, "", ") dstArgmax["); + appendIdxes (s, "#define DSTAINDEXER(", "i", 0, ctx->numFreeIdx, "", ") (*(GLOBAL_MEM X*)((GLOBAL_MEM char*)dstArgmax + "); for(i=0;inumFreeIdx;i++){ strb_appendf(s, "i%d*i%dAStep + \\\n ", i, i); } - strb_appends(s, "0]\n"); + strb_appends(s, "0))\n"); } static void appendLoopOuter (strb* s, gen_kernel_ctx* ctx){ @@ -554,38 +617,57 @@ static void appendLoopInner (strb* s, } static void appendLoopMacroUndefs (strb* s, gen_kernel_ctx* ctx){ - strb_appends(s, "\t#undef FOROVER\n"); - strb_appends(s, "\t#undef ESCAPE\n"); - strb_appends(s, "\t#undef SRCINDEXER\n"); - strb_appends(s, "\t#undef RDXINDEXER\n"); - strb_appends(s, "\t#undef DSTMINDEXER\n"); - 
strb_appends(s, "\t#undef DSTAINDEXER\n"); + strb_appends(s, "#undef FOROVER\n"); + strb_appends(s, "#undef ESCAPE\n"); + strb_appends(s, "#undef SRCINDEXER\n"); + strb_appends(s, "#undef RDXINDEXER\n"); + strb_appends(s, "#undef DSTMINDEXER\n"); + strb_appends(s, "#undef DSTAINDEXER\n"); } static void computeAxisList (unsigned* axisList, unsigned numAxis, const unsigned* reduxList, unsigned reduxLen){ - unsigned i, j, f=0, r=numAxis-reduxLen; + unsigned i, f=0; for(i=0;idimensions, GA_BUFFER_READ_ONLY|GA_BUFFER_INIT, &ret); if(ret){goto releaseGpudata;} - chunkSizeGD = gpudata_alloc(ctx, src->nd * sizeof(size_t), + chunkSizeGD = gpudata_alloc(ctx, hwAxisLen * sizeof(size_t), chunkSize, GA_BUFFER_READ_ONLY|GA_BUFFER_INIT, &ret); if(ret){goto releaseGpudata;} @@ -784,25 +869,24 @@ static int invokeMaxAndArgmax (GpuKernel* k, }; void* args[] = { - (void*)&argstr.src, + (void*) argstr.src, (void*)&argstr.srcOff, - (void*)&argstr.srcSteps, - (void*)&argstr.srcSize, - (void*)&argstr.chunkSize, - (void*)&argstr.dstMax, + (void*) argstr.srcSteps, + (void*) argstr.srcSize, + (void*) argstr.chunkSize, + (void*) argstr.dstMax, (void*)&argstr.dstMaxOff, - (void*)&argstr.dstMaxSteps, - (void*)&argstr.dstArgmax, + (void*) argstr.dstMaxSteps, + (void*) argstr.dstArgmax, (void*)&argstr.dstArgmaxOff, - (void*)&argstr.dstArgmaxSteps + (void*) argstr.dstArgmaxSteps }; - /** * Call kernel, release arguments and return error code */ - ret = GpuKernel_call(k, 3, blockSize, gridSize, 0, args); + ret = GpuKernel_call(k, hwAxisLen, blockSize, gridSize, 0, args); releaseGpudata: gpudata_release(srcStepsGD); gpudata_release(srcSizeGD); diff --git a/tests/check_reduction.c b/tests/check_reduction.c index a26a369dac..cd62a9d26a 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -75,39 +75,55 @@ START_TEST(test_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; + const unsigned reduxList[] = {0,2}; - float* pSrc = 
malloc(sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); - float* pMax = malloc(sizeof(*pMax) * dims[1] ); - size_t* pArgmax = malloc(sizeof(*pArgmax) * dims[1] ); + float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); + float* pMax = calloc(1, sizeof(*pMax) * dims[1] ); + size_t* pArgmax = calloc(1, sizeof(*pArgmax) * dims[1] ); ck_assert_ptr_ne(pSrc, NULL); ck_assert_ptr_ne(pMax, NULL); ck_assert_ptr_ne(pArgmax, NULL); + + /** + * Initialize source data. + */ + for(i=0;i gtMax){ gtMax = v; - gtArgmax = i*dims[1]*dims[2] + k; + gtArgmax = i*dims[2] + k; + } + } + } + + ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); + ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); + } +}END_TEST + +START_TEST(test_idxtranspose){ + pcgSeed(1); + + /** + * We test here the same reduction as test_reduction, except with a + * reversed reduxList {2,0} instead of {0,2}. That should lead to a + * transposition of the argmax "coordinates" and thus a change in its + * "flattened" output version. + */ + + size_t i,j,k; + size_t dims[3] = {32,50,79}; + size_t prodDims = dims[0]*dims[1]*dims[2]; + const unsigned reduxList[] = {2,0}; + + float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); + float* pMax = calloc(1, sizeof(*pMax) * dims[1] ); + size_t* pArgmax = calloc(1, sizeof(*pArgmax) * dims[1] ); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pMax, NULL); + ck_assert_ptr_ne(pArgmax, NULL); + + + /** + * Initialize source data. + */ + + for(i=0;i gtMax){ + gtMax = v; + gtArgmax = k*dims[0] + i; } } } @@ -132,6 +227,7 @@ Suite *get_suite(void) { tcase_set_timeout(tc, 8.0); tcase_add_test(tc, test_reduction); + tcase_add_test(tc, test_idxtranspose); suite_add_tcase(s, tc); return s; From e1a2476b83833a4730afd9704bdaeed90896099b Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Wed, 28 Sep 2016 14:47:10 -0400 Subject: [PATCH 032/597] Refactoring + working, but suboptimal, scheduler. 
Idempotent scheduler in gaIFLSchedule() is now lone remaining function left to implement. It must redistribute factors from the chunkSize factor lists to the blockSize and gridSize lists. --- src/gpuarray_reduction.c | 1116 ++++++++++++++++------------------- src/util/integerfactoring.c | 52 +- src/util/integerfactoring.h | 29 + tests/check_reduction.c | 3 + 4 files changed, 605 insertions(+), 595 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 752a83498b..2c36f5934d 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -22,137 +22,99 @@ /* Datatypes */ -struct gen_kernel_ctx{ - unsigned numIdx; +struct maxandargmax_ctx{ + /* Function Arguments. */ + GpuArray* dstMax; + GpuArray* dstArgmax; + const GpuArray* src; unsigned reduxLen; const unsigned* reduxList; - unsigned hwAxisLen; - const unsigned* hwAxisList; - unsigned numFreeIdx; + + /* General. */ + int ret; unsigned* axisList; + gpucontext* gpuCtx; + + /* Source code Generator. */ const char* dstMaxType; const char* dstArgmaxType; + unsigned ndd; + unsigned ndr; + unsigned nds; + unsigned ndh; + strb s; + char* sourceCode; + GpuKernel kernel; + + /* Scheduler */ + unsigned hwAxisList[3]; + size_t blockSize [3]; + size_t gridSize [3]; + size_t chunkSize [3]; + + /* Invoker */ + gpudata* srcStepsGD; + gpudata* srcSizeGD; + gpudata* chunkSizeGD; + gpudata* dstMaxStepsGD; + gpudata* dstArgmaxStepsGD; }; -typedef struct gen_kernel_ctx gen_kernel_ctx; +typedef struct maxandargmax_ctx maxandargmax_ctx; /* Function prototypes */ -static int axisInSet (unsigned v, - const unsigned* set, - size_t setLen, - size_t* where); -static int checkargsMaxAndArgmax (GpuArray* dstMax, - GpuArray* dstArgmax, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -static void selectHwAxes (unsigned* hwAxesLen, - unsigned* hwAxisList, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -static char* gensourceMaxAndArgmax (unsigned numIdx, - 
unsigned reduxLen, - const unsigned* reduxList, - const unsigned hwAxisLen, - const unsigned* hwAxisList, - const char* dstMaxType, - const char* dstArgmaxType); -static void appendKernel (strb* s, - gen_kernel_ctx* ctx); -static void appendTypedefs (strb* s, - gen_kernel_ctx* ctx); -static void appendPrototype (strb* s, - gen_kernel_ctx* ctx); -static void appendOffsets (strb* s, - gen_kernel_ctx* ctx); -static void appendIndexDeclarations(strb* s, - gen_kernel_ctx* ctx); -static void appendIdxes (strb* s, - const char* prologue, - const char* prefix, - int startIdx, - int endIdx, - const char* suffix, - const char* epilogue); -static void appendRangeCalculations(strb* s, - gen_kernel_ctx* ctx); -static void appendLoops (strb* s, - gen_kernel_ctx* ctx); -static void appendLoopMacroDefs (strb* s, - gen_kernel_ctx* ctx); -static void appendLoopOuter (strb* s, - gen_kernel_ctx* ctx); -static void appendLoopInner (strb* s, - gen_kernel_ctx* ctx); -static void appendLoopMacroUndefs (strb* s, - gen_kernel_ctx* ctx); -static void computeAxisList (unsigned* axisList, - unsigned numAxis, - const unsigned* reduxList, - unsigned reduxLen); -static int compileMaxAndArgmax (GpuKernel* kernel, - const char* src, - gpucontext* ctx); -static void scheduleMaxAndArgmax (const GpuKernel* kernel, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList, - size_t* blockSize, - size_t* gridSize, - size_t* chunkSize); -static int invokeMaxAndArgmax (GpuKernel* kernel, - GpuArray* dstMax, - GpuArray* dstArgmax, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList, - unsigned hwAxisLen, - const unsigned* hwAxisList); +static int axisInSet (unsigned v, + const unsigned* set, + size_t setLen, + size_t* where); +static void appendIdxes (strb* s, + const char* prologue, + const char* prefix, + int startIdx, + int endIdx, + const char* suffix, + const char* epilogue); +static int maxandargmaxCheckargs (maxandargmax_ctx* ctx); +static int 
maxandargmaxSelectHwAxes (maxandargmax_ctx* ctx); +static int maxandargmaxGenSource (maxandargmax_ctx* ctx); +static void maxandargmaxAppendKernel (maxandargmax_ctx* ctx); +static void maxandargmaxAppendTypedefs (maxandargmax_ctx* ctx); +static void maxandargmaxAppendPrototype (maxandargmax_ctx* ctx); +static void maxandargmaxAppendOffsets (maxandargmax_ctx* ctx); +static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx); +static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx); +static void maxandargmaxAppendLoops (maxandargmax_ctx* ctx); +static void maxandargmaxAppendLoopMacroDefs (maxandargmax_ctx* ctx); +static void maxandargmaxAppendLoopOuter (maxandargmax_ctx* ctx); +static void maxandargmaxAppendLoopInner (maxandargmax_ctx* ctx); +static void maxandargmaxAppendLoopMacroUndefs (maxandargmax_ctx* ctx); +static void maxandargmaxComputeAxisList (maxandargmax_ctx* ctx); +static int maxandargmaxCompile (maxandargmax_ctx* ctx); +static int maxandargmaxSchedule (maxandargmax_ctx* ctx); +static int maxandargmaxInvoke (maxandargmax_ctx* ctx); +static int maxandargmaxCleanup (maxandargmax_ctx* ctx); /* Function implementation */ -GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dstMax, - GpuArray* dstArgmax, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - int ret = 0; - unsigned hwAxisLen = 0; - unsigned hwAxisList[3] = {0,0,0}; - const char* dstMaxType = NULL; - const char* dstArgmaxType = NULL; - char* s = NULL; - GpuKernel kernel; - - /* Sanity-check arguments */ - if(!checkargsMaxAndArgmax(dstMax, dstArgmax, src, reduxLen, reduxList)){ - return GA_INVALID_ERROR; - } - - /* Select hardware axis mapping */ - selectHwAxes(&hwAxisLen, hwAxisList, src, reduxLen, reduxList); - - /* Generate kernel source code */ - dstMaxType = gpuarray_get_type(src->typecode)->cluda_name; - dstArgmaxType = gpuarray_get_type(GA_SSIZE) ->cluda_name; - s = gensourceMaxAndArgmax(src->nd, - reduxLen, reduxList, - hwAxisLen, 
hwAxisList, - dstMaxType, dstArgmaxType); - if(!s){return GA_MEMORY_ERROR;} - - /* Compile kernel source code */ - ret = compileMaxAndArgmax(&kernel, s, gpudata_context(src->data)); - free(s); - if(ret != GA_NO_ERROR){ - return ret; +GPUARRAY_PUBLIC int GpuArray_maxandargmax (GpuArray* dstMax, + GpuArray* dstArgmax, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + maxandargmax_ctx ctxSTACK = {dstMax, dstArgmax, src, reduxLen, reduxList}, + *ctx = &ctxSTACK; + + if(maxandargmaxCheckargs (ctx) == GA_NO_ERROR && + maxandargmaxSelectHwAxes(ctx) == GA_NO_ERROR && + maxandargmaxGenSource (ctx) == GA_NO_ERROR && + maxandargmaxCompile (ctx) == GA_NO_ERROR && + maxandargmaxSchedule (ctx) == GA_NO_ERROR && + maxandargmaxInvoke (ctx) == GA_NO_ERROR){ + return maxandargmaxCleanup(ctx); + }else{ + return maxandargmaxCleanup(ctx); } - - /* Invoke compiled kernel */ - return invokeMaxAndArgmax(&kernel, dstMax, dstArgmax, src, - reduxLen, reduxList, hwAxisLen, hwAxisList); } /** @@ -165,10 +127,10 @@ GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dstMax, * @return Non-zero if the set is non-empty and v is in it; Zero otherwise. */ -static int axisInSet (unsigned v, - const unsigned* set, - size_t setLen, - size_t* where){ +static int axisInSet (unsigned v, + const unsigned* set, + size_t setLen, + size_t* where){ size_t i; for(i=0;iret = GA_NO_ERROR; + ctx->axisList = NULL; + ctx->gpuCtx = NULL; + + ctx->dstMaxType = ctx->dstArgmaxType = NULL; + ctx->ndh = 0; + ctx->s = (strb)STRB_STATIC_INIT; + ctx->sourceCode = NULL; + + ctx->hwAxisList[0] = ctx->hwAxisList[1] = ctx->hwAxisList[2] = 0; + ctx->blockSize [0] = ctx->blockSize [1] = ctx->blockSize [2] = 1; + ctx->gridSize [0] = ctx->gridSize [1] = ctx->gridSize [2] = 1; + ctx->chunkSize [0] = ctx->chunkSize [1] = ctx->chunkSize [2] = 1; + + ctx->srcStepsGD = ctx->srcSizeGD = ctx->chunkSizeGD = + ctx->dstMaxStepsGD = ctx->dstArgmaxStepsGD = NULL; + /* Insane src or reduxLen? 
*/ - if(!dstMax || !dstArgmax || !src || src->nd == 0 || reduxLen == 0 || - reduxLen >= src->nd){ - return 0; + if(!ctx->dstMax || !ctx->dstArgmax || !ctx->src || ctx->src->nd == 0 || + ctx->reduxLen == 0 || ctx->reduxLen >= ctx->src->nd){ + return ctx->ret=GA_INVALID_ERROR; } - for(i=0;i<(int)reduxLen;i++){ - /* Insane or duplicate list entry? */ - if(reduxList[i] >= src->nd || - axisInSet(reduxList[i], reduxList, i, 0)){ - return 0; + /* Insane or duplicate list entry? */ + for(i=0;ireduxLen;i++){ + if(ctx->reduxList[i] >= ctx->src->nd || + axisInSet(ctx->reduxList[i], ctx->reduxList, i, 0)){ + return ctx->ret=GA_INVALID_ERROR; } } - return 1; + /* Unknown type? */ + ctx->dstMaxType = gpuarray_get_type(ctx->src->typecode)->cluda_name; + ctx->dstArgmaxType = gpuarray_get_type(GA_SSIZE) ->cluda_name; + if(!ctx->dstMaxType || !ctx->dstArgmaxType){ + return ctx->ret=GA_INVALID_ERROR; + } + + /* GPU context non-existent? */ + ctx->gpuCtx = GpuArray_context(ctx->src); + if(!ctx->gpuCtx){ + return ctx->ret=GA_INVALID_ERROR; + } + + + /** + * We initialize some more parts of the context, using the guarantees + * we now have about the sanity of the arguments. + */ + + ctx->nds = ctx->src->nd; + ctx->ndr = ctx->reduxLen; + ctx->ndd = ctx->nds - ctx->ndr; + + return ctx->ret; } /** * @brief Select which axes (up to 3) will be assigned to hardware * dimensions. - * - * @param [out] hwAxisLen - * @param [out] hwAxisList - * @param [in] src - * @param [in] reduxLen - * @param [in] reduxList */ -static void selectHwAxes (unsigned* hwAxisLen, - unsigned* hwAxisList, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - unsigned i, j; - - *hwAxisLen = src->nd-reduxLen < 3 ? 
src->nd-reduxLen : 3; - - for(i=0;i<*hwAxisLen;i++){ - size_t maxV = 0; - unsigned maxI = 0; - - for(j=0;jnd;j++){ - if(!axisInSet(j, hwAxisList, i, 0) && - !axisInSet(j, reduxList, reduxLen, 0) && - src->dimensions[j] > maxV){ - maxV = src->dimensions[j]; +static int maxandargmaxSelectHwAxes (maxandargmax_ctx* ctx){ + unsigned i, j, maxI = 0; + size_t maxV = 0; + + ctx->ndh = ctx->ndd<3 ? ctx->ndd : 3; + + /** + * The ctx->hwAxisLen largest axes are selected and assigned in + * descending order to X, Y, Z. + */ + + for(i=0;indh;i++){ + for(j=0;jnds;j++){ + if(!axisInSet(j, ctx->hwAxisList, i, 0) && + !axisInSet(j, ctx->reduxList, ctx->ndr, 0) && + ctx->src->dimensions[j] > maxV){ + maxV = ctx->src->dimensions[j]; maxI = j; } } - hwAxisList[i] = maxI; + ctx->hwAxisList[i] = maxI; } + + return ctx->ret=GA_NO_ERROR; } /** * @brief Generate the kernel code for MaxAndArgmax. * - * @param [in] numIdx - * @param [in] reduxLen - * @param [in] reduxList - * @param [in] hwAxisLen - * @param [in] hwAxisList - * @param [in] dstMaxType - * @param [in] dstArgmaxType - * @return A free()'able string containing source code implementing the - * kernel, or else NULL. + * @return GA_MEMORY_ERROR if not enough memory left; GA_NO_ERROR otherwise. */ -static char* gensourceMaxAndArgmax (unsigned numIdx, - const unsigned reduxLen, - const unsigned* reduxList, - const unsigned hwAxisLen, - const unsigned* hwAxisList, - const char* dstMaxType, - const char* dstArgmaxType){ - /* Save the parameters of the reduction in a generator context. 
*/ - gen_kernel_ctx ctx; - ctx.numIdx = numIdx; - ctx.reduxLen = reduxLen; - ctx.reduxList = reduxList; - ctx.hwAxisLen = hwAxisLen; - ctx.hwAxisList = hwAxisList; - ctx.numFreeIdx = ctx.numIdx - ctx.reduxLen; - ctx.axisList = malloc(numIdx*sizeof(unsigned)); - ctx.dstMaxType = dstMaxType; - ctx.dstArgmaxType = dstArgmaxType; - if(!ctx.axisList){ - return NULL; - } - +static int maxandargmaxGenSource (maxandargmax_ctx* ctx){ /* Compute internal axis remapping. */ - computeAxisList(ctx.axisList, ctx.numIdx, ctx.reduxList, ctx.reduxLen); + ctx->axisList = malloc(ctx->nds * sizeof(unsigned)); + if(!ctx->axisList){ + return ctx->ret=GA_MEMORY_ERROR; + } + maxandargmaxComputeAxisList(ctx); /* Generate kernel proper. */ - strb s = STRB_STATIC_INIT; - strb_ensure(&s, 5*1024); - appendKernel(&s, &ctx); - free(ctx.axisList); + strb_ensure(&ctx->s, 5*1024); + maxandargmaxAppendKernel(ctx); + free(ctx->axisList); + ctx->axisList = NULL; + ctx->sourceCode = strb_cstr(&ctx->s); + if(!ctx->sourceCode){ + return ctx->ret=GA_MEMORY_ERROR; + } /* Return it. */ - return strb_cstr(&s); + return ctx->ret=GA_NO_ERROR; } -static void appendKernel (strb* s, - gen_kernel_ctx* ctx){ - appendTypedefs (s, ctx); - appendPrototype (s, ctx); - strb_appends (s, "{\n"); - appendOffsets (s, ctx); - appendIndexDeclarations(s, ctx); - appendRangeCalculations(s, ctx); - appendLoops (s, ctx); - strb_appends (s, "}\n"); +static void maxandargmaxAppendKernel (maxandargmax_ctx* ctx){ + maxandargmaxAppendTypedefs (ctx); + maxandargmaxAppendPrototype (ctx); + strb_appends (&ctx->s, "{\n"); + maxandargmaxAppendOffsets (ctx); + maxandargmaxAppendIndexDeclarations(ctx); + maxandargmaxAppendRangeCalculations(ctx); + maxandargmaxAppendLoops (ctx); + strb_appends (&ctx->s, "}\n"); } -static void appendTypedefs (strb* s, - gen_kernel_ctx* ctx){ - strb_appends(s, "/* Typedefs */\n"); - strb_appendf(s, "typedef %s T;/* The type of the array being processed. 
*/\n", ctx->dstMaxType); - strb_appendf(s, "typedef %s X;/* Index type: signed 32/64-bit. */\n", ctx->dstArgmaxType); - strb_appends(s, "\n"); - strb_appends(s, "\n"); - strb_appends(s, "\n"); +static void maxandargmaxAppendTypedefs (maxandargmax_ctx* ctx){ + strb_appends(&ctx->s, "/* Typedefs */\n"); + strb_appendf(&ctx->s, "typedef %s T;/* The type of the array being processed. */\n", ctx->dstMaxType); + strb_appendf(&ctx->s, "typedef %s X;/* Index type: signed 32/64-bit. */\n", ctx->dstArgmaxType); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); } -static void appendPrototype (strb* s, - gen_kernel_ctx* ctx){ - strb_appends(s, "KERNEL void maxandargmax(const T* src,\n"); - strb_appends(s, " const X srcOff,\n"); - strb_appends(s, " const X* srcSteps,\n"); - strb_appends(s, " const X* srcSize,\n"); - strb_appends(s, " const X* chunkSize,\n"); - strb_appends(s, " T* dstMax,\n"); - strb_appends(s, " const X dstMaxOff,\n"); - strb_appends(s, " const X* dstMaxSteps,\n"); - strb_appends(s, " X* dstArgmax,\n"); - strb_appends(s, " const X dstArgmaxOff,\n"); - strb_appends(s, " const X* dstArgmaxSteps)"); +static void maxandargmaxAppendPrototype (maxandargmax_ctx* ctx){ + strb_appends(&ctx->s, "KERNEL void maxandargmax(const T* src,\n"); + strb_appends(&ctx->s, " const X srcOff,\n"); + strb_appends(&ctx->s, " const X* srcSteps,\n"); + strb_appends(&ctx->s, " const X* srcSize,\n"); + strb_appends(&ctx->s, " const X* chunkSize,\n"); + strb_appends(&ctx->s, " T* dstMax,\n"); + strb_appends(&ctx->s, " const X dstMaxOff,\n"); + strb_appends(&ctx->s, " const X* dstMaxSteps,\n"); + strb_appends(&ctx->s, " X* dstArgmax,\n"); + strb_appends(&ctx->s, " const X dstArgmaxOff,\n"); + strb_appends(&ctx->s, " const X* dstArgmaxSteps)"); } -static void appendOffsets (strb* s, - gen_kernel_ctx* ctx){ - strb_appends(s, "\t/* Add offsets */\n"); - strb_appends(s, "\tsrc = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src + srcOff);\n"); - 
strb_appends(s, "\tdstMax = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dstMax + dstMaxOff);\n"); - strb_appends(s, "\tdstArgmax = (GLOBAL_MEM X*) ((GLOBAL_MEM char*) dstArgmax + dstArgmaxOff);\n"); - strb_appends(s, "\t\n"); - strb_appends(s, "\t\n"); +static void maxandargmaxAppendOffsets (maxandargmax_ctx* ctx){ + strb_appends(&ctx->s, "\t/* Add offsets */\n"); + strb_appends(&ctx->s, "\tsrc = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src + srcOff);\n"); + strb_appends(&ctx->s, "\tdstMax = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dstMax + dstMaxOff);\n"); + strb_appends(&ctx->s, "\tdstArgmax = (GLOBAL_MEM X*) ((GLOBAL_MEM char*) dstArgmax + dstArgmaxOff);\n"); + strb_appends(&ctx->s, "\t\n"); + strb_appends(&ctx->s, "\t\n"); } -static void appendIndexDeclarations(strb* s, - gen_kernel_ctx* ctx){ +static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx){ unsigned i; - strb_appends(s, "\t/* GPU kernel coordinates. Always 3D. */\n"); - - strb_appends(s, "\tX bi0 = GID_0, bi1 = GID_1, bi2 = GID_2;\n"); - strb_appends(s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n"); - strb_appends(s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n"); - strb_appends(s, "\tX gi0 = bi0*bd0+ti0, gi1 = bi1*bd1+ti1, gi2 = bi2*bd2+ti2;\n"); - strb_appends(s, "\tX "); - for(i=0;ihwAxisLen;i++){ - strb_appendf(s, "ci%u = chunkSize[%u]%s", - i, i, (i==ctx->hwAxisLen-1) ? ";\n" : ", "); + strb_appends(&ctx->s, "\t/* GPU kernel coordinates. Always 3D. */\n"); + + strb_appends(&ctx->s, "\tX bi0 = GID_0, bi1 = GID_1, bi2 = GID_2;\n"); + strb_appends(&ctx->s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n"); + strb_appends(&ctx->s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n"); + strb_appends(&ctx->s, "\tX gi0 = bi0*bd0+ti0, gi1 = bi1*bd1+ti1, gi2 = bi2*bd2+ti2;\n"); + strb_appends(&ctx->s, "\tX "); + for(i=0;indh;i++){ + strb_appendf(&ctx->s, "ci%u = chunkSize[%u]%s", + i, i, (i==ctx->ndh-1) ? 
";\n" : ", "); } - strb_appends(s, "\t\n"); - strb_appends(s, "\t\n"); - strb_appends(s, "\t/* Free indices & Reduction indices */\n"); - - appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "", ";\n"); - appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "Dim", ";\n"); - appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "Start", ";\n"); - appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "End", ";\n"); - appendIdxes (s, "\tX ", "i", 0, ctx->numIdx, "SStep", ";\n"); - appendIdxes (s, "\tX ", "i", 0, ctx->numFreeIdx, "MStep", ";\n"); - appendIdxes (s, "\tX ", "i", 0, ctx->numFreeIdx, "AStep", ";\n"); - appendIdxes (s, "\tX ", "i", ctx->numFreeIdx, ctx->numIdx, "PDim", ";\n"); - - strb_appends(s, "\t\n"); - strb_appends(s, "\t\n"); + strb_appends(&ctx->s, "\t\n"); + strb_appends(&ctx->s, "\t\n"); + strb_appends(&ctx->s, "\t/* Free indices & Reduction indices */\n"); + + appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "", ";\n"); + appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Dim", ";\n"); + appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Start", ";\n"); + appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "End", ";\n"); + appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "SStep", ";\n"); + appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "MStep", ";\n"); + appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "AStep", ";\n"); + appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim", ";\n"); + + strb_appends(&ctx->s, "\t\n"); + strb_appends(&ctx->s, "\t\n"); } -static void appendIdxes (strb* s, - const char* prologue, - const char* prefix, - int startIdx, - int endIdx, - const char* suffix, - const char* epilogue){ - int i; - - prologue = prologue ? prologue : ""; - prefix = prefix ? prefix : ""; - suffix = suffix ? suffix : ""; - epilogue = epilogue ? epilogue : ""; - - strb_appends(s, prologue); - for(i=startIdx;is, "\t/* Compute ranges for this thread. 
*/\n"); - for(i=0;inumIdx ;i++){/* i*Dim = srcSize[*]; */ - strb_appendf(s, "\ti%dDim = srcSize[%d];\n", i, ctx->axisList[i]); + for(i=0;inds;i++){ + strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, ctx->axisList[i]); } - for(i=0;inumIdx ;i++){/* i*SStep = srcSteps[*]; */ - strb_appendf(s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->axisList[i]); + for(i=0;inds;i++){ + strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->axisList[i]); } - for(i=0;inumFreeIdx;i++){/* i*MStep = dstMaxSteps[*]; */ - strb_appendf(s, "\ti%dMStep = dstMaxSteps[%d];\n", i, i); + for(i=0;indd;i++){ + strb_appendf(&ctx->s, "\ti%dMStep = dstMaxSteps[%d];\n", i, i); } - for(i=0;inumFreeIdx;i++){/* i*AStep = dstArgmaxSteps[*]; */ - strb_appendf(s, "\ti%dAStep = dstArgmaxSteps[%d];\n", i, i); + for(i=0;indd;i++){ + strb_appendf(&ctx->s, "\ti%dAStep = dstArgmaxSteps[%d];\n", i, i); } - for(i=ctx->numIdx-1;i>=ctx->numFreeIdx;i--){/* i*PDim = ...; */ + for(i=ctx->nds-1;i>=ctx->ndd;i--){ /** * If this is the last index, it's the first cumulative dimension * product we generate, and thus we initialize to 1. */ - if(i == ctx->numIdx-1){ - strb_appendf(s, "\ti%dPDim = 1;\n", i); + if(i == ctx->nds-1){ + strb_appendf(&ctx->s, "\ti%dPDim = 1;\n", i); }else{ - strb_appendf(s, "\ti%dPDim = i%dPDim * i%dDim;\n", i, i+1, i+1); + strb_appendf(&ctx->s, "\ti%dPDim = i%dPDim * i%dDim;\n", i, i+1, i+1); } } - for(i=0;inumIdx ;i++){/* i*Start = ...; */ + for(i=0;inds;i++){ /** * Up to 3 dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. 
*/ - if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->hwAxisLen, &hwDim)){ - strb_appendf(s, "\ti%dStart = gi%d * ci%d;\n", i, hwDim, hwDim); + if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){ + strb_appendf(&ctx->s, "\ti%dStart = gi%d * ci%d;\n", i, hwDim, hwDim); }else{ - strb_appendf(s, "\ti%dStart = 0;\n", i); + strb_appendf(&ctx->s, "\ti%dStart = 0;\n", i); } } - for(i=0;inumIdx ;i++){/* i*End = ...; */ + for(i=0;inds;i++){ /** * Up to 3 dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. */ - if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->hwAxisLen, &hwDim)){ - strb_appendf(s, "\ti%dEnd = i%dStart + ci%d;\n", i, i, hwDim); + if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){ + strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + ci%d;\n", i, i, hwDim); }else{ - strb_appendf(s, "\ti%dEnd = i%dStart + i%dDim;\n", i, i, i); + strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + i%dDim;\n", i, i, i); } } - strb_appends(s, "\t\n"); - strb_appends(s, "\t\n"); + strb_appends(&ctx->s, "\t\n"); + strb_appends(&ctx->s, "\t\n"); } -static void appendLoops (strb* s, - gen_kernel_ctx* ctx){ - strb_appends(s, "\t/**\n"); - strb_appends(s, "\t * FREE LOOPS.\n"); - strb_appends(s, "\t */\n"); - strb_appends(s, "\t\n"); - - appendLoopMacroDefs (s, ctx); - appendLoopOuter (s, ctx); - appendLoopMacroUndefs(s, ctx); +static void maxandargmaxAppendLoops (maxandargmax_ctx* ctx){ + strb_appends(&ctx->s, "\t/**\n"); + strb_appends(&ctx->s, "\t * FREE LOOPS.\n"); + strb_appends(&ctx->s, "\t */\n"); + strb_appends(&ctx->s, "\t\n"); + + maxandargmaxAppendLoopMacroDefs (ctx); + maxandargmaxAppendLoopOuter (ctx); + maxandargmaxAppendLoopMacroUndefs(ctx); } -static void appendLoopMacroDefs (strb* s, - gen_kernel_ctx* ctx){ +static void maxandargmaxAppendLoopMacroDefs (maxandargmax_ctx* ctx){ int i; /** * FOROVER Macro */ - strb_appends(s, "#define FOROVER(idx) for(i##idx = i##idx##Start; i##idx < 
i##idx##End; i##idx++)\n"); + strb_appends(&ctx->s, "#define FOROVER(idx) for(i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n"); /** * ESCAPE Macro */ - strb_appends(s, "#define ESCAPE(idx) if(i##idx >= i##idx##Dim){continue;}\n"); + strb_appends(&ctx->s, "#define ESCAPE(idx) if(i##idx >= i##idx##Dim){continue;}\n"); /** * SRCINDEXER Macro */ - appendIdxes (s, "#define SRCINDEXER(", "i", 0, ctx->numIdx, "", ") (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)src + "); - for(i=0;inumIdx;i++){ - strb_appendf(s, "i%d*i%dSStep + \\\n ", i, i); + appendIdxes (&ctx->s, "#define SRCINDEXER(", "i", 0, ctx->nds, "", ") (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)src + "); + for(i=0;inds;i++){ + strb_appendf(&ctx->s, "i%d*i%dSStep + \\\n ", i, i); } - strb_appends(s, "0))\n"); + strb_appends(&ctx->s, "0))\n"); /** * RDXINDEXER Macro */ - appendIdxes (s, "#define RDXINDEXER(", "i", ctx->numFreeIdx, ctx->numIdx, "", ") ("); - for(i=ctx->numFreeIdx;inumIdx;i++){ - strb_appendf(s, "i%d*i%dPDim + \\\n ", i, i); + appendIdxes (&ctx->s, "#define RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ") ("); + for(i=ctx->ndd;inds;i++){ + strb_appendf(&ctx->s, "i%d*i%dPDim + \\\n ", i, i); } - strb_appends(s, "0)\n"); + strb_appends(&ctx->s, "0)\n"); /** * DSTMINDEXER Macro */ - appendIdxes (s, "#define DSTMINDEXER(", "i", 0, ctx->numFreeIdx, "", ") (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)dstMax + "); - for(i=0;inumFreeIdx;i++){ - strb_appendf(s, "i%d*i%dMStep + \\\n ", i, i); + appendIdxes (&ctx->s, "#define DSTMINDEXER(", "i", 0, ctx->ndd, "", ") (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)dstMax + "); + for(i=0;indd;i++){ + strb_appendf(&ctx->s, "i%d*i%dMStep + \\\n ", i, i); } - strb_appends(s, "0))\n"); + strb_appends(&ctx->s, "0))\n"); /** * DSTAINDEXER Macro */ - appendIdxes (s, "#define DSTAINDEXER(", "i", 0, ctx->numFreeIdx, "", ") (*(GLOBAL_MEM X*)((GLOBAL_MEM char*)dstArgmax + "); - for(i=0;inumFreeIdx;i++){ - strb_appendf(s, "i%d*i%dAStep + \\\n ", i, i); + appendIdxes (&ctx->s, "#define DSTAINDEXER(", 
"i", 0, ctx->ndd, "", ") (*(GLOBAL_MEM X*)((GLOBAL_MEM char*)dstArgmax + "); + for(i=0;indd;i++){ + strb_appendf(&ctx->s, "i%d*i%dAStep + \\\n ", i, i); } - strb_appends(s, "0))\n"); + strb_appends(&ctx->s, "0))\n"); } -static void appendLoopOuter (strb* s, - gen_kernel_ctx* ctx){ +static void maxandargmaxAppendLoopOuter (maxandargmax_ctx* ctx){ int i; /** * Outer Loop Header Generation */ - for(i=0;inumFreeIdx;i++){ - strb_appendf(s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); + for(i=0;indd;i++){ + strb_appendf(&ctx->s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); } /** * Inner Loop Generation */ - appendLoopInner(s, ctx); + maxandargmaxAppendLoopInner(ctx); /** * Outer Loop Trailer Generation */ - for(i=0;inumFreeIdx;i++){ - strb_appends(s, "\t}\n"); + for(i=0;indd;i++){ + strb_appends(&ctx->s, "\t}\n"); } } -static void appendLoopInner (strb* s, - gen_kernel_ctx* ctx){ +static void maxandargmaxAppendLoopInner (maxandargmax_ctx* ctx){ int i; /** * Inner Loop Prologue */ - strb_appends(s, "\t/**\n"); - strb_appends(s, "\t * Reduction initialization.\n"); - strb_appends(s, "\t */\n"); - strb_appends(s, "\t\n"); + strb_appends(&ctx->s, "\t/**\n"); + strb_appends(&ctx->s, "\t * Reduction initialization.\n"); + strb_appends(&ctx->s, "\t */\n"); + strb_appends(&ctx->s, "\t\n"); - appendIdxes (s, "\tT maxV = SRCINDEXER(", "i", 0, ctx->numFreeIdx, "", ""); - if(ctx->numFreeIdx && ctx->reduxLen){strb_appends(s, ",");} - appendIdxes (s, "", "i", ctx->numFreeIdx, ctx->numIdx, "Start", ");\n"); + appendIdxes (&ctx->s, "\tT maxV = SRCINDEXER(", "i", 0, ctx->ndd, "", ""); + if(ctx->ndd && ctx->ndr){strb_appends(&ctx->s, ",");} + appendIdxes (&ctx->s, "", "i", ctx->ndd, ctx->nds, "Start", ");\n"); - appendIdxes (s, "\tX maxI = RDXINDEXER(", "i", ctx->numFreeIdx, ctx->numIdx, "Start", ");\n"); + appendIdxes (&ctx->s, "\tX maxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "Start", ");\n"); - strb_appends(s, "\t\n"); - strb_appends(s, "\t/**\n"); - strb_appends(s, "\t * REDUCTION LOOPS.\n"); - 
strb_appends(s, "\t */\n"); - strb_appends(s, "\t\n"); + strb_appends(&ctx->s, "\t\n"); + strb_appends(&ctx->s, "\t/**\n"); + strb_appends(&ctx->s, "\t * REDUCTION LOOPS.\n"); + strb_appends(&ctx->s, "\t */\n"); + strb_appends(&ctx->s, "\t\n"); /** * Inner Loop Header Generation */ - for(i=ctx->numFreeIdx;inumIdx;i++){ - strb_appendf(s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); + for(i=ctx->ndd;inds;i++){ + strb_appendf(&ctx->s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); } /** * Inner Loop Body Generation */ - appendIdxes (s, "\tT V = SRCINDEXER(", "i", 0, ctx->numIdx, "", ");\n"); - strb_appends(s, "\t\n"); - strb_appends(s, "\tif(V > maxV){\n"); - strb_appends(s, "\t\tmaxV = V;\n"); - appendIdxes (s, "\t\tmaxI = RDXINDEXER(", "i", ctx->numFreeIdx, ctx->numIdx, "", ");\n"); - strb_appends(s, "\t}\n"); + appendIdxes (&ctx->s, "\tT V = SRCINDEXER(", "i", 0, ctx->nds, "", ");\n"); + strb_appends(&ctx->s, "\t\n"); + strb_appends(&ctx->s, "\tif(V > maxV){\n"); + strb_appends(&ctx->s, "\t\tmaxV = V;\n"); + appendIdxes (&ctx->s, "\t\tmaxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); + strb_appends(&ctx->s, "\t}\n"); /** * Inner Loop Trailer Generation */ - for(i=ctx->numFreeIdx;inumIdx;i++){ - strb_appends(s, "\t}\n"); + for(i=ctx->ndd;inds;i++){ + strb_appends(&ctx->s, "\t}\n"); } - strb_appends(s, "\t\n"); + strb_appends(&ctx->s, "\t\n"); /** * Inner Loop Epilogue Generation */ - strb_appends(s, "\t/**\n"); - strb_appends(s, "\t * Destination writeback.\n"); - strb_appends(s, "\t */\n"); - strb_appends(s, "\t\n"); - appendIdxes (s, "\tDSTMINDEXER(", "i", 0, ctx->numFreeIdx, "", ") = maxV;\n"); - appendIdxes (s, "\tDSTAINDEXER(", "i", 0, ctx->numFreeIdx, "", ") = maxI;\n"); + strb_appends(&ctx->s, "\t/**\n"); + strb_appends(&ctx->s, "\t * Destination writeback.\n"); + strb_appends(&ctx->s, "\t */\n"); + strb_appends(&ctx->s, "\t\n"); + appendIdxes (&ctx->s, "\tDSTMINDEXER(", "i", 0, ctx->ndd, "", ") = maxV;\n"); + appendIdxes (&ctx->s, "\tDSTAINDEXER(", "i", 0, ctx->ndd, 
"", ") = maxI;\n"); } -static void appendLoopMacroUndefs (strb* s, - gen_kernel_ctx* ctx){ - strb_appends(s, "#undef FOROVER\n"); - strb_appends(s, "#undef ESCAPE\n"); - strb_appends(s, "#undef SRCINDEXER\n"); - strb_appends(s, "#undef RDXINDEXER\n"); - strb_appends(s, "#undef DSTMINDEXER\n"); - strb_appends(s, "#undef DSTAINDEXER\n"); +static void maxandargmaxAppendLoopMacroUndefs (maxandargmax_ctx* ctx){ + strb_appends(&ctx->s, "#undef FOROVER\n"); + strb_appends(&ctx->s, "#undef ESCAPE\n"); + strb_appends(&ctx->s, "#undef SRCINDEXER\n"); + strb_appends(&ctx->s, "#undef RDXINDEXER\n"); + strb_appends(&ctx->s, "#undef DSTMINDEXER\n"); + strb_appends(&ctx->s, "#undef DSTAINDEXER\n"); } -static void computeAxisList (unsigned* axisList, - unsigned numAxis, - const unsigned* reduxList, - unsigned reduxLen){ +static void maxandargmaxComputeAxisList (maxandargmax_ctx* ctx){ unsigned i, f=0; - for(i=0;inds;i++){ + if(axisInSet(i, ctx->reduxList, ctx->ndr, 0)){ continue; } - axisList[f++] = i; + ctx->axisList[f++] = i; } - memcpy(&axisList[f], reduxList, reduxLen*sizeof(*reduxList)); + memcpy(&ctx->axisList[f], ctx->reduxList, ctx->ndr * sizeof(*ctx->reduxList)); } /** * @brief Compile the kernel from source code. 
* - * @param [out] kernel - * @param [in] src - * @param [in] ctx * @return */ -static int compileMaxAndArgmax (GpuKernel* kernel, - const char* src, - gpucontext* ctx){ - const int ARG_TYPECODE[11] = { +static int maxandargmaxCompile (maxandargmax_ctx* ctx){ + const int ARG_TYPECODES[] = { GA_BUFFER, /* src */ GA_SIZE, /* srcOff */ GA_BUFFER, /* srcSteps */ @@ -664,235 +634,195 @@ static int compileMaxAndArgmax (GpuKernel* kernel, GA_SIZE, /* dstArgmaxOff */ GA_BUFFER /* dstArgmaxSteps */ }; - - const size_t l = strlen(src); - return GpuKernel_init(kernel, ctx, 1, &src, &l, "maxandargmax", - 11, ARG_TYPECODE, GA_USE_CLUDA, (char**)0); + const size_t ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES); + const char* SRCS[] = {ctx->sourceCode}; + const size_t SRC_LENS[] = {strlen(ctx->sourceCode)}; + const size_t SRCS_LEN = sizeof(SRCS)/sizeof(*SRCS); + + ctx->ret = GpuKernel_init(&ctx->kernel, + ctx->gpuCtx, + SRCS_LEN, + SRCS, + SRC_LENS, + "maxandargmax", + ARG_TYPECODES_LEN, + ARG_TYPECODES, + GA_USE_CLUDA, + (char**)0); + free(ctx->sourceCode); + ctx->sourceCode = NULL; + + return ctx->ret; } /** * Compute a good thread block size / grid size / software chunk size for Nvidia. */ -static void scheduleMaxAndArgmax (const GpuKernel* kernel, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList, - size_t* blockSize, - size_t* gridSize, - size_t* chunkSize){ - int i, j; - - /* Obtain the constraints of our problem. */ +static int maxandargmaxSchedule (maxandargmax_ctx* ctx){ + int i; + + /** + * Obtain the constraints of our problem. 
+ */ + size_t warpSize, maxL, maxL0, maxL1, maxL2, /* Maximum total and per-dimension thread/block sizes */ maxG, maxG0, maxG1, maxG2; /* Maximum total and per-dimension block /grid sizes */ - gpukernel_property(kernel->k, GA_KERNEL_PROP_PREFLSIZE, &warpSize); - gpukernel_property(kernel->k, GA_KERNEL_PROP_MAXLSIZE, &maxL); - gpudata_property (src->data, GA_CTX_PROP_MAXLSIZE0, &maxL0); - gpudata_property (src->data, GA_CTX_PROP_MAXLSIZE1, &maxL1); - gpudata_property (src->data, GA_CTX_PROP_MAXLSIZE2, &maxL2); - gpudata_property (src->data, GA_CTX_PROP_MAXGSIZE, &maxG); - gpudata_property (src->data, GA_CTX_PROP_MAXGSIZE0, &maxG0); - gpudata_property (src->data, GA_CTX_PROP_MAXGSIZE1, &maxG1); - gpudata_property (src->data, GA_CTX_PROP_MAXGSIZE2, &maxG2); - - int numRdxIdx = reduxLen; - int numFreeIdx = src->nd - numRdxIdx; - (void)numFreeIdx; + gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_PREFLSIZE, &warpSize); + gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); + gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE0, &maxL0); + gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE1, &maxL1); + gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE2, &maxL2); + gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE, &maxG); + gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE0, &maxG0); + gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE1, &maxG1); + gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE2, &maxG2); /** - * Select which reduction dimensions will be associated with which hardware - * x, y and z dimensions. + * Prepare inputs to the solver. + * + * This involves, amongst others, + * - Initializing the blockSize, gridSize and chunkSize factor lists for all + * hardware dimensions. + * - Finding on which hardware axis is it optimal to place the warpSize factor. 
*/ - int dims [3]; - uint64_t dimSize [3] = { 1, 1, 1}; - double slack [3] = {1.1, 1.1, 1.1}; - uint64_t kSmooth [3]; - ga_factor_list factDims[3]; - ga_factor_list factTBS [3]; - uint64_t tBS = 1; - uint64_t minThrd = 64; - uint64_t maxThrd = 256; - (void)dims; - - /************************************************************************ - * FIXME: Need logic to select up to 3 dimensions and plug them in dimSize! - * But what's the best dimension selection strategy to maximize - * memory bandwidth? - * Also need to fill out kSmooth[] based on all the GPU properties. - ************************************************************************/ - kSmooth[0] = maxL0; - kSmooth[1] = maxL1; - kSmooth[2] = maxL2; + unsigned bestWarpAxis = 0; + size_t bestWarpMod = 1; + uint64_t maxLg = maxL; + uint64_t maxLs[3] = {maxL0, maxL1, maxL2}; + uint64_t maxGg = maxG; + uint64_t maxGs[3] = {maxG0, maxG1, maxG2}; + uint64_t dims [3] = {1, 1, 1 }; + double slack[3] = {1.1, 1.1, 1.1 }; + ga_factor_list factBS[3]; + ga_factor_list factGS[3]; + ga_factor_list factCS[3]; + + for(i=0;indh;i++){ + dims[i] = ctx->src->dimensions[ctx->hwAxisList[i]]; + gaIFLInit(&factBS[i]); + gaIFLInit(&factGS[i]); + gaIFLInit(&factCS[i]); + + size_t warpMod = dims[i]%warpSize; + if(bestWarpMod>0 && (warpMod==0 || warpMod>bestWarpMod)){ + bestWarpAxis = i; + bestWarpMod = warpMod; + } + } + + dims[bestWarpAxis] = (dims[bestWarpAxis] + warpSize - 1)/warpSize; + gaIFactorize(warpSize, 0, 0, &factBS[bestWarpAxis]); /** * Factorization job. We'll steadily increase the slack in case of failure - * in order to ensure we do get a factorization. + * in order to ensure we do get a factorization, which we place into + * chunkSize. */ - for(i=0;indh;i++){ + while(!gaIFactorize(dims[i], dims[i]*slack[i], maxLs[i], &factCS[i])){ /** - * Error! Failed to factorize dimension "xyz"[i] with given slack - * and k-smoothness constraints! Increase slack. Once slack reaches + * Error! 
Failed to factorize dimension i with given slack and + * k-smoothness constraints! Increase slack. Once slack reaches * 2.0 it will factorize guaranteed. */ + gaIFLInit(&factCS[i]); slack[i] += 0.1; } } /** - * Use the factorization. We "withdraw" factors from the factor lists one - * at a time until we enter our target zone thread#. If the individual - * maxLn in dimension n is about to be breached, we move on to the next - * dimension. - * - * The same process is then repeated with respect to grid size. + * Invoke the scheduler. * - * What's left after that is software blocking. + * The scheduler will move some factors from chunkSize into blockSize and + * gridSize, improving performance. */ - gaIFLInit(&factTBS[0]); - gaIFLInit(&factTBS[1]); - gaIFLInit(&factTBS[2]); - for(i=0;i 0){ - factDims[i].p[j]--; - gaIFLAddFactors(&factTBS[i], factDims[i].f[j], 1); - tBS *= factDims[i].f[j]; - - if(tBS >= minThrd && tBS <= maxThrd){ - goto computeBS; - } - } - } + gaIFLSchedule(ctx->ndh, maxLg, maxLs, maxGg, maxGs, factBS, factGS, factCS); + + /* Output. */ + for(i=0;indh;i++){ + ctx->blockSize[i] = gaIFLGetProduct(&factBS[i]); + ctx->gridSize [i] = gaIFLGetProduct(&factGS[i]); + ctx->chunkSize[i] = gaIFLGetProduct(&factCS[i]); } - computeBS: - blockSize[0] = gaIFLGetProduct(&factTBS[0]); - blockSize[1] = gaIFLGetProduct(&factTBS[1]); - blockSize[2] = gaIFLGetProduct(&factTBS[2]); - gridSize [0] = gaIFLGetProduct(&factDims[0]) / blockSize[0]; - gridSize [1] = gaIFLGetProduct(&factDims[1]) / blockSize[1]; - gridSize [2] = gaIFLGetProduct(&factDims[2]) / blockSize[2]; + /* Return. */ + return ctx->ret=GA_NO_ERROR; } /** * Invoke the kernel. 
*/ -static int invokeMaxAndArgmax (GpuKernel* k, - GpuArray* dstMax, - GpuArray* dstArgmax, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList, - unsigned hwAxisLen, - const unsigned* hwAxisList){ - int ret = 0; - size_t blockSize[3] = {1,1,1}; - size_t gridSize [3] = {1,1,1}; - size_t chunkSize[3] = {50,1,1}; - gpudata* srcStepsGD = 0, *srcSizeGD = 0, *chunkSizeGD = 0, - *dstMaxStepsGD = 0, *dstArgmaxStepsGD = 0; - gpucontext* ctx = GpuArray_context(src); - - - /** - * Schedule the kernel. - * - * This implies choosing the block, grid and chunk size appropriately. - */ - /* - scheduleMaxAndArgmax(k, src, reduxLen, reduxList, - blockSize, gridSize, chunkSize); - */ - (void)scheduleMaxAndArgmax; - +static int maxandargmaxInvoke (maxandargmax_ctx* ctx){ /** * Argument Marshalling. This the grossest gross thing in here. */ - srcStepsGD = gpudata_alloc(ctx, src->nd * sizeof(size_t), - src->strides, - GA_BUFFER_READ_ONLY|GA_BUFFER_INIT, &ret); - if(ret){goto releaseGpudata;} - srcSizeGD = gpudata_alloc(ctx, src->nd * sizeof(size_t), - src->dimensions, - GA_BUFFER_READ_ONLY|GA_BUFFER_INIT, &ret); - if(ret){goto releaseGpudata;} - chunkSizeGD = gpudata_alloc(ctx, hwAxisLen * sizeof(size_t), - chunkSize, - GA_BUFFER_READ_ONLY|GA_BUFFER_INIT, &ret); - if(ret){goto releaseGpudata;} - dstMaxStepsGD = gpudata_alloc(ctx, (src->nd - reduxLen) * sizeof(size_t), - dstMax->strides, - GA_BUFFER_READ_ONLY|GA_BUFFER_INIT, &ret); - if(ret){goto releaseGpudata;} - dstArgmaxStepsGD = gpudata_alloc(ctx, (src->nd - reduxLen) * sizeof(size_t), - dstArgmax->strides, - GA_BUFFER_READ_ONLY|GA_BUFFER_INIT, &ret); - if(ret){goto releaseGpudata;} - - - struct MaxAndArgmaxArgs{ - gpudata* src; - size_t srcOff; - gpudata* srcSteps; - gpudata* srcSize; - gpudata* chunkSize; - gpudata* dstMax; - size_t dstMaxOff; - gpudata* dstMaxSteps; - gpudata* dstArgmax; - size_t dstArgmaxOff; - gpudata* dstArgmaxSteps; - } argstr = { - src->data, - src->offset, - srcStepsGD, - srcSizeGD, - 
chunkSizeGD, - dstMax->data, - dstMax->offset, - dstMaxStepsGD, - dstArgmax->data, - dstArgmax->offset, - dstArgmaxStepsGD - }; - + const int flags = GA_BUFFER_READ_ONLY|GA_BUFFER_INIT; + ctx->srcStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), + ctx->src->strides, flags, 0); + ctx->srcSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), + ctx->src->dimensions, flags, 0); + ctx->chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->ndh * sizeof(size_t), + ctx->chunkSize, flags, 0); + ctx->dstMaxStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), + ctx->dstMax->strides, flags, 0); + ctx->dstArgmaxStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), + ctx->dstArgmax->strides, flags, 0); void* args[] = { - (void*) argstr.src, - (void*)&argstr.srcOff, - (void*) argstr.srcSteps, - (void*) argstr.srcSize, - (void*) argstr.chunkSize, - (void*) argstr.dstMax, - (void*)&argstr.dstMaxOff, - (void*) argstr.dstMaxSteps, - (void*) argstr.dstArgmax, - (void*)&argstr.dstArgmaxOff, - (void*) argstr.dstArgmaxSteps + (void*) ctx->src->data, + (void*)&ctx->src->offset, + (void*) ctx->srcStepsGD, + (void*) ctx->srcSizeGD, + (void*) ctx->chunkSizeGD, + (void*) ctx->dstMax->data, + (void*)&ctx->dstMax->offset, + (void*) ctx->dstMaxStepsGD, + (void*) ctx->dstArgmax->data, + (void*)&ctx->dstArgmax->offset, + (void*) ctx->dstArgmaxStepsGD }; - /** - * Call kernel, release arguments and return error code - */ + if(ctx->srcStepsGD && + ctx->srcSizeGD && + ctx->chunkSizeGD && + ctx->dstMaxStepsGD && + ctx->dstArgmaxStepsGD){ + ctx->ret = GpuKernel_call(&ctx->kernel, + ctx->ndh, + ctx->blockSize, + ctx->gridSize, + 0, + args); + }else{ + ctx->ret = GA_MEMORY_ERROR; + } + + gpudata_release(ctx->srcStepsGD); + gpudata_release(ctx->srcSizeGD); + gpudata_release(ctx->chunkSizeGD); + gpudata_release(ctx->dstMaxStepsGD); + gpudata_release(ctx->dstArgmaxStepsGD); + + return ctx->ret; +} + +/** + * Cleanup + */ + +static int maxandargmaxCleanup 
(maxandargmax_ctx* ctx){ + free(ctx->axisList); + free(ctx->sourceCode); + ctx->axisList = NULL; + ctx->sourceCode = NULL; - ret = GpuKernel_call(k, hwAxisLen, blockSize, gridSize, 0, args); - releaseGpudata: - gpudata_release(srcStepsGD); - gpudata_release(srcSizeGD); - gpudata_release(chunkSizeGD); - gpudata_release(dstMaxStepsGD); - gpudata_release(dstArgmaxStepsGD); - return ret; + return ctx->ret; } diff --git a/src/util/integerfactoring.c b/src/util/integerfactoring.c index 88f344f084..4ac0fca73e 100644 --- a/src/util/integerfactoring.c +++ b/src/util/integerfactoring.c @@ -560,7 +560,7 @@ uint64_t gaIFLGetGreatestFactor(const ga_factor_list* fl){ int i; for(i=0;i<15;i++){ - if(f < fl->f[i]){ + if(fl->p[i] > 0 && f < fl->f[i]){ f = fl->f[i]; } } @@ -608,4 +608,52 @@ int gaIFLsnprintf(char* str, size_t size, const ga_factor_list* fl){ return total; } - +void gaIFLSchedule(const unsigned n, + const uint64_t maxBtot, + const uint64_t* maxBind, + const uint64_t maxGtot, + const uint64_t* maxGind, + ga_factor_list* factBS, + ga_factor_list* factGS, + ga_factor_list* factCS){ +#if 0 + printf("BS: (%6llu, %6llu, %6llu)\n", + (unsigned long long)gaIFLGetProduct(&factBS[0]), + (unsigned long long)gaIFLGetProduct(&factBS[1]), + (unsigned long long)gaIFLGetProduct(&factBS[2])); + printf("GS: (%6llu, %6llu, %6llu)\n", + (unsigned long long)gaIFLGetProduct(&factGS[0]), + (unsigned long long)gaIFLGetProduct(&factGS[1]), + (unsigned long long)gaIFLGetProduct(&factGS[2])); + printf("CS: (%6llu, %6llu, %6llu)\n", + (unsigned long long)gaIFLGetProduct(&factCS[0]), + (unsigned long long)gaIFLGetProduct(&factCS[1]), + (unsigned long long)gaIFLGetProduct(&factCS[2])); + printf("\n"); +#endif + + /** + * First, we move factors from factBS[i] and factGS[i] to factCS[i], in + * order of largest to smallest, until their product is at or below + * maxBind[i] and maxGind[i] respectively. 
+ */ + + /** + * Then we move out more factors from factBS[i] and factGS[i], in order of + * smallest to largest, until their common product is at or below maxBtot + * and maxGtot respectively. + */ + + /** + * At this point, the scheduling is guaranteed to be valid, but may be + * nowhere close to optimal. + * + * So we start moving in factors from factCS[i] to factBS[i], in order of + * largest to smallest, while remaining below maxBtot and maxBind[i]. + */ + + /** + * Lastly, we move in factors from factCS[i] to factBG[i], in order of + * largest to smallest, while remaining below maxGtot and maxGind[i]. + */ +} diff --git a/src/util/integerfactoring.h b/src/util/integerfactoring.h index b8443feb5a..b78b17ec51 100644 --- a/src/util/integerfactoring.h +++ b/src/util/integerfactoring.h @@ -186,6 +186,35 @@ uint64_t gaIFLGetGreatestFactor(const ga_factor_list* fl); int gaIFLsnprintf(char* str, size_t size, const ga_factor_list* fl); +/** + * @brief Schedule block size, grid size and what's left over that fits in + * neither, which will be called "chunk" size, subject to constraints. + * + * @param [in] n Number of dimensions of the problem. The arrays + * maxBind, maxGind, factBS, factGS, factCS must have + * n elements. + * @param [in] maxBtot The product of the block sizes in all n dimensions + * will not exceed this value. + * @param [in] maxBind The block size in dimensions i=0..n-1 will not + * exceed maxBind[i]. + * @param [in] maxGtot The product of the grid sizes in all n dimensions + * will not exceed this value. + * @param [in] maxGind The grid size in dimensions i=0..n-1 will not + * exceed maxGind[i]. + * @param [in,out] factBS The block size for dimensions 0..n-1, as a factor list. + * @param [in,out] factGS The grid size for dimensions 0..n-1, as a factor list. + * @param [in,out] factCS The chunk size for dimensions 0..n-1, as a factor list. 
+ */ + +void gaIFLSchedule(const unsigned n, + const uint64_t maxBtot, + const uint64_t* maxBind, + const uint64_t maxGtot, + const uint64_t* maxGind, + ga_factor_list* factBS, + ga_factor_list* factGS, + ga_factor_list* factCS); + /* End C++ Extern "C" Guard */ #ifdef __cplusplus diff --git a/tests/check_reduction.c b/tests/check_reduction.c index cd62a9d26a..6fba722db1 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -63,6 +63,9 @@ static double pcgRand01(void){ } +/** + * Test cases. + */ START_TEST(test_reduction){ pcgSeed(1); From e1c78f5d720d5fea2f0c2bf601d9ca6bd315abd2 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Thu, 29 Sep 2016 23:30:06 -0400 Subject: [PATCH 033/597] Optimizing Scheduler + Testcases Complete - Added optimizing scheduler & testcase. - Added very-high-rank testcase. - Random bugfixes for bugs exposed by tests. - Added a few member in struct ga_factor_list - New APIs for integer factorization --- src/gpuarray_reduction.c | 9 +- src/util/integerfactoring.c | 318 +++++++++++++++++++++++----- src/util/integerfactoring.h | 28 ++- tests/check_reduction.c | 150 +++++++++++-- tests/check_util_integerfactoring.c | 222 ++++++++++++++++++- 5 files changed, 654 insertions(+), 73 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 2c36f5934d..72438b5668 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -261,7 +261,7 @@ static int maxandargmaxCheckargs (maxandargmax_ctx* ctx){ static int maxandargmaxSelectHwAxes (maxandargmax_ctx* ctx){ unsigned i, j, maxI = 0; - size_t maxV = 0; + size_t maxV; ctx->ndh = ctx->ndd<3 ? 
ctx->ndd : 3; @@ -271,10 +271,12 @@ static int maxandargmaxSelectHwAxes (maxandargmax_ctx* ctx){ */ for(i=0;indh;i++){ + maxV = 0; + for(j=0;jnds;j++){ if(!axisInSet(j, ctx->hwAxisList, i, 0) && !axisInSet(j, ctx->reduxList, ctx->ndr, 0) && - ctx->src->dimensions[j] > maxV){ + ctx->src->dimensions[j] >= maxV){ maxV = ctx->src->dimensions[j]; maxI = j; } @@ -707,7 +709,7 @@ static int maxandargmaxSchedule (maxandargmax_ctx* ctx){ gaIFLInit(&factCS[i]); size_t warpMod = dims[i]%warpSize; - if(bestWarpMod>0 && (warpMod==0 || warpMod>bestWarpMod)){ + if(bestWarpMod>0 && (warpMod==0 || warpMod>=bestWarpMod)){ bestWarpAxis = i; bestWarpMod = warpMod; } @@ -730,7 +732,6 @@ static int maxandargmaxSchedule (maxandargmax_ctx* ctx){ * 2.0 it will factorize guaranteed. */ - gaIFLInit(&factCS[i]); slack[i] += 0.1; } } diff --git a/src/util/integerfactoring.c b/src/util/integerfactoring.c index 4ac0fca73e..3667a317d7 100644 --- a/src/util/integerfactoring.c +++ b/src/util/integerfactoring.c @@ -58,6 +58,37 @@ static uint64_t gaIPowMod (uint64_t x, uint64_t a, uint64_t m); static int gaIFactorizeNextPow2(uint64_t n, ga_factor_list* fl); +/** + * @brief Satisfy individual product limits on "from" by moving factors to + * corresponding "to" list. + */ + +static void gaIFLScheduleSatisfyInd(const int n, + ga_factor_list* from, + ga_factor_list* to, + const uint64_t* maxInd); + +/** + * @brief Satisfy global product limit on "from" by moving factors to + * corresponding "to" list. + */ + +static void gaIFLScheduleSatisfyTot(const int n, + ga_factor_list* from, + ga_factor_list* to, + const uint64_t maxTot); + +/** + * @brief Optimize "to" by moving factors from "from", under both individual + * and global limits. 
+ */ + +static void gaIFLScheduleOpt(const int n, + ga_factor_list* from, + ga_factor_list* to, + const uint64_t maxTot, + const uint64_t* maxInd); + /** @@ -492,48 +523,88 @@ void gaIFLInit(ga_factor_list* fl){ memset(fl, 0, sizeof(*fl)); } -int gaIFLAddFactors(ga_factor_list* fl, uint64_t f, uint8_t p){ +int gaIFLFull(ga_factor_list* fl){ + return fl->d >= 15;/* Strictly speaking, fl->d never exceeds 15. */ +} + +int gaIFLAddFactors(ga_factor_list* fl, uint64_t f, int p){ int i; - /* Fast case: We're adding 0 powers of f. */ - if(p == 0){ + /** + * Fast case: We're adding 0 powers of f, or any powers of 1. The + * value of the factor list (and the integer it represents) is thus + * unchanged. + */ + + if(p == 0 || f == 1){ return 1; } - for(i=0;i<15;i++){ + /** + * Otherwise, the factor list has to change. We scan linearly the factor + * list for either a pre-existing spot or an insertion spot. Scanning + * linearly over a 15-element array is faster and less complex than binary + * search. + */ + + for(i=0;id;i++){ if(fl->f[i] == f){ - /* Fast case: Factor already in list. */ + /** + * Factor is already in list. + */ + fl->p[i] += p; + if(fl->p[i] == 0){ + /** + * We removed all factors f. Bump leftwards the remainder to + * maintain sorted order. + */ + + memmove(&fl->f[i], &fl->f[i+1], sizeof(fl->f[i])*(fl->d-i)); + memmove(&fl->p[i], &fl->p[i+1], sizeof(fl->p[i])*(fl->d-i)); + fl->d--; + } return 1; }else if(fl->f[i] > f){ /* Inject the factor at this place in order to keep list sorted, if we have the capacity. */ - if(fl->f[14] != 0){ + if(gaIFLFull(fl)){ /* We can't bump the list rightwards, it's full already! */ return 0; } - memmove(&fl->f[i+1], &fl->f[i], sizeof(fl->f[i])*(14-i)); - memmove(&fl->p[i+1], &fl->p[i], sizeof(fl->p[i])*(14-i)); - fl->f[i] = f; - fl->p[i] = p; - return 1; - }else if(fl->f[i] == 0){ - /* This is the biggest factor so far, and a slot still remains. 
*/ + memmove(&fl->f[i+1], &fl->f[i], sizeof(fl->f[i])*(fl->d-i)); + memmove(&fl->p[i+1], &fl->p[i], sizeof(fl->p[i])*(fl->d-i)); fl->f[i] = f; fl->p[i] = p; + fl->d++; return 1; } } - return 0; + /** + * We looked at every factor in the list and f is strictly greater than + * all of them. + * + * If the list is full, we cannot insert f, but if it isn't, we can simply + * tack it at the end. + */ + + if(gaIFLFull(fl)){ + return 0; + }else{ + fl->f[fl->d] = f; + fl->p[fl->d] = p; + fl->d++; + return 1; + } } int gaIFLGetFactorPower(ga_factor_list* fl, uint64_t f){ int i; - for(i=0;i<15;i++){ + for(i=0;id;i++){ if(fl->f[i] == f){ return fl->p[i]; } @@ -546,7 +617,7 @@ uint64_t gaIFLGetProduct(const ga_factor_list* fl){ uint64_t p = 1; int i, j; - for(i=0;i<15;i++){ + for(i=0;id;i++){ for(j=0;jp[i];j++){ p *= fl->f[i]; } @@ -556,16 +627,62 @@ uint64_t gaIFLGetProduct(const ga_factor_list* fl){ } uint64_t gaIFLGetGreatestFactor(const ga_factor_list* fl){ - uint64_t f = 1; + return fl->d ? fl->f[fl->d-1] : 1; +} + +uint64_t gaIFLGetSmallestFactor(const ga_factor_list* fl){ + return fl->d ? fl->f[0] : 1; +} + +static uint64_t gaIFLGetProductv(int n, const ga_factor_list* fl){ + uint64_t p = 1; int i; - for(i=0;i<15;i++){ - if(fl->p[i] > 0 && f < fl->f[i]){ - f = fl->f[i]; + for(i=0;i 0){ + hasFactors = 1; + currF = gaIFLGetGreatestFactor(fl+i); + if(f <= currF){ + f = currF; + if(idx){*idx = i;} + } + } + } + + return hasFactors ? f : 1; +} + +static uint64_t gaIFLGetSmallestFactorv(int n, const ga_factor_list* fl, int* idx){ + uint64_t f = -1, currF; + int i, hasFactors=0; + + if(idx){*idx = 0;} + + for(i=0;i 0){ + hasFactors = 1; + currF = gaIFLGetSmallestFactor(fl+i); + if(f >= currF){ + f = currF; + if(idx){*idx = i;} + } } } - return f; + return hasFactors ? f : 1; } int gaIFLsnprintf(char* str, size_t size, const ga_factor_list* fl){ @@ -576,7 +693,7 @@ int gaIFLsnprintf(char* str, size_t size, const ga_factor_list* fl){ char* ptr = size ? 
str : NULL; /* Loop over all factors and spit them out. */ - for(i=0;i<15;i++){ + for(i=0;id;i++){ for(j=0;jp[i];j++){ total += snprintf(ptr, left, "%llu*", (unsigned long long)fl->f[i]); if(ptr){ @@ -608,29 +725,19 @@ int gaIFLsnprintf(char* str, size_t size, const ga_factor_list* fl){ return total; } -void gaIFLSchedule(const unsigned n, - const uint64_t maxBtot, - const uint64_t* maxBind, - const uint64_t maxGtot, - const uint64_t* maxGind, - ga_factor_list* factBS, - ga_factor_list* factGS, - ga_factor_list* factCS){ -#if 0 - printf("BS: (%6llu, %6llu, %6llu)\n", - (unsigned long long)gaIFLGetProduct(&factBS[0]), - (unsigned long long)gaIFLGetProduct(&factBS[1]), - (unsigned long long)gaIFLGetProduct(&factBS[2])); - printf("GS: (%6llu, %6llu, %6llu)\n", - (unsigned long long)gaIFLGetProduct(&factGS[0]), - (unsigned long long)gaIFLGetProduct(&factGS[1]), - (unsigned long long)gaIFLGetProduct(&factGS[2])); - printf("CS: (%6llu, %6llu, %6llu)\n", - (unsigned long long)gaIFLGetProduct(&factCS[0]), - (unsigned long long)gaIFLGetProduct(&factCS[1]), - (unsigned long long)gaIFLGetProduct(&factCS[2])); - printf("\n"); -#endif +void gaIFLSchedule(const int n, + const uint64_t maxBtot, + const uint64_t* maxBind, + const uint64_t maxGtot, + const uint64_t* maxGind, + ga_factor_list* factBS, + ga_factor_list* factGS, + ga_factor_list* factCS){ + /** + * If we have zero dimensions, the scheduling job is easy. + */ + + if(n<=0){return;} /** * First, we move factors from factBS[i] and factGS[i] to factCS[i], in @@ -638,22 +745,137 @@ void gaIFLSchedule(const unsigned n, * maxBind[i] and maxGind[i] respectively. */ + gaIFLScheduleSatisfyInd(n, factBS, factCS, maxBind); + gaIFLScheduleSatisfyInd(n, factGS, factCS, maxGind); + /** * Then we move out more factors from factBS[i] and factGS[i], in order of - * smallest to largest, until their common product is at or below maxBtot + * smallest to largest, until their common product is at or below maxBtot * and maxGtot respectively. 
*/ + gaIFLScheduleSatisfyTot(n, factBS, factCS, maxBtot); + gaIFLScheduleSatisfyTot(n, factGS, factCS, maxGtot); + /** * At this point, the scheduling is guaranteed to be valid, but may be * nowhere close to optimal. * * So we start moving in factors from factCS[i] to factBS[i], in order of * largest to smallest, while remaining below maxBtot and maxBind[i]. + * + * Lastly, we move in factors from factCS[i] to factBG[i], in order of + * largest to smallest, while remaining below maxGtot and maxGind[i]. */ + gaIFLScheduleOpt(n, factCS, factBS, maxBtot, maxBind); + gaIFLScheduleOpt(n, factCS, factGS, maxGtot, maxGind); +} + +static void gaIFLScheduleSatisfyInd(const int n, + ga_factor_list* from, + ga_factor_list* to, + const uint64_t* maxInd){ + int i; + uint64_t f, p; + + for(i=0;i maxInd[i]){ + if(p%f){ + f = gaIFLGetGreatestFactor(from+i); + } + p /= f; + gaIFLAddFactors(from+i, f, -1); + gaIFLAddFactors(to +i, f, +1); + } + } +} + +static void gaIFLScheduleSatisfyTot(const int n, + ga_factor_list* from, + ga_factor_list* to, + const uint64_t maxTot){ + int a, i, c; + uint64_t f, p; + + p = gaIFLGetProductv(n, from); + a = 0; + + while(p > maxTot){ + f = gaIFLGetSmallestFactorv(n, from, &a); + c = gaIFLGetFactorPower (from+a, f); + + for(i=c-1;i>=0 && p>maxTot;i--){ + p /= f; + gaIFLAddFactors(from+a, f, -1); + gaIFLAddFactors(to +a, f, +1); + } + } +} + +static void gaIFLScheduleOpt(const int n, + ga_factor_list* from, + ga_factor_list* to, + const uint64_t maxTot, + const uint64_t* maxInd){ + int i, j, k; + uint64_t maxFTot, maxFInd, currF, f; + uint64_t pInd[n], pTot = 1; + + /* Muzzle compiler about a random function being unused. */ + (void)gaIFLGetGreatestFactorv; + /** - * Lastly, we move in factors from factCS[i] to factBG[i], in order of - * largest to smallest, while remaining below maxGtot and maxGind[i]. + * Check whether optimization is possible. 
*/ + + for(i=0;i=0;j--){ + currF = from[i].f[j]; + + if(currF <= maxFTot && currF <= maxFInd && currF >= f){ + f = currF; + k = i; + break; + } + } + } + + if(k == -1){ + break; + } + + gaIFLAddFactors(from+k, f, -1); + gaIFLAddFactors(to +k, f, +1); + pInd[k] *= f; + pTot *= f; + maxFTot = maxTot/pTot; + }while(maxFTot>1 && f>1); } diff --git a/src/util/integerfactoring.h b/src/util/integerfactoring.h index b78b17ec51..c6c3d6cd04 100644 --- a/src/util/integerfactoring.h +++ b/src/util/integerfactoring.h @@ -46,6 +46,7 @@ typedef struct ga_factor_list_ ga_factor_list; struct ga_factor_list_{ uint64_t f[16];/* Factors */ uint8_t p[16];/* Powers of factors */ + int d; /* Number of distinct factors. */ }; @@ -112,13 +113,14 @@ int gaIIsPrime(uint64_t n); * required. * * @param [in] n The integer to be factorized. Must be >0. - * @param [in] maxN The "slack" parameter. The factor list returned will not - * have a product that exceeds this number. + * @param [in] maxN The "slack" parameter. The factor list returned will + * not have a product that exceeds this number. * @param [in] k The k-smoothness constraint. k is the largest * acceptable factor in the output factor list. The * factorizer will, effectively, treat any number all of * whose prime factors exceed k as a prime. - * @param [out] fl The output factor list. + * @param [out] fl The output factor list. Does *NOT* need to be + * initialized. * @return Non-zero if a factorization is found that satisfies both slack and * smoothness constraints; Zero if no such factorization is found. * If this function returns zero, the last factor in the factor @@ -135,6 +137,16 @@ int gaIFactorize(uint64_t n, uint64_t maxN, uint64_t k, ga_factor_list* fl) void gaIFLInit(ga_factor_list* fl); +/** + * @brief Reports whether another *distinct* factor can be added to the factor + * list safely. + * + * @return Returns zero if there are less than 15 distinct factors in the list + * and non-zero otherwise. 
+ */ + +int gaIFLFull(ga_factor_list* fl); + /** * @brief Add a factor f with power p to the factor list. * @@ -147,7 +159,7 @@ void gaIFLInit(ga_factor_list* fl); * @return Non-zero if factor successfully added; Zero otherwise. */ -int gaIFLAddFactors(ga_factor_list* fl, uint64_t f, uint8_t p); +int gaIFLAddFactors(ga_factor_list* fl, uint64_t f, int p); /** * @brief Get the power of a given factor within a factor list. @@ -170,6 +182,12 @@ uint64_t gaIFLGetProduct(const ga_factor_list* fl); uint64_t gaIFLGetGreatestFactor(const ga_factor_list* fl); +/** + * @brief Get the smallest factor in the factors list. + */ + +uint64_t gaIFLGetSmallestFactor(const ga_factor_list* fl); + /** * @brief Print out the factor list in a human-readable form, snprintf()-style. * @@ -206,7 +224,7 @@ int gaIFLsnprintf(char* str, size_t size, const ga_factor_list* fl); * @param [in,out] factCS The chunk size for dimensions 0..n-1, as a factor list. */ -void gaIFLSchedule(const unsigned n, +void gaIFLSchedule(const int n, const uint64_t maxBtot, const uint64_t* maxBind, const uint64_t maxGtot, diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 6fba722db1..c83e5b9772 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -76,7 +76,7 @@ START_TEST(test_reduction){ */ size_t i,j,k; - size_t dims[3] = {32,50,79}; + size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const unsigned reduxList[] = {0,2}; @@ -142,6 +142,17 @@ START_TEST(test_reduction){ ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); } + + /** + * Deallocate. 
+ */ + + free(pSrc); + free(pMax); + free(pArgmax); + GpuArray_clear(&gaSrc); + GpuArray_clear(&gaMax); + GpuArray_clear(&gaArgmax); }END_TEST START_TEST(test_idxtranspose){ @@ -155,13 +166,15 @@ START_TEST(test_idxtranspose){ */ size_t i,j,k; - size_t dims[3] = {32,50,79}; - size_t prodDims = dims[0]*dims[1]*dims[2]; + size_t dims[3] = {32,50,79}; + size_t prodDims = dims[0]*dims[1]*dims[2]; + size_t rdxDims[1] = {50}; + size_t rdxProdDims = rdxDims[0]; const unsigned reduxList[] = {2,0}; - float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); - float* pMax = calloc(1, sizeof(*pMax) * dims[1] ); - size_t* pArgmax = calloc(1, sizeof(*pArgmax) * dims[1] ); + float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); + float* pMax = calloc(1, sizeof(*pMax) * rdxProdDims); + size_t* pArgmax = calloc(1, sizeof(*pArgmax) * rdxProdDims); ck_assert_ptr_ne(pSrc, NULL); ck_assert_ptr_ne(pMax, NULL); @@ -185,9 +198,9 @@ START_TEST(test_idxtranspose){ GpuArray gaMax; GpuArray gaArgmax; - ga_assert_ok(GpuArray_empty(&gaSrc, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaMax, ctx, GA_FLOAT, 1, &dims[1], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaArgmax, ctx, GA_SIZE, 1, &dims[1], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaSrc, ctx, GA_FLOAT, 3, dims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaMax, ctx, GA_FLOAT, 1, rdxDims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaArgmax, ctx, GA_SIZE, 1, rdxDims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ @@ -195,8 +208,8 @@ START_TEST(test_idxtranspose){ ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 2, reduxList)); - ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *dims[1], &gaMax)); - ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*dims[1], &gaArgmax)); + ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *rdxProdDims, &gaMax)); + ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*rdxProdDims, &gaArgmax)); /** @@ -221,16 +234,129 @@ START_TEST(test_idxtranspose){ ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); } + + /** + * Deallocate. + */ + + free(pSrc); + free(pMax); + free(pArgmax); + GpuArray_clear(&gaSrc); + GpuArray_clear(&gaMax); + GpuArray_clear(&gaArgmax); +}END_TEST + +START_TEST(test_veryhighrank){ + pcgSeed(1); + + /** + * Here we test a reduction of a random 8D tensor on four dimensions. + */ + + size_t i,j,k,l,m,n,o,p; + size_t dims [8] = {1171,373,2,1,2,1,2,1}; + size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; + size_t rdxDims[4] = {1171,373,1,2}; + size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; + const unsigned reduxList[] = {2,4,7,5}; + + float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); + float* pMax = calloc(1, sizeof(*pMax) * rdxProdDims); + size_t* pArgmax = calloc(1, sizeof(*pArgmax) * rdxProdDims); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pMax, NULL); + ck_assert_ptr_ne(pArgmax, NULL); + + + /** + * Initialize source data. + */ + + for(i=0;i gtMax){ + gtMax = v; + gtArgmax = (((k)*dims[4] + m)*dims[7] + p)*dims[5] + n; + } + } + } + } + } + + size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; + ck_assert_msg(gtMax == pMax[dstIdx], "Max value mismatch!"); + ck_assert_msg(gtArgmax == pArgmax[dstIdx], "Argmax value mismatch!"); + } + } + } + } + + + /** + * Deallocate. 
+ */ + + free(pSrc); + free(pMax); + free(pArgmax); + GpuArray_clear(&gaSrc); + GpuArray_clear(&gaMax); + GpuArray_clear(&gaArgmax); }END_TEST Suite *get_suite(void) { Suite *s = suite_create("reduction"); TCase *tc = tcase_create("basic"); tcase_add_checked_fixture(tc, setup, teardown); - tcase_set_timeout(tc, 8.0); + tcase_set_timeout(tc, 15.0); tcase_add_test(tc, test_reduction); tcase_add_test(tc, test_idxtranspose); + tcase_add_test(tc, test_veryhighrank); suite_add_tcase(s, tc); return s; diff --git a/tests/check_util_integerfactoring.c b/tests/check_util_integerfactoring.c index 0815e03740..5205107081 100644 --- a/tests/check_util_integerfactoring.c +++ b/tests/check_util_integerfactoring.c @@ -7,12 +7,12 @@ #include #include "util/integerfactoring.h" + /** * Integer Factorization test */ -START_TEST(test_integerfactorization) -{ +START_TEST(test_integerfactorization){ ga_factor_list fl; /** @@ -56,8 +56,221 @@ START_TEST(test_integerfactorization) */ ck_assert_int_ne(gaIFactorize( 2196095973992233039ULL, 2196095973992233039ULL*1.01, 64, &fl), 0); -} -END_TEST +}END_TEST + +START_TEST(test_scheduler){ + /* We use here the CUDA limits of a CC 3.0 GPU as an example. */ + uint64_t maxBTot = 1024, maxBInd[] = { 1024, 1024, 64}, + maxGTot = 0xFFFFFFFF, maxGInd[] = {2147483647, 65535, 65535}, + warpSize = 32; + + int warpAxis; + uint64_t dims[3]; + ga_factor_list factBS[3], factGS[3], factCS[3]; + unsigned long long intbBS[3], intbGS[3], intbCS[3]; + unsigned long long intaBS[3], intaGS[3], intaCS[3]; + + /** + * NOTE: If you want to view befores-and-afters of scheduling, #define PRINT + * to something non-0. + */ +#define PRINT 0 + + /** + * + * Testcase: (895,1147,923) job, warpSize on axis 0. + * + */ + + { + warpAxis = 0; + dims[0] = 895; + dims[1] = 1141; + dims[2] = 923; + dims[warpAxis] = (dims[warpAxis]+warpSize-1) / warpSize; + + /** + * Factorization job must be successful. 
+ */ + + ck_assert(gaIFactorize(warpAxis==0?warpSize:1, 0, maxBInd[0], factBS+0)); + ck_assert(gaIFactorize(warpAxis==1?warpSize:1, 0, maxBInd[1], factBS+1)); + ck_assert(gaIFactorize(warpAxis==2?warpSize:1, 0, maxBInd[2], factBS+2)); + ck_assert(gaIFactorize( 1, 0, maxBInd[0], factGS+0)); + ck_assert(gaIFactorize( 1, 0, maxBInd[1], factGS+1)); + ck_assert(gaIFactorize( 1, 0, maxBInd[2], factGS+2)); + ck_assert(gaIFactorize( dims[0], dims[0]*1.1, maxBInd[0], factCS+0)); + ck_assert(gaIFactorize( dims[1], dims[1]*1.1, maxBInd[1], factCS+1)); + ck_assert(gaIFactorize( dims[2], dims[2]*1.1, maxBInd[2], factCS+2)); + + intbBS[0] = gaIFLGetProduct(factBS+0); + intbBS[1] = gaIFLGetProduct(factBS+1); + intbBS[2] = gaIFLGetProduct(factBS+2); + intbGS[0] = gaIFLGetProduct(factGS+0); + intbGS[1] = gaIFLGetProduct(factGS+1); + intbGS[2] = gaIFLGetProduct(factGS+2); + intbCS[0] = gaIFLGetProduct(factCS+0); + intbCS[1] = gaIFLGetProduct(factCS+1); + intbCS[2] = gaIFLGetProduct(factCS+2); + + /** + * Ensure that factorization only *increases* the size of the problem. + */ + + ck_assert_uint_ge(intbCS[0], dims[0]); + ck_assert_uint_ge(intbCS[1], dims[1]); + ck_assert_uint_ge(intbCS[2], dims[2]); + + + /** + * Run scheduler. 
+ */ + +#if PRINT + printf("Before:\n"); + printf("BS: (%6llu, %6llu, %6llu)\n", intbBS[0], intbBS[1], intbBS[2]); + printf("GS: (%6llu, %6llu, %6llu)\n", intbGS[0], intbGS[1], intbGS[2]); + printf("CS: (%6llu, %6llu, %6llu)\n", intbCS[0], intbCS[1], intbCS[2]); +#endif + gaIFLSchedule(3, maxBTot, maxBInd, maxGTot, maxGInd, factBS, factGS, factCS); + intaBS[0] = gaIFLGetProduct(factBS+0); + intaBS[1] = gaIFLGetProduct(factBS+1); + intaBS[2] = gaIFLGetProduct(factBS+2); + intaGS[0] = gaIFLGetProduct(factGS+0); + intaGS[1] = gaIFLGetProduct(factGS+1); + intaGS[2] = gaIFLGetProduct(factGS+2); + intaCS[0] = gaIFLGetProduct(factCS+0); + intaCS[1] = gaIFLGetProduct(factCS+1); + intaCS[2] = gaIFLGetProduct(factCS+2); +#if PRINT + printf("After:\n"); + printf("BS: (%6llu, %6llu, %6llu)\n", intaBS[0], intaBS[1], intaBS[2]); + printf("GS: (%6llu, %6llu, %6llu)\n", intaGS[0], intaGS[1], intaGS[2]); + printf("CS: (%6llu, %6llu, %6llu)\n", intaCS[0], intaCS[1], intaCS[2]); +#endif + + /** + * Scheduling is only about moving factors between block/grid/chunk factor + * lists. Therefore, the three dimensions must not have changed size. + */ + + ck_assert_uint_eq(intbBS[0]*intbGS[0]*intbCS[0], intaBS[0]*intaGS[0]*intaCS[0]); + ck_assert_uint_eq(intbBS[1]*intbGS[1]*intbCS[1], intaBS[1]*intaGS[1]*intaCS[1]); + ck_assert_uint_eq(intbBS[2]*intbGS[2]*intbCS[2], intaBS[2]*intaGS[2]*intaCS[2]); + + /** + * Verify that the individual limits and global limits on threads in a + * block and blocks in a grid are met. + */ + + ck_assert_uint_le(intaBS[0], maxBInd[0]); + ck_assert_uint_le(intaBS[1], maxBInd[1]); + ck_assert_uint_le(intaBS[2], maxBInd[2]); + ck_assert_uint_le(intaGS[0], maxGInd[0]); + ck_assert_uint_le(intaGS[1], maxGInd[1]); + ck_assert_uint_le(intaGS[2], maxGInd[2]); + ck_assert_uint_le(intaBS[0]*intaBS[1]*intaBS[2], maxBTot); + ck_assert_uint_le(intaGS[0]*intaGS[1]*intaGS[2], maxGTot); + } + + + /** + * + * Testcase: (1,1,121632959) job, warpSize on axis 2. 
+ * + */ + + { + warpAxis = 2; + dims[0] = 1; + dims[1] = 1; + dims[2] = 121632959; + dims[warpAxis] = (dims[warpAxis]+warpSize-1) / warpSize; + + /** + * Factorization job must be successful. + */ + + ck_assert(gaIFactorize(warpAxis==0?warpSize:1, 0, maxBInd[0], factBS+0)); + ck_assert(gaIFactorize(warpAxis==1?warpSize:1, 0, maxBInd[1], factBS+1)); + ck_assert(gaIFactorize(warpAxis==2?warpSize:1, 0, maxBInd[2], factBS+2)); + ck_assert(gaIFactorize( 1, 0, maxBInd[0], factGS+0)); + ck_assert(gaIFactorize( 1, 0, maxBInd[1], factGS+1)); + ck_assert(gaIFactorize( 1, 0, maxBInd[2], factGS+2)); + ck_assert(gaIFactorize( dims[0], dims[0]*1.1, maxBInd[0], factCS+0)); + ck_assert(gaIFactorize( dims[1], dims[1]*1.1, maxBInd[1], factCS+1)); + ck_assert(gaIFactorize( dims[2], dims[2]*1.1, maxBInd[2], factCS+2)); + + intbBS[0] = gaIFLGetProduct(factBS+0); + intbBS[1] = gaIFLGetProduct(factBS+1); + intbBS[2] = gaIFLGetProduct(factBS+2); + intbGS[0] = gaIFLGetProduct(factGS+0); + intbGS[1] = gaIFLGetProduct(factGS+1); + intbGS[2] = gaIFLGetProduct(factGS+2); + intbCS[0] = gaIFLGetProduct(factCS+0); + intbCS[1] = gaIFLGetProduct(factCS+1); + intbCS[2] = gaIFLGetProduct(factCS+2); + + /** + * Ensure that factorization only *increases* the size of the problem. + */ + + ck_assert_uint_ge(intbCS[0], dims[0]); + ck_assert_uint_ge(intbCS[1], dims[1]); + ck_assert_uint_ge(intbCS[2], dims[2]); + + + /** + * Run scheduler. 
+ */ + +#if PRINT + printf("Before:\n"); + printf("BS: (%6llu, %6llu, %6llu)\n", intbBS[0], intbBS[1], intbBS[2]); + printf("GS: (%6llu, %6llu, %6llu)\n", intbGS[0], intbGS[1], intbGS[2]); + printf("CS: (%6llu, %6llu, %6llu)\n", intbCS[0], intbCS[1], intbCS[2]); +#endif + gaIFLSchedule(3, maxBTot, maxBInd, maxGTot, maxGInd, factBS, factGS, factCS); + intaBS[0] = gaIFLGetProduct(factBS+0); + intaBS[1] = gaIFLGetProduct(factBS+1); + intaBS[2] = gaIFLGetProduct(factBS+2); + intaGS[0] = gaIFLGetProduct(factGS+0); + intaGS[1] = gaIFLGetProduct(factGS+1); + intaGS[2] = gaIFLGetProduct(factGS+2); + intaCS[0] = gaIFLGetProduct(factCS+0); + intaCS[1] = gaIFLGetProduct(factCS+1); + intaCS[2] = gaIFLGetProduct(factCS+2); +#if PRINT + printf("After:\n"); + printf("BS: (%6llu, %6llu, %6llu)\n", intaBS[0], intaBS[1], intaBS[2]); + printf("GS: (%6llu, %6llu, %6llu)\n", intaGS[0], intaGS[1], intaGS[2]); + printf("CS: (%6llu, %6llu, %6llu)\n", intaCS[0], intaCS[1], intaCS[2]); +#endif + + /** + * Scheduling is only about moving factors between block/grid/chunk factor + * lists. Therefore, the three dimensions must not have changed size. + */ + + ck_assert_uint_eq(intbBS[0]*intbGS[0]*intbCS[0], intaBS[0]*intaGS[0]*intaCS[0]); + ck_assert_uint_eq(intbBS[1]*intbGS[1]*intbCS[1], intaBS[1]*intaGS[1]*intaCS[1]); + ck_assert_uint_eq(intbBS[2]*intbGS[2]*intbCS[2], intaBS[2]*intaGS[2]*intaCS[2]); + + /** + * Verify that the individual limits and global limits on threads in a + * block and blocks in a grid are met. 
+ */ + + ck_assert_uint_le(intaBS[0], maxBInd[0]); + ck_assert_uint_le(intaBS[1], maxBInd[1]); + ck_assert_uint_le(intaBS[2], maxBInd[2]); + ck_assert_uint_le(intaGS[0], maxGInd[0]); + ck_assert_uint_le(intaGS[1], maxGInd[1]); + ck_assert_uint_le(intaGS[2], maxGInd[2]); + ck_assert_uint_le(intaBS[0]*intaBS[1]*intaBS[2], maxBTot); + ck_assert_uint_le(intaGS[0]*intaGS[1]*intaGS[2], maxGTot); + } +}END_TEST @@ -66,6 +279,7 @@ Suite *get_suite(void){ TCase *tc = tcase_create("All"); tcase_add_test(tc, test_integerfactorization); + tcase_add_test(tc, test_scheduler); suite_add_tcase(s, tc); From 3ea75ae08dc82693ae2a1013af5c2962986df5a5 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Thu, 29 Sep 2016 23:35:37 -0400 Subject: [PATCH 034/597] Add #line 1 to CUDA preamble. This causes the line number information in debug printouts and debug information by NVRTC to match with the line numbers expected by the user. --- src/gpuarray_buffer_cuda.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index cfbc1e672f..a19f33f691 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -308,7 +308,8 @@ static const char CUDA_PREAMBLE[] = "#define store_half(p, v) (*(p) = __float2half_rn(v))\n" "#define GA_DECL_SHARED_PARAM(type, name)\n" "#define GA_DECL_SHARED_BODY(type, name) extern __shared__ type name[];\n" - "#define GA_WARP_SIZE warpSize\n"; + "#define GA_WARP_SIZE warpSize\n" + "#line 1\n"; /* XXX: add complex, quads, longlong */ /* XXX: add vector types */ From 29bf24945f7f493f5fb68d356f3e84ebda65665f Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Thu, 29 Sep 2016 23:37:23 -0400 Subject: [PATCH 035/597] Add NULL check to gpudata_release(). This gives it the same ignore semantics as free(NULL) and thus the same consequent simplifications in cleanup code. 
--- src/gpuarray_buffer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index c8fb2008de..e1c2e12200 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -100,7 +100,9 @@ void gpudata_retain(gpudata *b) { } void gpudata_release(gpudata *b) { - ((partial_gpudata *)b)->ctx->ops->buffer_release(b); + if(b){ + ((partial_gpudata *)b)->ctx->ops->buffer_release(b); + } } int gpudata_share(gpudata *a, gpudata *b, int *ret) { From 33046a7bc1c08062019b71e7a4a51b4cc900a265 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Tue, 4 Oct 2016 18:37:56 -0400 Subject: [PATCH 036/597] Quotes around check executable name. --- tests/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 0bbf109b05..ef772752de 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -49,12 +49,12 @@ target_link_libraries(check_util ${CHECK_LIBRARIES} gpuarray) add_test(test_util "${CMAKE_CURRENT_BINARY_DIR}/check_util") add_executable(check_util_integerfactoring main.c check_util_integerfactoring.c) -target_link_libraries(check_util_integerfactoring ${LIBS} gpuarray-static) -add_test(test_util_integerfactoring ${CMAKE_CURRENT_BINARY_DIR}/check_util_integerfactoring) +target_link_libraries(check_util_integerfactoring ${CHECK_LIBRARIES} gpuarray-static) +add_test(test_util_integerfactoring "${CMAKE_CURRENT_BINARY_DIR}/check_util_integerfactoring") add_executable(check_reduction main.c device.c check_reduction.c) -target_link_libraries(check_reduction ${LIBS} gpuarray) -add_test(test_reduction ${CMAKE_CURRENT_BINARY_DIR}/check_reduction) +target_link_libraries(check_reduction ${CHECK_LIBRARIES} gpuarray) +add_test(test_reduction "${CMAKE_CURRENT_BINARY_DIR}/check_reduction") add_executable(check_array main.c device.c check_array.c) target_link_libraries(check_array ${CHECK_LIBRARIES} gpuarray) From 
5c597565e724a0382e65f7c248b3ff8bfd9e5b45 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Wed, 5 Oct 2016 16:25:17 -0400 Subject: [PATCH 037/597] Fix ISO C90 declarations-after-code warnings. --- src/gpuarray_reduction.c | 60 +++++++++++++++++++++---------------- src/util/integerfactoring.c | 23 ++++++++------ 2 files changed, 48 insertions(+), 35 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 72438b5668..679e4b7648 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -662,7 +662,20 @@ static int maxandargmaxCompile (maxandargmax_ctx* ctx){ */ static int maxandargmaxSchedule (maxandargmax_ctx* ctx){ - int i; + int i; + size_t warpMod; + size_t bestWarpMod = 1; + unsigned bestWarpAxis = 0; + uint64_t maxLg; + uint64_t maxLs[3]; + uint64_t maxGg; + uint64_t maxGs[3]; + uint64_t dims [3]; + double slack[3]; + ga_factor_list factBS[3]; + ga_factor_list factGS[3]; + ga_factor_list factCS[3]; + /** * Obtain the constraints of our problem. @@ -690,17 +703,12 @@ static int maxandargmaxSchedule (maxandargmax_ctx* ctx){ * - Finding on which hardware axis is it optimal to place the warpSize factor. 
*/ - unsigned bestWarpAxis = 0; - size_t bestWarpMod = 1; - uint64_t maxLg = maxL; - uint64_t maxLs[3] = {maxL0, maxL1, maxL2}; - uint64_t maxGg = maxG; - uint64_t maxGs[3] = {maxG0, maxG1, maxG2}; - uint64_t dims [3] = {1, 1, 1 }; - double slack[3] = {1.1, 1.1, 1.1 }; - ga_factor_list factBS[3]; - ga_factor_list factGS[3]; - ga_factor_list factCS[3]; + maxLg = maxL; + maxLs[0] = maxL0, maxLs[1]=maxL1, maxLs[2]=maxL2; + maxGg = maxG; + maxGs[0] = maxG0, maxGs[1]=maxG1, maxGs[2]=maxG2; + dims[0] = dims[1] = dims[2] = 1; + slack[0] = slack[1] = slack[2] = 1.1; for(i=0;indh;i++){ dims[i] = ctx->src->dimensions[ctx->hwAxisList[i]]; @@ -708,7 +716,7 @@ static int maxandargmaxSchedule (maxandargmax_ctx* ctx){ gaIFLInit(&factGS[i]); gaIFLInit(&factCS[i]); - size_t warpMod = dims[i]%warpSize; + warpMod = dims[i]%warpSize; if(bestWarpMod>0 && (warpMod==0 || warpMod>=bestWarpMod)){ bestWarpAxis = i; bestWarpMod = warpMod; @@ -761,6 +769,8 @@ static int maxandargmaxSchedule (maxandargmax_ctx* ctx){ */ static int maxandargmaxInvoke (maxandargmax_ctx* ctx){ + void* args[11]; + /** * Argument Marshalling. This the grossest gross thing in here. 
*/ @@ -776,19 +786,17 @@ static int maxandargmaxInvoke (maxandargmax_ctx* ctx){ ctx->dstMax->strides, flags, 0); ctx->dstArgmaxStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), ctx->dstArgmax->strides, flags, 0); - void* args[] = { - (void*) ctx->src->data, - (void*)&ctx->src->offset, - (void*) ctx->srcStepsGD, - (void*) ctx->srcSizeGD, - (void*) ctx->chunkSizeGD, - (void*) ctx->dstMax->data, - (void*)&ctx->dstMax->offset, - (void*) ctx->dstMaxStepsGD, - (void*) ctx->dstArgmax->data, - (void*)&ctx->dstArgmax->offset, - (void*) ctx->dstArgmaxStepsGD - }; + args[ 0] = (void*) ctx->src->data; + args[ 1] = (void*)&ctx->src->offset; + args[ 2] = (void*) ctx->srcStepsGD; + args[ 3] = (void*) ctx->srcSizeGD; + args[ 4] = (void*) ctx->chunkSizeGD; + args[ 5] = (void*) ctx->dstMax->data; + args[ 6] = (void*)&ctx->dstMax->offset; + args[ 7] = (void*) ctx->dstMaxStepsGD; + args[ 8] = (void*) ctx->dstArgmax->data; + args[ 9] = (void*)&ctx->dstArgmax->offset; + args[10] = (void*) ctx->dstArgmaxStepsGD; if(ctx->srcStepsGD && ctx->srcSizeGD && diff --git a/src/util/integerfactoring.c b/src/util/integerfactoring.c index 3667a317d7..b3ff337cc6 100644 --- a/src/util/integerfactoring.c +++ b/src/util/integerfactoring.c @@ -211,6 +211,8 @@ static uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m){ } static uint64_t gaIPowMod (uint64_t x, uint64_t a, uint64_t m){ + uint64_t r; + /** * Special cases (order matters!): * - A modulo of 0 makes no sense and a modulo of 1 implies a return value @@ -241,7 +243,7 @@ static uint64_t gaIPowMod (uint64_t x, uint64_t a, uint64_t m){ * Otherwise, perform modular exponentiation by squaring. 
*/ - uint64_t r = 1; + r = 1; while(a){ if(a&1){ r = gaIMulMod(r, x, m); @@ -255,6 +257,13 @@ static uint64_t gaIPowMod (uint64_t x, uint64_t a, uint64_t m){ } int gaIIsPrime (uint64_t n){ + size_t i, j; + int hasNoSmallFactors, hasSmallFactors; + uint64_t r, d; + const uint64_t WITNESSES[] = {2,3,5,7,11,13,17,19,23,29,31,37}; + const int NUMWITNESSES = sizeof(WITNESSES)/sizeof(WITNESSES[0]); + + /** * Check if it is 2, the oddest prime. */ @@ -286,8 +295,8 @@ int gaIIsPrime (uint64_t n){ * Test small prime factors. */ - int hasNoSmallFactors = n%3 && n%5 && n%7 && n%11 && n%13; - int hasSmallFactors = !hasNoSmallFactors; + hasNoSmallFactors = n%3 && n%5 && n%7 && n%11 && n%13; + hasSmallFactors = !hasNoSmallFactors; if(hasSmallFactors){ return 0; } @@ -304,12 +313,8 @@ int gaIIsPrime (uint64_t n){ * integers under 2^64. */ - const uint64_t WITNESSES[] = {2,3,5,7,11,13,17,19,23,29,31,37}; - const int NUMWITNESSES = sizeof(WITNESSES)/sizeof(WITNESSES[0]); - size_t i, j; - - uint64_t r = gaICtz(n-1); - uint64_t d = (n-1)>>r; + r = gaICtz(n-1); + d = (n-1)>>r; /* For each witness... */ for(i=0;i Date: Thu, 11 Aug 2016 18:32:40 -0400 Subject: [PATCH 038/597] Fix sync() for single stream mode. 
--- src/gpuarray_buffer_cuda.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 67b931c894..b5e3c6d41c 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1388,12 +1388,16 @@ static int cuda_sync(gpudata *b) { ASSERT_BUF(b); cuda_enter(ctx); - ctx->err = cuEventSynchronize(b->wev); - if (ctx->err != CUDA_SUCCESS) - err = GA_IMPL_ERROR; - ctx->err = cuEventSynchronize(b->rev); - if (ctx->err != CUDA_SUCCESS) - err = GA_IMPL_ERROR; + if (ctx->flags & GA_CTX_SINGLE_STREAM) { + cuStreamSynchronize(ctx->s); + } else { + ctx->err = cuEventSynchronize(b->wev); + if (ctx->err != CUDA_SUCCESS) + err = GA_IMPL_ERROR; + ctx->err = cuEventSynchronize(b->rev); + if (ctx->err != CUDA_SUCCESS) + err = GA_IMPL_ERROR; + } cuda_exit(ctx); return err; } From 048546f1343c2b2c481ad5ae144d4de29040ec49 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 6 Oct 2016 15:04:15 -0400 Subject: [PATCH 039/597] Fix crash in gpucomm_free(). --- src/gpuarray_buffer_collectives.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gpuarray_buffer_collectives.c b/src/gpuarray_buffer_collectives.c index 3a55c4fcfb..3bb423307f 100644 --- a/src/gpuarray_buffer_collectives.c +++ b/src/gpuarray_buffer_collectives.c @@ -14,7 +14,9 @@ int gpucomm_new(gpucomm** comm, gpucontext* ctx, gpucommCliqueId comm_id, } void gpucomm_free(gpucomm* comm) { - gpucontext* ctx = gpucomm_context(comm); + gpucontext* ctx; + if (comm == NULL) return; + ctx = gpucomm_context(comm); if (ctx->comm_ops != NULL) ctx->comm_ops->comm_free(comm); } From 2687463620f64441a2be502bfab6cb1146c9894e Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Mon, 10 Oct 2016 15:37:38 -0400 Subject: [PATCH 040/597] Avoid using VLAs in integerfactoring.c. Use instead malloc() for systems allergic to C99 VLAs. 
--- src/util/integerfactoring.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/util/integerfactoring.c b/src/util/integerfactoring.c index b3ff337cc6..476a616b43 100644 --- a/src/util/integerfactoring.c +++ b/src/util/integerfactoring.c @@ -1,10 +1,18 @@ /* Includes */ #include #include +#include #include #include "integerfactoring.h" +/* Detect when to avoid VLAs. */ +#if defined(_MSC_VER) || defined(__STDC_NO_VLA__) +#define GA_USING_MALLOC_FOR_VLA 1 +#endif + + + /** * Static Function Prototypes @@ -826,8 +834,12 @@ static void gaIFLScheduleOpt(const int n, const uint64_t maxTot, const uint64_t* maxInd){ int i, j, k; - uint64_t maxFTot, maxFInd, currF, f; - uint64_t pInd[n], pTot = 1; + uint64_t maxFTot, maxFInd, currF, f, pTot = 1; +#if GA_USING_MALLOC_FOR_VLA + uint64_t* pInd = malloc(n * sizeof(uint64_t)); +#else + uint64_t pInd[n]; +#endif /* Muzzle compiler about a random function being unused. */ (void)gaIFLGetGreatestFactorv; @@ -883,4 +895,8 @@ static void gaIFLScheduleOpt(const int n, pTot *= f; maxFTot = maxTot/pTot; }while(maxFTot>1 && f>1); + +#if GA_USING_MALLOC_FOR_VLA + free(pInd); +#endif } From 869410141033be5a9573870e20543101cded01de Mon Sep 17 00:00:00 2001 From: Ted Ying Date: Tue, 11 Oct 2016 11:57:55 -0400 Subject: [PATCH 041/597] Add CLBlast support --- CMakeModules/FindCLBlast.cmake | 35 +++ src/CMakeLists.txt | 8 + src/gpuarray_blas_opencl_clblast.c | 469 +++++++++++++++++++++++++++++ src/gpuarray_buffer_opencl.c | 11 + src/gpuarray_reduction.c | 16 +- 5 files changed, 531 insertions(+), 8 deletions(-) create mode 100644 CMakeModules/FindCLBlast.cmake create mode 100644 src/gpuarray_blas_opencl_clblast.c diff --git a/CMakeModules/FindCLBlast.cmake b/CMakeModules/FindCLBlast.cmake new file mode 100644 index 0000000000..2ab0f54033 --- /dev/null +++ b/CMakeModules/FindCLBlast.cmake @@ -0,0 +1,35 @@ +# - Try to find CLBlast +# Once done this will define +# +# CLBLAST_FOUND - system has CLBlast +# 
CLBLAST_INCLUDE_DIRS - location of CLBlast.h +# CLBLAST_LIBRARIES - location of libCLBlast + +IF(CLBLAST_INCLUDE_DIRS) + # Already in cache, be silent + set (CLBLAST_FIND_QUIETLY TRUE) +ENDIF (CLBLAST_INCLUDE_DIRS) + +FIND_PATH(CLBLAST_ROOT_DIR + NAMES include/clblast_c.h + HINTS /usr/local/ $ENV{CLBLAST_ROOT} + DOC "CLBlast root directory.") + +FIND_PATH(_CLBLAST_INCLUDE_DIRS + NAMES clblast_c.h + HINTS ${CLBLAST_ROOT_DIR}/include + DOC "CLBlast Include directory") + +FIND_LIBRARY(_CLBLAST_LIBRARY + NAMES libclblast.so + HINTS ${CLBLAST_ROOT_DIR}/lib ${CLBLAST_ROOT_DIR}/lib64 ${CLBLAST_ROOT_DIR}/lib32 + DOC "CLBlast lib directory") + +SET(CLBLAST_INCLUDE_DIRS ${_CLBLAST_INCLUDE_DIRS}) +SET(CLBLAST_LIBRARIES ${_CLBLAST_LIBRARY}) + +# handle the QUIETLY and REQUIRED arguments and set CLBLAST_FOUND to TRUE if +# all listed variables are TRUE +INCLUDE (FindPackageHandleStandardArgs) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(CLBLAST DEFAULT_MSG CLBLAST_LIBRARIES CLBLAST_INCLUDE_DIRS) +MARK_AS_ADVANCED(CLBLAST_LIBRARIES CLBLAST_INCLUDE_DIRS) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d70726061c..1a8855d74a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -9,6 +9,9 @@ find_package(CUDA) find_package(OpenCL) if(OpenCL_FOUND) find_package(clBLAS) +if(NOT CLBLAS_FOUND) +find_package(CLBlast) +endif() endif() if(CUDA_FOUND) find_package(NCCL) @@ -144,6 +147,11 @@ if(OpenCL_FOUND) list(APPEND _GPUARRAY_SRC gpuarray_blas_opencl_clblas.c) add_definitions(-DWITH_OPENCL_CLBLAS) include_directories(${CLBLAS_INCLUDE_DIRS}) + elseif(CLBLAST_FOUND) + message(STATUS "Building with CLBLAST") + list(APPEND _GPUARRAY_SRC gpuarray_blas_opencl_clblast.c) + add_definitions(-DWITH_OPENCL_CLBLAST) + include_directories(${CLBLAST_INCLUDE_DIRS}) endif() endif() diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c new file mode 100644 index 0000000000..22eda8ed14 --- /dev/null +++ b/src/gpuarray_blas_opencl_clblast.c @@ -0,0 +1,469 @@ +#include 
"private.h" +#include "private_opencl.h" + +#include + +#include "gpuarray/buffer_blas.h" +#include "gpuarray/error.h" + +static inline Layout convO(cb_order order) { + switch (order) { + case cb_row: + return kRowMajor; + case cb_column: + return kColMajor; + default: + return -1; + } +} + +static inline Transpose convT(cb_transpose trans) { + switch (trans) { + case cb_no_trans: + return kNo; + case cb_trans: + return kYes; + case cb_conj_trans: + return kConjugate; + default: + return -1; + } +} + +static int setup(gpucontext *ctx) { + return GA_NO_ERROR; +} + +static void teardown(gpucontext *ctx) { +} + +static const char *error(gpucontext *ctx) { + return "(clblast) error in blas call, no details for now."; +} + +#define ARRAY_INIT(A) \ + if (A->ev != NULL) \ + clWaitForEvents(1, &A->ev) + +#define ARRAY_FINI(A) \ + if (A->ev != NULL) \ + clReleaseEvent(A->ev); \ + A->ev = ev; \ + clRetainEvent(A->ev) + +static int hgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, float alpha, + gpudata **A, size_t *offA, size_t lda, + gpudata **B, size_t *offB, size_t ldb, + float beta, gpudata **C, size_t *offC, size_t ldc, + size_t batchCount) { + cl_ctx *ctx = A[0]->ctx; + cl_event ev; + size_t i; + cl_uint num_ev = 0; + StatusCode err; + + for (i = 0; i < batchCount; i++) { + ARRAY_INIT(A[i]); + ARRAY_INIT(B[i]); + ARRAY_INIT(C[i]); + err = CLBlastHgemm(convO(order), convT(transA), convT(transB), M, N, K, + (half)alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, + (half)beta, C[i]->buf, offC[i], ldc, 1, &ctx->q, &ev); + if (err != kSuccess) + return GA_BLAS_ERROR; + ARRAY_FINI(A[i]); + ARRAY_FINI(B[i]); + ARRAY_FINI(C[i]); + clReleaseEvent(ev); + } + + return GA_NO_ERROR; +} + +static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, float alpha, + gpudata **A, size_t *offA, size_t lda, + gpudata **B, size_t *offB, size_t ldb, + float beta, gpudata **C, size_t 
*offC, size_t ldc, + size_t batchCount) { + cl_ctx *ctx = A[0]->ctx; + cl_event ev; + size_t i; + cl_uint num_ev = 0; + StatusCode err; + + for (i = 0; i < batchCount; i++) { + ARRAY_INIT(A[i]); + ARRAY_INIT(B[i]); + ARRAY_INIT(C[i]); + err = CLBlastSgemm(convO(order), convT(transA), convT(transB), M, N, K, + alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, + beta, C[i]->buf, offC[i], ldc, 1, &ctx->q, &ev); + if (err != kSuccess) + return GA_BLAS_ERROR; + ARRAY_FINI(A[i]); + ARRAY_FINI(B[i]); + ARRAY_FINI(C[i]); + clReleaseEvent(ev); + } + + return GA_NO_ERROR; +} + +static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, double alpha, + gpudata **A, size_t *offA, size_t lda, + gpudata **B, size_t *offB, size_t ldb, + double beta, gpudata **C, size_t *offC, size_t ldc, + size_t batchCount) { + cl_ctx *ctx = A[0]->ctx; + cl_event ev; + size_t i; + cl_uint num_ev = 0; + StatusCode err; + + for (i = 0; i < batchCount; i++) { + ARRAY_INIT(A[i]); + ARRAY_INIT(B[i]); + ARRAY_INIT(C[i]); + err = CLBlastDgemm(convO(order), convT(transA), convT(transB), M, N, K, + alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, + beta, C[i]->buf, offC[i], ldc, 1, &ctx->q, &ev); + if (err != kSuccess) + return GA_BLAS_ERROR; + ARRAY_FINI(A[i]); + ARRAY_FINI(B[i]); + ARRAY_FINI(C[i]); + clReleaseEvent(ev); + } + + return GA_NO_ERROR; +} + +static int hgemvBatch(cb_order order, cb_transpose transA, + size_t M, size_t N, float alpha, + gpudata **A, size_t *offA, size_t lda, + gpudata **x, size_t *offX, size_t incX, + float beta, gpudata **y, size_t *offY, size_t incY, + size_t batchCount, int flags) { + return GA_DEVSUP_ERROR; +} + +static int sgemvBatch(cb_order order, cb_transpose transA, + size_t M, size_t N, float alpha, + gpudata **A, size_t *offA, size_t lda, + gpudata **x, size_t *offX, size_t incX, + float beta, gpudata **y, size_t *offY, size_t incY, + size_t batchCount, int flags) { + return GA_DEVSUP_ERROR; +} + 
+static int dgemvBatch(cb_order order, cb_transpose transA, + size_t M, size_t N, double alpha, + gpudata **A, size_t *offA, size_t lda, + gpudata **x, size_t *offX, size_t incX, + double beta, gpudata **y, size_t *offY, size_t incY, + size_t batchCount, int flags) { + return GA_DEVSUP_ERROR; +} + +static int hgerBatch(cb_order order, size_t M, size_t N, float alpha, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, + gpudata **A, size_t *offA, size_t lda, + size_t batchCount, int flags) { + return GA_DEVSUP_ERROR; +} + +static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, + gpudata **A, size_t *offA, size_t lda, + size_t batchCount, int flags) { + return GA_DEVSUP_ERROR; +} + +static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, + gpudata **A, size_t *offA, size_t lda, + size_t batchCount, int flags) { + return GA_DEVSUP_ERROR; +} + +static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, + float alpha, gpudata *A, size_t offA, size_t lda, + gpudata *X, size_t offX, int incX, float beta, + gpudata *Y, size_t offY, int incY) { + cl_ctx *ctx = A->ctx; + StatusCode err; + cl_uint num_ev = 0; + cl_event ev; + + ARRAY_INIT(A); + ARRAY_INIT(X); + ARRAY_INIT(Y); + + err = CLBlastHgemv(convO(order), convT(transA), M, N, (half)alpha, + A->buf, offA, lda, X->buf, offX, incX, + (half)beta, Y->buf, offY, incY, 1, &ctx->q, &ev); + if (err != kSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(A); + ARRAY_FINI(X); + ARRAY_FINI(Y); + + clReleaseEvent(ev); + + return GA_NO_ERROR; +} + +static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, + float alpha, gpudata *A, size_t offA, size_t lda, + gpudata *X, size_t offX, int incX, float beta, + gpudata *Y, size_t offY, int incY) { + cl_ctx *ctx = A->ctx; + StatusCode 
err; + cl_uint num_ev = 0; + cl_event ev; + + ARRAY_INIT(A); + ARRAY_INIT(X); + ARRAY_INIT(Y); + + err = CLBlastSgemv(convO(order), convT(transA), M, N, alpha, + A->buf, offA, lda, X->buf, offX, incX, + beta, Y->buf, offY, incY, 1, &ctx->q, &ev); + if (err != kSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(A); + ARRAY_FINI(X); + ARRAY_FINI(Y); + + clReleaseEvent(ev); + + return GA_NO_ERROR; +} + +static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, + double alpha, gpudata *A, size_t offA, size_t lda, + gpudata *X, size_t offX, int incX, double beta, + gpudata *Y, size_t offY, int incY) { + cl_ctx *ctx = A->ctx; + StatusCode err; + cl_uint num_ev = 0; + cl_event ev; + + ARRAY_INIT(A); + ARRAY_INIT(X); + ARRAY_INIT(Y); + + err = CLBlastDgemv(convO(order), convT(transA), M, N, alpha, + A->buf, offA, lda, X->buf, offX, incX, + beta, Y->buf, offY, incY, 1, &ctx->q, &ev); + if (err != kSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(A); + ARRAY_FINI(X); + ARRAY_FINI(Y); + + clReleaseEvent(ev); + + return GA_NO_ERROR; +} + +static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, float alpha, + gpudata *A, size_t offA, size_t lda, + gpudata *B, size_t offB, size_t ldb, float beta, + gpudata *C, size_t offC, size_t ldc) { + cl_ctx *ctx = A->ctx; + StatusCode err; + cl_uint num_ev = 0; + cl_event ev; + + ARRAY_INIT(A); + ARRAY_INIT(B); + ARRAY_INIT(C); + + err = CLBlastHgemm(convO(order), convT(transA), convT(transB), M, N, K, + (half)alpha, A->buf, offA, lda, B->buf, offB, ldb, + (half)beta, C->buf, offC, ldc, 1, &ctx->q, &ev); + if (err != kSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(A); + ARRAY_FINI(B); + ARRAY_FINI(C); + + clReleaseEvent(ev); + + return GA_NO_ERROR; +} + +static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, float alpha, + gpudata *A, size_t offA, size_t lda, + gpudata *B, size_t offB, size_t ldb, float beta, + gpudata *C, 
size_t offC, size_t ldc) { + cl_ctx *ctx = A->ctx; + StatusCode err; + cl_uint num_ev = 0; + cl_event ev; + + ARRAY_INIT(A); + ARRAY_INIT(B); + ARRAY_INIT(C); + + err = CLBlastSgemm(convO(order), convT(transA), convT(transB), M, N, K, + alpha, A->buf, offA, lda, B->buf, offB, ldb, + beta, C->buf, offC, ldc, 1, &ctx->q, &ev); + if (err != kSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(A); + ARRAY_FINI(B); + ARRAY_FINI(C); + + clReleaseEvent(ev); + + return GA_NO_ERROR; +} + +static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, double alpha, + gpudata *A, size_t offA, size_t lda, + gpudata *B, size_t offB, size_t ldb, double beta, + gpudata *C, size_t offC, size_t ldc) { + cl_ctx *ctx = A->ctx; + StatusCode err; + cl_uint num_ev = 0; + cl_event ev; + + ARRAY_INIT(A); + ARRAY_INIT(B); + ARRAY_INIT(C); + + err = CLBlastDgemm(convO(order), convT(transA), convT(transB), M, N, K, + alpha, A->buf, offA, lda, B->buf, offB, ldb, + beta, C->buf, offC, ldc, 1, &ctx->q, &ev); + if (err != kSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(A); + ARRAY_FINI(B); + ARRAY_FINI(C); + + clReleaseEvent(ev); + + return GA_NO_ERROR; +} + +static int hger(cb_order order, size_t M, size_t N, float alpha, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, + gpudata *A, size_t offA, size_t lda) { + cl_ctx *ctx = X->ctx; + cl_event ev; + cl_uint num_ev = 0; + StatusCode err; + + ARRAY_INIT(X); + ARRAY_INIT(Y); + ARRAY_INIT(A); + + err = CLBlastHger(convO(order), M, N, (half)alpha, X->buf, offX, incX, + Y->buf, offY, incY, A->buf, offA, lda, 1, &ctx->q, &ev); + if (err != kSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(X); + ARRAY_FINI(Y); + ARRAY_FINI(A); + + clReleaseEvent(ev); + + return GA_NO_ERROR; +} + +static int sger(cb_order order, size_t M, size_t N, float alpha, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, + gpudata *A, size_t offA, size_t lda) { + cl_ctx *ctx = X->ctx; + 
cl_event ev; + cl_uint num_ev = 0; + StatusCode err; + + ARRAY_INIT(X); + ARRAY_INIT(Y); + ARRAY_INIT(A); + + err = CLBlastSger(convO(order), M, N, alpha, X->buf, offX, incX, + Y->buf, offY, incY, A->buf, offA, lda, 1, &ctx->q, &ev); + if (err != kSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(X); + ARRAY_FINI(Y); + ARRAY_FINI(A); + + clReleaseEvent(ev); + + return GA_NO_ERROR; +} + +static int dger(cb_order order, size_t M, size_t N, double alpha, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, + gpudata *A, size_t offA, size_t lda) { + cl_ctx *ctx = X->ctx; + cl_event ev; + cl_uint num_ev = 0; + StatusCode err; + + ARRAY_INIT(X); + ARRAY_INIT(Y); + ARRAY_INIT(A); + + err = CLBlastDger(convO(order), M, N, alpha, X->buf, offX, incX, + Y->buf, offY, incY, A->buf, offA, lda, 1, &ctx->q, &ev); + if (err != kSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(X); + ARRAY_FINI(Y); + ARRAY_FINI(A); + + clReleaseEvent(ev); + + return GA_NO_ERROR; +} + +GPUARRAY_LOCAL gpuarray_blas_ops clblast_ops = { + setup, + teardown, + error, + hgemv, + sgemv, + dgemv, + hgemm, + sgemm, + dgemm, + hger, + sger, + dger, + hgemmBatch, + sgemmBatch, + dgemmBatch, + hgemvBatch, /* TODO */ + sgemvBatch, /* TODO */ + dgemvBatch, /* TODO */ + hgerBatch, /* TODO */ + sgerBatch, /* TODO */ + dgerBatch, /* TODO */ +}; diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 1f913aed2c..c8917d3778 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -476,11 +476,13 @@ static gpudata *cl_alloc(gpucontext *c, size_t size, void *data, int flags, if (flags & GA_BUFFER_READ_ONLY) { if (flags & GA_BUFFER_WRITE_ONLY) FAIL(NULL, GA_VALUE_ERROR); + clflags &= ~CL_MEM_READ_WRITE; clflags |= CL_MEM_READ_ONLY; } if (flags & GA_BUFFER_WRITE_ONLY) { if (flags & GA_BUFFER_READ_ONLY) FAIL(NULL, GA_VALUE_ERROR); + clflags &= ~CL_MEM_READ_WRITE; clflags |= CL_MEM_WRITE_ONLY; } @@ -1113,6 +1115,10 @@ static int cl_transfer(gpudata *dst, size_t 
dstoff, #ifdef WITH_OPENCL_CLBLAS extern gpuarray_blas_ops clblas_ops; +#else +#ifdef WITH_OPENCL_CLBLAST +extern gpuarray_blas_ops clblast_ops; +#endif #endif static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, @@ -1238,9 +1244,14 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, #ifdef WITH_OPENCL_CLBLAS *((gpuarray_blas_ops **)res) = &clblas_ops; return GA_NO_ERROR; +#else +#ifdef WITH_OPENCL_CLBLAST + *((gpuarray_blas_ops **)res) = &clblast_ops; + return GA_NO_ERROR; #else *((void **)res) = NULL; return GA_DEVSUP_ERROR; +#endif #endif case GA_CTX_PROP_COMM_OPS: diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 679e4b7648..0c05e14397 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -334,17 +334,17 @@ static void maxandargmaxAppendTypedefs (maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "\n"); } static void maxandargmaxAppendPrototype (maxandargmax_ctx* ctx){ - strb_appends(&ctx->s, "KERNEL void maxandargmax(const T* src,\n"); + strb_appends(&ctx->s, "KERNEL void maxandargmax(const GLOBAL_MEM T* src,\n"); strb_appends(&ctx->s, " const X srcOff,\n"); - strb_appends(&ctx->s, " const X* srcSteps,\n"); - strb_appends(&ctx->s, " const X* srcSize,\n"); - strb_appends(&ctx->s, " const X* chunkSize,\n"); - strb_appends(&ctx->s, " T* dstMax,\n"); + strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSteps,\n"); + strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSize,\n"); + strb_appends(&ctx->s, " const GLOBAL_MEM X* chunkSize,\n"); + strb_appends(&ctx->s, " GLOBAL_MEM T* dstMax,\n"); strb_appends(&ctx->s, " const X dstMaxOff,\n"); - strb_appends(&ctx->s, " const X* dstMaxSteps,\n"); - strb_appends(&ctx->s, " X* dstArgmax,\n"); + strb_appends(&ctx->s, " const GLOBAL_MEM X* dstMaxSteps,\n"); + strb_appends(&ctx->s, " GLOBAL_MEM X* dstArgmax,\n"); strb_appends(&ctx->s, " const X dstArgmaxOff,\n"); - strb_appends(&ctx->s, " const X* dstArgmaxSteps)"); + strb_appends(&ctx->s, " 
const GLOBAL_MEM X* dstArgmaxSteps)"); } static void maxandargmaxAppendOffsets (maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "\t/* Add offsets */\n"); From 10ce9cbda187de1cf1c95433eca7ad5219957c54 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 11 Oct 2016 20:36:28 -0400 Subject: [PATCH 042/597] Remove depency on snprintf. --- src/util/integerfactoring.c | 330 +++++++++++++++++------------------- src/util/integerfactoring.h | 65 ++++--- 2 files changed, 185 insertions(+), 210 deletions(-) diff --git a/src/util/integerfactoring.c b/src/util/integerfactoring.c index 476a616b43..ff4cd1d9a0 100644 --- a/src/util/integerfactoring.c +++ b/src/util/integerfactoring.c @@ -20,7 +20,7 @@ /** * @brief Count trailing zeros of a 64-bit integer. - * + * * @param [in] n The integer whose trailing zero count is to be computed. * @return If n != 0, returns trailing zero count; Else returns 64. */ @@ -29,7 +29,7 @@ static int gaICtz(uint64_t n); /** * @brief Count leading zeros of a 64-bit integer. - * + * * @param [in] n The integer whose leading zero count is to be computed. * @return If n != 0, returns leading zero count; Else returns 64. */ @@ -38,11 +38,11 @@ static int gaIClz(uint64_t n); /** * @brief Integer Modular Multiplication. - * + * * Computes - * + * * $$a*b \pmod m$$ - * + * * efficiently for 64-bit unsigned integers a, b, m. */ @@ -50,11 +50,11 @@ static uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m); /** * @brief Integer Modular Exponentiation. - * + * * Computes - * + * * $$x^a \pmod m$$ - * + * * efficiently for 64-bit unsigned integers x, a, m. */ @@ -108,11 +108,11 @@ static int gaICtz (uint64_t n){ return n ? __builtin_ctzll(n) : 64; #else int z; - + for(z=0;z<64;z++){ if((n>>z) & 1){break;} } - + return z; #endif } @@ -122,11 +122,11 @@ static int gaIClz (uint64_t n){ return n ? 
__builtin_clzll(n) : 64; #else int z; - + for(z=63;z>=0;z--){ if((n>>z) & 1){break;} } - + return 63-z; #endif } @@ -134,7 +134,7 @@ static int gaIClz (uint64_t n){ static uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m){ #if (__GNUC__ >= 4) && defined(__x86_64__) uint64_t r; - + asm( "mul %2\n\t" "div %3\n\t" @@ -142,7 +142,7 @@ static uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m){ : "a"(a), "r"(b), "r"(m) /* Inputs */ : "cc" ); - + return r; #elif (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) /* Hardcore GCC 4.6+ optimization jazz */ @@ -150,44 +150,44 @@ static uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m){ #else const uint64_t TWOPOW32 = (uint64_t)1<<32; int i; - + a %= m; b %= m; - + if(m <= TWOPOW32){ /** * Fast path: When performing modulo arithmetic on values <= 2^32, * (a*b) % m gives the correct answer. */ - + return (a*b) % m; }else{ /** * Slow path: Have to simulate 128-bit arithmetic long division. */ - + uint64_t ah = a>>32; uint64_t al = (uint32_t)a; uint64_t bh = b>>32; uint64_t bl = (uint32_t)b; - + uint64_t ahbh = ah*bh; uint64_t ahbl = ah*bl; uint64_t albh = al*bh; uint64_t albl = al*bl; - + uint64_t md = ahbl+albh; - + uint64_t lo = albl + (md<<32); uint64_t hi = ahbh + (md>>32); - + /* Propagate carry-outs from `md` and `lo` into `hi` */ if(lo < albl){hi++;} if(md < ahbl){hi+=TWOPOW32;} - + /** * Begin 128-bit-by-64-bit remainder. - * + * * 1) Cut down `hi` mod `m`. This implements the first few iterations * of a shift-and-subtract loop, leaving only 64 iterations to go. * 2) Iterate 64 times: @@ -201,18 +201,18 @@ static uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m){ * order to bring back newHi within the range [0, m). * 3) The modulo is in hi. 
*/ - + hi %= m; for(i=0;i<64;i++){ uint64_t newLo = (lo<<1); uint64_t newHi = (hi<<1) + (newLo m){newHi -= m;} - + hi = newHi; lo = newLo; } - + return hi; } #endif @@ -220,7 +220,7 @@ static uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m){ static uint64_t gaIPowMod (uint64_t x, uint64_t a, uint64_t m){ uint64_t r; - + /** * Special cases (order matters!): * - A modulo of 0 makes no sense and a modulo of 1 implies a return value @@ -230,13 +230,13 @@ static uint64_t gaIPowMod (uint64_t x, uint64_t a, uint64_t m){ * - An exponent of 1 requires a return value of x. * - An exponent of 2 can be handled by the modulo multiplication directly. */ - + if(m<=1){ return 0; } - + x %= m; - + if(a==0){ return 1; }else if(x<=1){ @@ -246,21 +246,21 @@ static uint64_t gaIPowMod (uint64_t x, uint64_t a, uint64_t m){ }else if(a==2){ return gaIMulMod(x,x,m); } - + /** * Otherwise, perform modular exponentiation by squaring. */ - + r = 1; while(a){ if(a&1){ r = gaIMulMod(r, x, m); } - + x = gaIMulMod(x, x, m); a >>= 1; } - + return r; } @@ -270,24 +270,24 @@ int gaIIsPrime (uint64_t n){ uint64_t r, d; const uint64_t WITNESSES[] = {2,3,5,7,11,13,17,19,23,29,31,37}; const int NUMWITNESSES = sizeof(WITNESSES)/sizeof(WITNESSES[0]); - - + + /** * Check if it is 2, the oddest prime. */ - + if(n==2){return 1;} - + /** * Check if it is an even integer. */ - + if((n&1) == 0){return 0;} - + /** * For small integers, read directly the answer in a table. */ - + if(n<256){ return "nnyynynynnnynynnnynynnnynnnnnyny" "nnnnnynnnynynnnynnnnnynnnnnynynn" @@ -298,20 +298,20 @@ int gaIIsPrime (uint64_t n){ "nynnnynynnnnnnnnnnnynnnnnnnnnnny" "nnnynynnnynnnnnynynnnnnnnnnynnnn"[n] == 'y'; } - + /** * Test small prime factors. */ - + hasNoSmallFactors = n%3 && n%5 && n%7 && n%11 && n%13; hasSmallFactors = !hasNoSmallFactors; if(hasSmallFactors){ return 0; } - + /** * Otherwise proceed to the Miller-Rabin test. 
- * + * * The Miller-Rabin test uses integer "witnesses" in an attempt at * proving the number composite. Should it fail to prove an integer * composite, it reports the number as "probably prime". However, if @@ -320,33 +320,33 @@ int gaIIsPrime (uint64_t n){ * 2 to 37 in order to ensure the correctness of the identifications for * integers under 2^64. */ - + r = gaICtz(n-1); d = (n-1)>>r; - + /* For each witness... */ for(i=0;i 0 && maxN < n)){ return 0; } - + /** * Handle special cases of n = 0,1,2. */ - + if(n<=2){ gaIFLInit(fl); gaIFLAddFactors(fl, n, 1); return 1; } - + /** * Magic-value arguments interpreted and canonicalized. */ - + if(maxN == (uint64_t)-1 || gaIClz(maxN) < gaIClz(n)){ /** * Either we are allowed unlimited growth of n, or the slack space @@ -404,71 +404,71 @@ int gaIFactorize (uint64_t n, uint64_t maxN, uint64_t k, ga_factor_list* fl * automatically satisfy the most stringent possible smoothness * constraint. */ - + return gaIFactorizeNextPow2(n, fl); }else if(maxN == 0){ /** * We are asked for a strict factoring. */ - + maxN = n; } - + if(k == 0 || k >= n){ /** * We want no k-smoothness constraint. */ - + k = n; } - - + + /** * Master loop. */ - + for(i=n; i <= maxN; i++){ /** * Do not manipulate the loop index! * Initial subfactor to cut down is x=i. */ - + x = i; gaIFLInit(fl); - + /** * Subfactorization always begins with an attempt at an initial * cut-down by factors of 2. Should this result in a 1 (which isn't * technically prime, but indicates a complete factorization), we * report success. */ - + subfactorize: gaIFLAddFactors(fl, 2, gaICtz(x)); x >>= gaICtz(x); f = 3; - + /** * Primality test. - * + * * If the remaining factor x is a prime number, it's decision time. One * of two things is true: - * + * * 1) We have a smoothness constraint k and x is <= than it, or we * don't have a smoothness constraint at all (k==n). Both cases are * covered by checking x<=k. 
- * + * * In this case we add x as the last factor to the factor list and * return affirmatively. - * + * * 2) We have a smoothness constraint and x>k. - * + * * In this case we have to increment x and begin anew the * sub-factorization. This may cause us to fail out of factorizing * the current i, by exceeding our slack limit. If this happens we * abort the factorization rooted at i and move to the next i. */ - + primetest: if(x==1 || gaIIsPrime(x)){ if(x<=k){ @@ -484,16 +484,16 @@ int gaIFactorize (uint64_t n, uint64_t maxN, uint64_t k, ga_factor_list* fl } } } - + /** * Composite number handler. - * + * * We continue by trying to cut down x by factors of 3+. Should a trial * division by a factor f succeed, all powers of f are factored out of * x and once f no longer divides x evenly, a new primality test is * run. The primality test will be invoked at most 15 times from this loop. */ - + for(;f<=k && f*f<=x && f<=0xFFFFFFFFU;f+=2){/* Overflow-safe f*f */ if(x%f == 0){ c = 0; @@ -501,17 +501,17 @@ int gaIFactorize (uint64_t n, uint64_t maxN, uint64_t k, ga_factor_list* fl x /= f; c++; }while(x%f == 0); - + gaIFLAddFactors(fl, f, c); - + goto primetest; } } - + /* Check before next iteration for 64-bit integer overflow. */ nextI: if(i == 0xFFFFFFFFFFFFFFFF){break;} } - + /* Failed to factorize. */ return 0; } @@ -525,10 +525,10 @@ static int gaIFactorizeNextPow2(uint64_t n, ga_factor_list* fl){ n |= n >> 16; n |= n >> 32; n++; - + gaIFLInit(fl); gaIFLAddFactors(fl, 2, gaICtz(n)); - + return 1; } @@ -542,37 +542,37 @@ int gaIFLFull(ga_factor_list* fl){ int gaIFLAddFactors(ga_factor_list* fl, uint64_t f, int p){ int i; - + /** * Fast case: We're adding 0 powers of f, or any powers of 1. The * value of the factor list (and the integer it represents) is thus * unchanged. */ - + if(p == 0 || f == 1){ return 1; } - + /** * Otherwise, the factor list has to change. We scan linearly the factor * list for either a pre-existing spot or an insertion spot. 
Scanning * linearly over a 15-element array is faster and less complex than binary * search. */ - + for(i=0;id;i++){ if(fl->f[i] == f){ /** * Factor is already in list. */ - + fl->p[i] += p; if(fl->p[i] == 0){ /** * We removed all factors f. Bump leftwards the remainder to * maintain sorted order. */ - + memmove(&fl->f[i], &fl->f[i+1], sizeof(fl->f[i])*(fl->d-i)); memmove(&fl->p[i], &fl->p[i+1], sizeof(fl->p[i])*(fl->d-i)); fl->d--; @@ -581,12 +581,12 @@ int gaIFLAddFactors(ga_factor_list* fl, uint64_t f, int p){ }else if(fl->f[i] > f){ /* Inject the factor at this place in order to keep list sorted, if we have the capacity. */ - + if(gaIFLFull(fl)){ /* We can't bump the list rightwards, it's full already! */ return 0; } - + memmove(&fl->f[i+1], &fl->f[i], sizeof(fl->f[i])*(fl->d-i)); memmove(&fl->p[i+1], &fl->p[i], sizeof(fl->p[i])*(fl->d-i)); fl->f[i] = f; @@ -595,15 +595,15 @@ int gaIFLAddFactors(ga_factor_list* fl, uint64_t f, int p){ return 1; } } - + /** * We looked at every factor in the list and f is strictly greater than * all of them. - * + * * If the list is full, we cannot insert f, but if it isn't, we can simply * tack it at the end. */ - + if(gaIFLFull(fl)){ return 0; }else{ @@ -616,26 +616,26 @@ int gaIFLAddFactors(ga_factor_list* fl, uint64_t f, int p){ int gaIFLGetFactorPower(ga_factor_list* fl, uint64_t f){ int i; - + for(i=0;id;i++){ if(fl->f[i] == f){ return fl->p[i]; } } - + return 0; } uint64_t gaIFLGetProduct(const ga_factor_list* fl){ uint64_t p = 1; int i, j; - + for(i=0;id;i++){ for(j=0;jp[i];j++){ p *= fl->f[i]; } } - + return p; } @@ -650,20 +650,20 @@ uint64_t gaIFLGetSmallestFactor(const ga_factor_list* fl){ static uint64_t gaIFLGetProductv(int n, const ga_factor_list* fl){ uint64_t p = 1; int i; - + for(i=0;i 0){ hasFactors = 1; @@ -674,16 +674,16 @@ static uint64_t gaIFLGetGreatestFactorv(int n, const ga_factor_list* fl, int* id } } } - + return hasFactors ? 
f : 1; } static uint64_t gaIFLGetSmallestFactorv(int n, const ga_factor_list* fl, int* idx){ uint64_t f = -1, currF; int i, hasFactors=0; - + if(idx){*idx = 0;} - + for(i=0;i 0){ hasFactors = 1; @@ -694,48 +694,26 @@ static uint64_t gaIFLGetSmallestFactorv(int n, const ga_factor_list* fl, int* id } } } - + return hasFactors ? f : 1; } -int gaIFLsnprintf(char* str, size_t size, const ga_factor_list* fl){ - int i, j; - - int total = 0; - size_t left = size; - char* ptr = size ? str : NULL; - - /* Loop over all factors and spit them out. */ - for(i=0;id;i++){ - for(j=0;jp[i];j++){ - total += snprintf(ptr, left, "%llu*", (unsigned long long)fl->f[i]); - if(ptr){ - left -= strlen(ptr); - ptr += strlen(ptr); - } - } - } - - /* If no factors were printed, print 1. */ - if(total == 0){ - total += snprintf(ptr, left, "1*"); - if(ptr){ - left -= strlen(ptr); - ptr += strlen(ptr); - } - } - - /* Terminate buffer ('*' -> '\0') and deduct one character. */ - total--; - if(str && size > 0){ - if(total >= size){ - str[size-1] = '\0'; - }else{ - str[total] = '\0'; - } - } - - return total; +void gaIFLappend(strb *sb, const ga_factor_list* fl){ + int i, j, printed = 0; + /* Loop over all factors and spit them out, counting each one printed. */ + for (i = 0; i < fl->d; i++) { + for (j = 0; j < fl->p[i]; j++, printed++) { + strb_appendf(sb, "%llu*", (unsigned long long)fl->f[i]); + } + } + + /* If no factors were printed, print 1. */ + if (printed == 0) { + strb_appendf(sb, "1*"); + } + + /* Deduct final '*'. */ + sb->l -= 1; } void gaIFLSchedule(const int n, @@ -749,38 +727,38 @@ void gaIFLSchedule(const int n, /** * If we have zero dimensions, the scheduling job is easy. */ - + if(n<=0){return;} - + /** * First, we move factors from factBS[i] and factGS[i] to factCS[i], in * order of largest to smallest, until their product is at or below * maxBind[i] and maxGind[i] respectively. 
*/ - + gaIFLScheduleSatisfyInd(n, factBS, factCS, maxBind); gaIFLScheduleSatisfyInd(n, factGS, factCS, maxGind); - + /** * Then we move out more factors from factBS[i] and factGS[i], in order of * smallest to largest, until their common product is at or below maxBtot * and maxGtot respectively. */ - + gaIFLScheduleSatisfyTot(n, factBS, factCS, maxBtot); gaIFLScheduleSatisfyTot(n, factGS, factCS, maxGtot); - + /** * At this point, the scheduling is guaranteed to be valid, but may be * nowhere close to optimal. - * + * * So we start moving in factors from factCS[i] to factBS[i], in order of * largest to smallest, while remaining below maxBtot and maxBind[i]. - * + * * Lastly, we move in factors from factCS[i] to factBG[i], in order of * largest to smallest, while remaining below maxGtot and maxGind[i]. */ - + gaIFLScheduleOpt(n, factCS, factBS, maxBtot, maxBind); gaIFLScheduleOpt(n, factCS, factGS, maxGtot, maxGind); } @@ -791,7 +769,7 @@ static void gaIFLScheduleSatisfyInd(const int n, const uint64_t* maxInd){ int i; uint64_t f, p; - + for(i=0;i maxTot){ f = gaIFLGetSmallestFactorv(n, from, &a); c = gaIFLGetFactorPower (from+a, f); - + for(i=c-1;i>=0 && p>maxTot;i--){ p /= f; gaIFLAddFactors(from+a, f, -1); @@ -840,14 +818,14 @@ static void gaIFLScheduleOpt(const int n, #else uint64_t pInd[n]; #endif - + /* Muzzle compiler about a random function being unused. */ (void)gaIFLGetGreatestFactorv; - + /** * Check whether optimization is possible. 
*/ - + for(i=0;i=0;j--){ currF = from[i].f[j]; - + if(currF <= maxFTot && currF <= maxFInd && currF >= f){ f = currF; k = i; @@ -884,18 +862,18 @@ static void gaIFLScheduleOpt(const int n, } } } - + if(k == -1){ break; } - + gaIFLAddFactors(from+k, f, -1); gaIFLAddFactors(to +k, f, +1); pInd[k] *= f; pTot *= f; maxFTot = maxTot/pTot; }while(maxFTot>1 && f>1); - + #if GA_USING_MALLOC_FOR_VLA free(pInd); #endif diff --git a/src/util/integerfactoring.h b/src/util/integerfactoring.h index c6c3d6cd04..7d2339c6bb 100644 --- a/src/util/integerfactoring.h +++ b/src/util/integerfactoring.h @@ -7,6 +7,8 @@ #include #include +#include "util/strb.h" + /* Defines */ @@ -29,16 +31,16 @@ typedef struct ga_factor_list_ ga_factor_list; /** * @brief The GA_FACTOR_LIST struct. - * + * * Contains the list of distinct prime factors of a 64-bit unsigned integer, as * well as the powers of those factors. - * + * * There can be at most 15 such distinct factors, since the product of the * first 16 primes (2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53) exceeds * the maximum unsigned number of 2^64-1. Moreover, there can be at most 63 * factors all together, since 2^64 exceeds 2^64-1, so only an 8-bit number is * required to store the powers. - * + * * The 15th (last) element of the factor list is always 0 and has power 0, * and serves as a sort of sentinel. */ @@ -55,10 +57,10 @@ struct ga_factor_list_{ /** * @brief Checks whether an integer is prime. - * + * * @param [in] n The integer whose primality is to be checked. * @return 1 if prime; 0 if not prime. - * + * * NB: This is *not* a probabilistic primality checker. For all integers it can * be given as input, it will correctly report "prime" or "composite". * NB: Internally, this function uses the Miller-Rabin test, which *is* @@ -67,9 +69,9 @@ struct ga_factor_list_{ * Miller-Rabin "witnesses", which ensures that there are no strong * probable primes equal to or below 2^64-1 (the size of the input * argument). 
This set of witnesses is - * + * * $$a = 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, and 37$$ - * + * * See https://oeis.org/A014233 */ @@ -78,28 +80,28 @@ int gaIIsPrime(uint64_t n); /** * @brief Factorize a positive integer into a list of factors satisfying * certain properties. - * + * * The function factorizes a 64-bit, positive integer into a list of factors. * This factorization can be made "approximate"; That is, the product of the * factors returned can be slightly greater than the input number. The * maximum increase is controlled by a "slack" parameter maxN, as follows: - * + * * $$\texttt{n} \le \prod(\mathrm{fact}(\texttt{n}) \le \texttt{maxN}$$ - * + * * The advantage of offering some slack to the factorizer is that in return, * the factorizer may succeed in outputting a factorization with smaller * factors. The maxN slack parameter must be 0 or be greater than or equal to * n, but it is useless to set it beyond twice the value of n. - * + * * When maxN is equal to -1 (2^64 - 1), or is greater than or equal to 2n, * there is a guarantee that there exists a power of two that lies between n * and 2n. Since this factorization involves only powers of the smallest prime * (2), it is a valid factorization under any valid k-smoothness constraint, * and so will be returned. - * + * * When maxN is equal to 0 or n (no increase in value allowed), this implies * that an exact factoring is requested. - * + * * The factorization can also be constrained by a (k)-smoothness constraint. * A k-smooth number n has no prime factors greater than k. If the factorizer * is asked to factor with k-smoothness a number with prime factors greater @@ -107,11 +109,11 @@ int gaIIsPrime(uint64_t n); * number that is k-smooth and return that number's factoring. With maxN == n * and a k-smoothness constraint, this function reports whether or not the * number is k-smooth. 
- * + * * When k is equal to 0, equal to -1 (2^64 - 1), or is greater than or equal * to n, no k-smoothness constraints are imposed. An exact factoring is * required. - * + * * @param [in] n The integer to be factorized. Must be >0. * @param [in] maxN The "slack" parameter. The factor list returned will * not have a product that exceeds this number. @@ -131,7 +133,7 @@ int gaIFactorize(uint64_t n, uint64_t maxN, uint64_t k, ga_factor_list* fl) /** * @brief Initialize a factors list to all-factors- and all-powers-zero. - * + * * Such a factors list represents 1, since 0^0 = 1. */ @@ -140,7 +142,7 @@ void gaIFLInit(ga_factor_list* fl); /** * @brief Reports whether another *distinct* factor can be added to the factor * list safely. - * + * * @return Returns zero if there are less than 15 distinct factors in the list * and non-zero otherwise. */ @@ -149,13 +151,13 @@ int gaIFLFull(ga_factor_list* fl); /** * @brief Add a factor f with power p to the factor list. - * + * * If factor f was already present in the factor list, increments * the corresponding power by p. Otherwise, adds the new factor f to * the list, if there is still space, and sets the power to p. - * + * * Maintains factor list in sorted order. - * + * * @return Non-zero if factor successfully added; Zero otherwise. */ @@ -163,7 +165,7 @@ int gaIFLAddFactors(ga_factor_list* fl, uint64_t f, int p); /** * @brief Get the power of a given factor within a factor list. - * + * * @return The number of times a factor occurs within the current * factorization. If it does not occur, return 0. */ @@ -189,25 +191,20 @@ uint64_t gaIFLGetGreatestFactor(const ga_factor_list* fl); uint64_t gaIFLGetSmallestFactor(const ga_factor_list* fl); /** - * @brief Print out the factor list in a human-readable form, snprintf()-style. - * - * @param [out] str A string into which to print out the factor list. If the - * factor list is a result of gaIFactorize(), then the - * maximum length of buffer required is 128 bytes. 
- * If str is NULL, nothing is printed. - * @param [in] size The maximum number of bytes written, including the - * terminating NUL (\0) character. - * @param [in] fl The factor list to be printed. - * @return The number of characters that would have been printed - * out, assuming an unbounded, non-NULL buffer. + * @brief Print out the factor list in a human-readable form. + * + * @param [out] sb A string into which to print out the factor list. If the + * factor list is a result of gaIFactorize(), then the + * maximum length of buffer required is 128 bytes. + * @param [in] fl The factor list to be printed. */ -int gaIFLsnprintf(char* str, size_t size, const ga_factor_list* fl); +void gaIFLsnprintf(strb *sb, const ga_factor_list* fl); /** * @brief Schedule block size, grid size and what's left over that fits in * neither, which will be called "chunk" size, subject to constraints. - * + * * @param [in] n Number of dimensions of the problem. The arrays * maxBind, maxGind, factBS, factGS, factCS must have * n elements. From ecb53335b6605b88d6f3b354eca1ce65798a15e1 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Tue, 11 Oct 2016 04:57:29 -0400 Subject: [PATCH 043/597] Numerous improvements to integer factorizer. - Average-case factorization and margin massively improved. The factorizer now has optimal 2-, 3- and 5-smooth factorizers and will always attempt to use them. It will unconditionally try the optimal 5- smooth factorizer first, even when k>5 and/or slack is unlimited, usually resulting in a factorization with a product <1% greater than the original number. This compares with the just-under-100% growth for a number of the form 2^m+1 with 100% slack. - Permit a prime x during factorization to attempt decreasing its value first, then incrementing it. - New testcases and extra checks in factorization tests. - New testcases for primality tester. 
--- src/util/integerfactoring.c | 314 +++++++++++++++++++++++++--- src/util/integerfactoring.h | 31 ++- tests/check_util_integerfactoring.c | 196 ++++++++++++++++- 3 files changed, 495 insertions(+), 46 deletions(-) diff --git a/src/util/integerfactoring.c b/src/util/integerfactoring.c index ff4cd1d9a0..340ec25ea3 100644 --- a/src/util/integerfactoring.c +++ b/src/util/integerfactoring.c @@ -61,10 +61,13 @@ static uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m); static uint64_t gaIPowMod (uint64_t x, uint64_t a, uint64_t m); /** - * @brief Round up positive n to next power-of-2 and report its factorization. + * @brief Round up positive n to next 2-, 3- or 5-smooth number and report its + * factorization. */ -static int gaIFactorizeNextPow2(uint64_t n, ga_factor_list* fl); +static int gaIFactorize2Smooth(uint64_t n, ga_factor_list* fl); +static int gaIFactorize3Smooth(uint64_t n, ga_factor_list* fl); +static int gaIFactorize5Smooth(uint64_t n, ga_factor_list* fl); /** * @brief Satisfy individual product limits on "from" by moving factors to @@ -371,7 +374,10 @@ int gaIIsPrime (uint64_t n){ } int gaIFactorize (uint64_t n, uint64_t maxN, uint64_t k, ga_factor_list* fl){ - uint64_t i, x, p, f, c; + int infiniteSlack, finiteSlack, greaterThanMaxN, + exactFactoring, noKSmoothness, kSmoothness; + uint64_t i, x, newX, p, f, c; + /** * Insane argument handling. @@ -381,6 +387,7 @@ int gaIFactorize (uint64_t n, uint64_t maxN, uint64_t k, ga_factor_list* fl return 0; } + /** * Handle special cases of n = 0,1,2. */ @@ -391,40 +398,61 @@ int gaIFactorize (uint64_t n, uint64_t maxN, uint64_t k, ga_factor_list* fl return 1; } + /** * Magic-value arguments interpreted and canonicalized. */ - if(maxN == (uint64_t)-1 || gaIClz(maxN) < gaIClz(n)){ - /** - * Either we are allowed unlimited growth of n, or the slack space - * [n, maxN] is big enough to contain a power of 2. 
We identify, round - * up to and factorize the next higher power of 2 greater than or equal - * to n trivially. Since powers of 2 are by definition 2-smooth, we - * automatically satisfy the most stringent possible smoothness - * constraint. - */ + exactFactoring = (maxN == 0); + infiniteSlack = (maxN == -1); + noKSmoothness = (k == 0) || (k >= n); + finiteSlack = !infiniteSlack; + kSmoothness = !noKSmoothness; + maxN = exactFactoring ? n : maxN; + k = noKSmoothness ? n : k; - return gaIFactorizeNextPow2(n, fl); - }else if(maxN == 0){ - /** - * We are asked for a strict factoring. - */ - maxN = n; - } + /** + * Try optimal k-smooth optimizers. + */ + + if (k <= 2){gaIFactorize2Smooth(n, fl);} + else if(k <= 4){gaIFactorize3Smooth(n, fl);} + else {gaIFactorize5Smooth(n, fl);} + greaterThanMaxN = finiteSlack && (gaIFLIsOverflowed(fl) || + gaIFLGetProduct (fl) > maxN); + if(greaterThanMaxN){ + if(kSmoothness && k<=6){ + /** + * We've *proven* there exists no k-smooth n <= maxN, k <= 6. + * No use wasting more time here. + */ + + return 0; + } - if(k == 0 || k >= n){ + /* Otherwise fall-through to factorizer. */ + }else{ /** - * We want no k-smoothness constraint. + * Either the slack was infinite, or the product did not overflow and + * was <= maxN. The k-smoothness criterion is guaranteed by the + * factorizer we chose earlier. + * + * Therefore we have a satisfactory, optimal 2-, 3- or 5-smooth + * factorization (although not necessarily an exact one unless it is + * the case that maxN == n). We return it. */ - k = n; + return 1; } /** * Master loop. + * + * We arrive here with finite slack and all optimal 2-, 3- and 5-smooth + * factorizers unable to produce a factorization whose product is less + * than or equal to maxN. */ for(i=n; i <= maxN; i++){ @@ -463,7 +491,7 @@ int gaIFactorize (uint64_t n, uint64_t maxN, uint64_t k, ga_factor_list* fl * * 2) We have a smoothness constraint and x>k. 
* - * In this case we have to increment x and begin anew the + * In this case we have to inc/decrement x and begin anew the * sub-factorization. This may cause us to fail out of factorizing * the current i, by exceeding our slack limit. If this happens we * abort the factorization rooted at i and move to the next i. @@ -471,12 +499,17 @@ int gaIFactorize (uint64_t n, uint64_t maxN, uint64_t k, ga_factor_list* fl primetest: if(x==1 || gaIIsPrime(x)){ - if(x<=k){ + if(x <= k){ gaIFLAddFactors(fl, x, 1); return 1; }else{ - p = gaIFLGetProduct(fl); - if((maxN - p*x) < p){/* Overflow-free check maxN >= p*(x+1) */ + p = gaIFLGetProduct(fl); + newX = n/p; + newX += newX*p < n; + if(newX < x){ + x = newX; + goto subfactorize; + }else if((maxN - p*x) < p){/* Overflow-free check maxN >= p*(x+1) */ goto nextI; }else{ x++; @@ -516,7 +549,7 @@ int gaIFactorize (uint64_t n, uint64_t maxN, uint64_t k, ga_factor_list* fl return 0; } -static int gaIFactorizeNextPow2(uint64_t n, ga_factor_list* fl){ +static int gaIFactorize2Smooth(uint64_t n, ga_factor_list* fl){ n--; n |= n >> 1; n |= n >> 2; @@ -532,11 +565,211 @@ static int gaIFactorizeNextPow2(uint64_t n, ga_factor_list* fl){ return 1; } +static int gaIFactorize3Smooth(uint64_t n, ga_factor_list* fl){ + uint64_t nBest=-1, i3Best=0, i3, p3, nCurr; + int nlz = gaIClz(n), isBest2to64 = 1; + + /** + * Iterate over all powers of 3, scaling them by the least power-of-2 such + * that the result is greater than or equal to n. Report the smallest nBest + * so obtained. + */ + + for(i3=0, p3=1;i3<=40;i3++, p3*=3){ + nCurr = p3; + + /** + * If the current power of 3 is >= n, then this must be the last + * iteration, but perhaps a pure power of 3 is the best choice, so + * check for this. 
+ */ + + if(nCurr >= n){ + if(isBest2to64 || nBest >= nCurr){ + isBest2to64 = 0; + nBest = nCurr; + i3Best = i3; + } + break; + } + + /** + * Otherwise we have a pure power of 3, p3, less than n, and must + * derive the least power of 2 such that p3 multiplied by that power of + * 2 is greater than or equal to n. We then compute the product of + * both. + */ + + nCurr <<= gaIClz(nCurr) - nlz; + if(nCurr= n. But is it the best factorization + * so far? + */ + + if(isBest2to64 || nBest >= nCurr){ + isBest2to64 = 0; + nBest = nCurr; + i3Best = i3; + + if(nCurr == n){ + break; + } + } + } + + + /** + * Return the smallest n found above. + * + * nBest and i3Best must be set. + */ + + gaIFLInit(fl); + if(isBest2to64){ + gaIFLAddFactors(fl, 2, 64); + }else{ + gaIFLAddFactors(fl, 2, gaICtz(nBest)); + gaIFLAddFactors(fl, 3, i3Best); + } + return 1; +} + +static int gaIFactorize5Smooth(uint64_t n, ga_factor_list* fl){ + uint64_t nBest=-1, i3Best=0, i3, p3, i5Best=0, i5, p5, nCurr; + int nlz = gaIClz(n), isBest2to64 = 1; + + /** + * Iterate over all products of powers of 5 and 3, scaling them by the + * least power-of-2 such that the result is greater than or equal to n. + * Report the smallest nBest so obtained. + */ + + for(i5=0, p5=1;i5<=27;i5++, p5*=5){ + nCurr = p5; + + /** + * If the current power of 5 is >= n, then this must be the last + * iteration, but perhaps a pure power of 5 is the best choice, so + * check for this. + */ + + if(nCurr >= n){ + if(isBest2to64 || nBest >= nCurr){ + isBest2to64 = 0; + nBest = nCurr; + i3Best = 0; + i5Best = i5; + } + break; + } + + for(i3=0, p3=1;i3<=40;i3++, p3*=3){ + nCurr = p3*p5; + + /** + * If the current product of powers of 3 and 5 is >= n, then this + * must be the last iteration, but perhaps a pure power of 3 is the + * best choice, so check for this. 
+ */ + + if(nCurr >= n){ + if(isBest2to64 || nBest >= nCurr){ + isBest2to64 = 0; + nBest = nCurr; + i3Best = i3; + i5Best = i5; + } + break; + } + + /** + * Otherwise we have a number nCurr, composed purely of factors 3 + * and 5, that is less than n. We must derive the least power of 2 + * such that nCurr multiplied by that power of 2 is greater than or + * equal to n. We then compute the product of both. + */ + + nCurr <<= gaIClz(nCurr) - nlz; + if(nCurr= n. But is it the best factorization + * so far? + */ + + if(isBest2to64 || nBest >= nCurr){ + isBest2to64 = 0; + nBest = nCurr; + i3Best = i3; + i5Best = i5; + + if(nCurr == n){ + goto exit; + } + } + } + } + + + /** + * Return the smallest n found above. + * + * nBest and i3Best must be set. + */ + + exit: + gaIFLInit(fl); + if(isBest2to64){ + gaIFLAddFactors(fl, 2, 64); + }else{ + gaIFLAddFactors(fl, 2, gaICtz(nBest)); + gaIFLAddFactors(fl, 3, i3Best); + gaIFLAddFactors(fl, 5, i5Best); + } + return 1; +} + void gaIFLInit(ga_factor_list* fl){ memset(fl, 0, sizeof(*fl)); } -int gaIFLFull(ga_factor_list* fl){ +int gaIFLFull(const ga_factor_list* fl){ return fl->d >= 15;/* Strictly speaking, fl->d never exceeds 15. */ } @@ -614,7 +847,7 @@ int gaIFLAddFactors(ga_factor_list* fl, uint64_t f, int p){ } } -int gaIFLGetFactorPower(ga_factor_list* fl, uint64_t f){ +int gaIFLGetFactorPower(const ga_factor_list* fl, uint64_t f){ int i; for(i=0;id;i++){ @@ -639,6 +872,29 @@ uint64_t gaIFLGetProduct(const ga_factor_list* fl){ return p; } +int gaIFLIsOverflowed(const ga_factor_list* fl){ + uint64_t p = 1, MAX=-1; + int i, j; + + if(gaIFLGetFactorPower(fl, 0) >= 1){ + return 0; + } + if(gaIFLGetFactorPower(fl, 2) >= 64){ + return 1; + } + + for(i=0;id;i++){ + for(j=0;jp[i];j++){ + if(MAX/p < fl->f[i]){ + return 1; + } + p *= fl->f[i]; + } + } + + return 0; +} + uint64_t gaIFLGetGreatestFactor(const ga_factor_list* fl){ return fl->d ? 
fl->f[fl->d-1] : 1; } diff --git a/src/util/integerfactoring.h b/src/util/integerfactoring.h index 7d2339c6bb..8d5f3dce38 100644 --- a/src/util/integerfactoring.h +++ b/src/util/integerfactoring.h @@ -91,16 +91,18 @@ int gaIIsPrime(uint64_t n); * The advantage of offering some slack to the factorizer is that in return, * the factorizer may succeed in outputting a factorization with smaller * factors. The maxN slack parameter must be 0 or be greater than or equal to - * n, but it is useless to set it beyond twice the value of n. + * n, but it is completely useless to set it beyond 2n. * - * When maxN is equal to -1 (2^64 - 1), or is greater than or equal to 2n, - * there is a guarantee that there exists a power of two that lies between n - * and 2n. Since this factorization involves only powers of the smallest prime - * (2), it is a valid factorization under any valid k-smoothness constraint, - * and so will be returned. + * When maxN is equal to -1 (2^64 - 1), or is greater than or equal to 2n, no + * upper limit is placed on the output factor list's product, but this + * implementation guarantees its product will not exceed 2n. This is because + * there always exists a power of two that lies between n and 2n, and since + * this factorization involves only powers of the smallest prime (2), it is a + * valid factorization under any valid k-smoothness constraint, and so may be + * returned. * - * When maxN is equal to 0 or n (no increase in value allowed), this implies - * that an exact factoring is requested. + * When maxN is equal to 0 (no increase in value allowed), an exact factoring + * is requested. * * The factorization can also be constrained by a (k)-smoothness constraint. * A k-smooth number n has no prime factors greater than k. If the factorizer @@ -147,7 +149,7 @@ void gaIFLInit(ga_factor_list* fl); * and non-zero otherwise. 
*/ -int gaIFLFull(ga_factor_list* fl); +int gaIFLFull(const ga_factor_list* fl); /** * @brief Add a factor f with power p to the factor list. @@ -170,14 +172,23 @@ int gaIFLAddFactors(ga_factor_list* fl, uint64_t f, int p); * factorization. If it does not occur, return 0. */ -int gaIFLGetFactorPower(ga_factor_list* fl, uint64_t f); +int gaIFLGetFactorPower(const ga_factor_list* fl, uint64_t f); /** * @brief Compute the product of the factors stored in the factors list. + * + * NB: This function may return an overflowed result. To detect if it will, + * please call gaIFLIsOverflowed(fl). */ uint64_t gaIFLGetProduct(const ga_factor_list* fl); +/** + * @brief Check whether the factor list produces a number >= 2^64. + */ + +int gaIFLIsOverflowed(const ga_factor_list* fl); + /** * @brief Get the greatest factor in the factors list. */ diff --git a/tests/check_util_integerfactoring.c b/tests/check_util_integerfactoring.c index 5205107081..7aa5afa966 100644 --- a/tests/check_util_integerfactoring.c +++ b/tests/check_util_integerfactoring.c @@ -8,26 +8,118 @@ #include "util/integerfactoring.h" +/** + * Primality Checker + */ + +START_TEST(test_primalitychecker){ + /* Tiny numbers */ + ck_assert(!gaIIsPrime( 0ULL)); + ck_assert(!gaIIsPrime( 1ULL)); + ck_assert( gaIIsPrime( 2ULL)); + ck_assert( gaIIsPrime( 3ULL)); + ck_assert(!gaIIsPrime( 4ULL)); + ck_assert( gaIIsPrime( 5ULL)); + ck_assert(!gaIIsPrime( 6ULL)); + ck_assert( gaIIsPrime( 7ULL)); + ck_assert(!gaIIsPrime( 8ULL)); + ck_assert(!gaIIsPrime( 9ULL)); + ck_assert(!gaIIsPrime( 10ULL)); + ck_assert( gaIIsPrime( 11ULL)); + ck_assert(!gaIIsPrime( 12ULL)); + ck_assert( gaIIsPrime( 13ULL)); + ck_assert(!gaIIsPrime( 14ULL)); + ck_assert(!gaIIsPrime( 15ULL)); + ck_assert(!gaIIsPrime( 16ULL)); + ck_assert( gaIIsPrime( 17ULL)); + ck_assert(!gaIIsPrime( 18ULL)); + ck_assert( gaIIsPrime( 19ULL)); + ck_assert(!gaIIsPrime( 20ULL)); + /* Small primes */ + ck_assert( gaIIsPrime( 4987ULL)); + ck_assert( gaIIsPrime( 4993ULL)); + 
ck_assert( gaIIsPrime( 4999ULL)); + /* Squares of primes */ + ck_assert(!gaIIsPrime( 24870169ULL)); + ck_assert(!gaIIsPrime( 24930049ULL)); + ck_assert(!gaIIsPrime( 24990001ULL)); + /* Catalan pseudoprimes */ + ck_assert(!gaIIsPrime( 5907ULL)); + ck_assert(!gaIIsPrime( 1194649ULL)); + ck_assert(!gaIIsPrime( 12327121ULL)); + /* Fermat base-2 pseudoprimes */ + ck_assert(!gaIIsPrime( 341ULL)); + ck_assert(!gaIIsPrime( 561ULL)); + ck_assert(!gaIIsPrime( 645ULL)); + ck_assert(!gaIIsPrime( 1105ULL)); + ck_assert(!gaIIsPrime( 1387ULL)); + ck_assert(!gaIIsPrime( 1729ULL)); + ck_assert(!gaIIsPrime( 1905ULL)); + ck_assert(!gaIIsPrime( 2047ULL)); + ck_assert(!gaIIsPrime( 2465ULL)); + /* Strong Lucas pseudoprimes */ + ck_assert(!gaIIsPrime( 5459ULL)); + ck_assert(!gaIIsPrime( 5459ULL)); + ck_assert(!gaIIsPrime( 5459ULL)); + ck_assert(!gaIIsPrime( 5777ULL)); + ck_assert(!gaIIsPrime( 10877ULL)); + ck_assert(!gaIIsPrime( 16109ULL)); + ck_assert(!gaIIsPrime( 18971ULL)); + ck_assert(!gaIIsPrime( 22499ULL)); + ck_assert(!gaIIsPrime( 24569ULL)); + ck_assert(!gaIIsPrime( 25199ULL)); + ck_assert(!gaIIsPrime( 40309ULL)); + ck_assert(!gaIIsPrime( 58519ULL)); + ck_assert(!gaIIsPrime( 75077ULL)); + ck_assert(!gaIIsPrime( 97439ULL)); + ck_assert(!gaIIsPrime( 100127ULL)); + ck_assert(!gaIIsPrime( 113573ULL)); + ck_assert(!gaIIsPrime( 115639ULL)); + ck_assert(!gaIIsPrime( 130139ULL)); + /* Medium, prime. 
*/ + ck_assert( gaIIsPrime( 2100000011ULL)); + ck_assert( gaIIsPrime( 2100000017ULL)); + /* Large, non-smooth, composite */ + ck_assert(!gaIIsPrime( 2196095973992233039ULL)); + /* Largest prime < 2**64: */ + ck_assert( gaIIsPrime(18446744073709551557ULL)); + /* Largest integers */ + ck_assert(!gaIIsPrime(18446744073709551613ULL)); + ck_assert(!gaIIsPrime(18446744073709551614ULL)); + ck_assert(!gaIIsPrime(18446744073709551615ULL)); +}END_TEST + /** * Integer Factorization test */ START_TEST(test_integerfactorization){ ga_factor_list fl; + uint64_t n; /** * Attempt exact factorization for 2^64-1, no k-smoothness constraint. * Expected PASS with 3*5*17*257*641*65537*6700417 */ - ck_assert_int_ne(gaIFactorize(18446744073709551615ULL, 0, 0, &fl), 0); + n = 18446744073709551615ULL; + ck_assert_int_ne (gaIFactorize(n, 0, 0, &fl), 0); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 3ULL), 1); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 5ULL), 1); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 17ULL), 1); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 257ULL), 1); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 641ULL), 1); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 65537ULL), 1); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 6700417ULL), 1); + ck_assert_uint_eq(gaIFLGetProduct(&fl), n); /** * Attempt exact factorization for 2^64-1, 4096-smooth constraint. * Expected FAIL, because 2^64-1 possesses prime factors in excess of 4096. */ - ck_assert_int_eq(gaIFactorize(18446744073709551615ULL, 0, 4096, &fl), 0); + n = 18446744073709551615ULL; + ck_assert_int_eq (gaIFactorize(n, 0, 4096, &fl), 0); /** * Attempt approximate factorization for 2^64-1, no k-smoothness constraint. @@ -35,7 +127,11 @@ START_TEST(test_integerfactorization){ * Expected PASS, since 2^64-1 rounds up to 2^64 and 2^64 trivially factorizes. 
*/ - ck_assert_int_ne(gaIFactorize(18446744073709551615ULL, -1, 0, &fl), 0); + n = 18446744073709551615ULL; + ck_assert_int_ne (gaIFactorize(n, -1, 0, &fl), 0); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 2ULL), 64); + ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 2); + ck_assert_int_ne (gaIFLIsOverflowed(&fl), 0); /** * Attempt exact factorization for 2196095973992233039, no k-smoothness constraint. @@ -44,18 +140,101 @@ START_TEST(test_integerfactorization){ * Expected PASS *very quickly*, since it factorizes as 1299817*1299821*1299827 */ - ck_assert_int_ne(gaIFactorize( 2196095973992233039ULL, 0, 0, &fl), 0); + n = 2196095973992233039ULL; + ck_assert_int_ne (gaIFactorize(n, 0, 0, &fl), 0); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 1299817ULL), 1); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 1299821ULL), 1); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 1299827ULL), 1); + ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 1299827); + ck_assert_uint_eq(gaIFLGetProduct(&fl), n); /** - * Attempt approximate factorization for 2196095973992233039, 64-smooth constraint. + * Attempt approximate factorization for 2196095973992233039, 16-smooth constraint. * 2196095973992233039 is a large, highly non-smooth number, with three enormous * factors. It is not 64-smooth, so code paths that attempt approximate - * factorization within the growth limits (1%) are exercised. + * factorization within the growth limits (.005%) are exercised. * * Expected PASS *relatively quickly*. */ - ck_assert_int_ne(gaIFactorize( 2196095973992233039ULL, 2196095973992233039ULL*1.01, 64, &fl), 0); + n = 2196095973992233039ULL; + ck_assert_int_ne (gaIFactorize(n, n*1.00005, 16, &fl), 0); + ck_assert_uint_ge(gaIFLGetProduct(&fl), n); + ck_assert_uint_le(gaIFLGetProduct(&fl), n*1.00005); + + /** + * Attempt exact factorization of 7438473388800000000, 5-smooth constraint. + * It is a large, 5-smooth number. This should exercise the 5-smooth + * factorization path. 
+ */ + + n = 7438473388800000000ULL; + ck_assert_int_ne (gaIFactorize(n, 0, 5, &fl), 0); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 2ULL), 14); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 3ULL), 19); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 5ULL), 8); + ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 5); + ck_assert_uint_eq(gaIFLGetProduct(&fl), n); + + /** + * Attempt approximate factorization of 7438473388799999997, 2-smooth constraint. + * It is a large, non-smooth number. This should exercise the optimal 2-smooth + * factorizer in spite of the available, unlimited slack. + */ + + n = 7438473388799999997ULL; + ck_assert_int_ne (gaIFactorize(n, -1, 2, &fl), 0); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 2ULL), 63); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 3ULL), 0); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 5ULL), 0); + ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 2); + ck_assert_uint_eq(gaIFLGetProduct(&fl), 9223372036854775808ULL); + + /** + * Attempt approximate factorization of 7438473388799999997, 3-smooth constraint. + * It is a large, non-smooth number. This should exercise the optimal 3-smooth + * factorizer in spite of the available, unlimited slack. + */ + + n = 7438473388799999997ULL; + ck_assert_int_ne (gaIFactorize(n, -1, 3, &fl), 0); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 2ULL), 31); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 3ULL), 20); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 5ULL), 0); + ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 3); + ck_assert_uint_eq(gaIFLGetProduct(&fl), 7487812485248974848ULL); + + /** + * Attempt approximate factorization of 7438473388799999997, 5-smooth constraint. + * It is a large, non-smooth number, but 3 integers above it is a 5-smooth + * integer, 7438473388800000000. This should exercise the optimal 5-smooth + * factorizer in spite of the available, unlimited slack. 
+ */ + + n = 7438473388799999997ULL; + ck_assert_int_ne (gaIFactorize(n, -1, 5, &fl), 0); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 2ULL), 14); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 3ULL), 19); + ck_assert_int_eq (gaIFLGetFactorPower(&fl, 5ULL), 8); + ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 5); + ck_assert_uint_eq(gaIFLGetProduct(&fl), 7438473388800000000ULL); + + /** + * Toughest challenge: Attempt very tight approximate factorization of + * 9876543210987654321 with .01% slack and 43-smooth constraint. + * + * This forces a bypass of the optimal 5-smooth factorizers and heavily + * exercises the nextI:, subfactorize:, primetest: and newX jumps and + * calculations. + * + * Expected PASS, "reasonably fast". + */ + + n = 9876543210987654321ULL; + ck_assert_int_ne (gaIFactorize(n, n*1.0001, 43, &fl), 0); + ck_assert_uint_ge(gaIFLGetProduct(&fl), n); + ck_assert_uint_le(gaIFLGetProduct(&fl), n*1.0001); + ck_assert_uint_le(gaIFLGetGreatestFactor(&fl), 43); }END_TEST START_TEST(test_scheduler){ @@ -278,6 +457,9 @@ Suite *get_suite(void){ Suite *s = suite_create("util_integerfactoring"); TCase *tc = tcase_create("All"); + tcase_set_timeout(tc, 10.0); + + tcase_add_test(tc, test_primalitychecker); tcase_add_test(tc, test_integerfactorization); tcase_add_test(tc, test_scheduler); From a2eebbf4473ec238feaf6539b44623e1b31d33de Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Thu, 13 Oct 2016 05:25:44 -0400 Subject: [PATCH 044/597] Switched to BPSW primality checker + Bugfixes. - The Miller-Rabin primality checker using witnesses 2,3,5,7,11,...,37 has been replaced with the >2x faster (on average) BPSW primality checker, which uses Miller-Rabin with base 2 and a Lucas test. This considerably decreases the worst-case time bound. - An error in the inline assembly constraints for x86-64 has been fixed that led to a miscompilation in Release mode. - Added a large, non-smooth, strong base-2 Fermat pseudoprime. 
--- src/util/integerfactoring.c | 565 ++++++++++++++++++++++------ src/util/integerfactoring.h | 16 +- tests/check_util_integerfactoring.c | 1 + 3 files changed, 467 insertions(+), 115 deletions(-) diff --git a/src/util/integerfactoring.c b/src/util/integerfactoring.c index 340ec25ea3..b453da463f 100644 --- a/src/util/integerfactoring.c +++ b/src/util/integerfactoring.c @@ -12,6 +12,10 @@ #endif +/* Defines */ +#define GA_IS_COMPOSITE 0 +#define GA_IS_PRIME 1 +#define GA_IS_PROBABLY_PRIME 2 /** @@ -36,6 +40,42 @@ static int gaICtz(uint64_t n); static int gaIClz(uint64_t n); +/** + * @brief Integer Modular Addition. + * + * Computes + * + * $$a+b \pmod m$$ + * + * efficiently for 64-bit unsigned integers a, b, m. + */ + +static uint64_t gaIAddMod (uint64_t a, uint64_t b, uint64_t m); + +/** + * @brief Integer Modular Subtraction. + * + * Computes + * + * $$a-b \pmod m$$ + * + * efficiently for 64-bit unsigned integers a, b, m. + */ + +static uint64_t gaISubMod (uint64_t a, uint64_t b, uint64_t m); + +/** + * @brief Integer Modular Average. + * + * Computes + * + * $$\frac{a+b}{2} \pmod m$$ + * + * efficiently for 64-bit unsigned integers a, b, m. + */ + +static uint64_t gaIAvgMod (uint64_t a, uint64_t b, uint64_t m); + /** * @brief Integer Modular Multiplication. * @@ -60,6 +100,40 @@ static uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m); static uint64_t gaIPowMod (uint64_t x, uint64_t a, uint64_t m); +/** + * @brief Jacobi Symbol + * + * Computes the Jacobi symbol, notated + * + * $$(a/n)$$ + * + * efficiently for 64-bit unsigned integers a, n. + */ + +static int gaIJacobiSymbol(uint64_t a, uint64_t n); + +/** + * @brief Strong Fermat base-a probable prime test. + * + * @param [in] n An odd integer >= 3. + * @param [in] a A witness integer > 0. + * @return Non-zero if n is a strong probable prime to base a and zero if n is + * composite. + */ + +static int gaIIsPrimeStrongFermat(uint64_t n, uint64_t a); + +/** + * @brief Strong Lucas probable prime test. 
+ * + * The function uses Selfridge's Method A for selecting D,P,Q. + * + * @param [in] n An odd integer >= 3. + * @return Non-zero if n is a strong probable prime and zero if n is composite. + */ + +static int gaIIsPrimeStrongLucas(uint64_t n); + /** * @brief Round up positive n to next 2-, 3- or 5-smooth number and report its * factorization. @@ -134,6 +208,38 @@ static int gaIClz (uint64_t n){ #endif } +static uint64_t gaIAddMod (uint64_t a, uint64_t b, uint64_t m){ + a %= m; + b %= m; + + if(m-a > b){ + return a+b; + }else{ + return a+b-m; + } +} + +static uint64_t gaISubMod (uint64_t a, uint64_t b, uint64_t m){ + a %= m; + b %= m; + + if(a >= b){ + return a-b; + }else{ + return a-b+m; + } +} + +static uint64_t gaIAvgMod (uint64_t a, uint64_t b, uint64_t m){ + uint64_t s = gaIAddMod(a,b,m); + + if(s&1){ + return (s>>1)+(m>>1)+(s&m&1); + }else{ + return s>>1; + } +} + static uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m){ #if (__GNUC__ >= 4) && defined(__x86_64__) uint64_t r; @@ -141,8 +247,8 @@ static uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m){ asm( "mul %2\n\t" "div %3\n\t" - : "=&d"(r) /* Outputs */ - : "a"(a), "r"(b), "r"(m) /* Inputs */ + : "=&d"(r), "+a"(a) /* Outputs */ + : "r"(b), "r"(m) /* Inputs */ : "cc" ); @@ -267,25 +373,254 @@ static uint64_t gaIPowMod (uint64_t x, uint64_t a, uint64_t m){ return r; } +static int gaIJacobiSymbol(uint64_t a, uint64_t n){ + int s=0; + uint64_t e, a1, n1; + + a %= n; + + if(a == 1 || n == 1){ + return 1; + } + + if(a == 0){ + return 0; + } + + e = gaICtz(a); + a1 = a >> e; + + if(e%2 == 0){ + s = 1; + }else if(n%8 == 1 || n%8 == 7){ + s = 1; + }else if(n%8 == 3 || n%8 == 5){ + s = -1; + } + + if(n%4 == 3 && a1%4 == 3){ + s = -s; + } + + n1 = n%a1; + return s*gaIJacobiSymbol(n1,a1); +} + +static int gaIIsPrimeStrongFermat(uint64_t n, uint64_t a){ + /** + * The Fermat strong probable prime test the Miller-Rabin test relies upon + * uses integer "witnesses" in an attempt at proving the number 
composite. + * Should it fail to prove an integer composite, it reports the number as + * "probably prime". However, if the witnesses are chosen carefully, the + * Miller-Rabin test can be made deterministic below a chosen threshold. + * + * One can use the primes 2 to 37 in order to ensure the correctness of the + * identifications for integers under 2^64. + * + * Jim Sinclair has found that the seven witnesses + * 2, 325, 9375, 28178, 450775, 9780504, 1795265022 + * also deterministically classify all integers <2^64. + * + * + * The Fermat strong probable prime test states that, for integers + * n = d*2^s+1, d odd, s integer >= 0 + * a integer (chosen witness) + * n is a Fermat strong probable prime if + * a^(d ) = 1 mod n or + * a^(d*2^r) = -1 mod n for any integer r, 0 <= r < s. + * + * + * The justification for this comes from Fermat's Little Theorem: If n is + * prime and a is any integer, then the following always holds: + * a^n = a mod n + * If n is prime and a is coprime to n, then the following always holds: + * a^(n-1) = 1 mod n + * + * + * In effect, the logic goes + * + * A: The number n is prime. (Statement) + * B: The number n does not divide a. (Statement) + * C: a^( n-1) = 1 mod n (Statement) + * D: The commutative ring Z/nZ is a finite field. (Statement) + * E: Finite fields are unique factorization domains. (Statement) + * F: x^2 = 1 mod n factorizes as (x+1)(x-1) = 0 mod n. (Statement) + * G: x^2 mod n only has the trivial square roots 1 and -1 (Statement) + * H: The number n is odd and >= 3. (Statement) + * I: The number n-1 equals d*2^s, with d,s int > 0, d odd. (Statement) + * J: a^( d) = 1 mod n (Statement) + * K: a^(d*2^r) = -1 mod n for some 0 <= r < s. (Statement) + * L: a^(d*2^(r+1)) = 1 mod n for some 0 <= r < s. (Statement) + * M: a^(d*2^r) != +-1 mod n AND (Statement) + * a^(d*2^(r+1)) = 1 mod n for some 0 <= r < s. 
+ * + * A&B --> C (Proposition: Fermat's Little Theorem) + * !C --> !(A&B) = !A|!B (Contrapositive: Fermat's Little Theorem) + * A <-> D (Proposition) + * E (Proposition: By definition) + * F (Proposition: x^2-x+x-1 = x^2-1 mod n) + * D&E&F --> G (Proposition: (x+1)(x-1) is the only + * factorization) + * !G --> !D|!E|!F (Contrapositive: See above) + * H&I&J --> C (Proposition: Squaring 1 gives 1) + * H&I&K --> L (Proposition: Squaring -1 gives 1) + * H&I&L --> C (Proposition: 1, squared or not, gives 1) + * H&I&K --> C (Hypothetical Syllogism) + * H&I&(J|K) --> C (Union) + * H&I&!(J|K) --> M|!C (Proposition: Either squaring + * a^(d*2^(s-1)) != +-1 mod n + * gives a 1, in which case + * M holds, or it does not + * give 1 and therefore + * a^(n-1) != 1 mod n) + * and thus !C holds. + * H&I&!(J|K) --> H&I&M | !A | !B (Absorption, Hypothetical Syllogism) + * H&I&M --> !G (Proposition: x^2 = 1 mod n but x!=+-1, + * so x^2 - 1 has roots + * other than +-1) + * H&I&M --> !D|!E|!F (Modus Tollens) + * H&I&M --> !D (Disjunctive Syllogism) + * H&I&M --> !A (Biconditional) + * H&I&!(J|K) --> !A | !A | !B (Hypothetical Syllogism) + * H&I&!(J|K)&B --> !A | !A (Absorption) + * H&I&!(J|K)&B --> !A | !A (Disjunctive Syllogism) + * H&I&!(J|K)&B --> !A (Disjunctive Simplification) + * ***** Conclusions: ***** + * H&I&M --> !A + * H&I&!(J|K)&B --> !A + * + * Broadly speaking, what the above tells us is: + * - We can't prove n prime (A), but we can prove it composite (!A). + * - Either H&I&M or H&I&!(J|K)&B proves compositeness. + * - If H&I&(J|K) for any r, then we've proven C true. If we prove C true, + * we can't use the contrapositive of Fermat's Little Theorem, so no + * conclusions about the truth-value of A can be made. The test is + * inconclusive. Thus this function returns "probably prime".
+ */ + + uint64_t d, x; + int64_t s, r; + + a %= n; + if(a==0){ + return GA_IS_PROBABLY_PRIME; + } + + s = gaICtz(n-1); + d = (n-1) >> s; + x = gaIPowMod(a,d,n); + + if(x==1 || x==n-1){ + return GA_IS_PROBABLY_PRIME; + } + + for(r=0;r=0;i--){ + Ut = gaIMulMod(U,V,n); + Vt = gaIAvgMod(gaIMulMod(V,V,n), gaIMulMod(D,gaIMulMod(U,U,n),n), n); + if((K>>i)&1){ + U = gaIAvgMod(Ut,Vt,n); + V = gaIAvgMod(Vt,gaIMulMod(D,Ut,n),n); + }else{ + U = Ut; + V = Vt; + } + } + + /** + * 7. If U0==0, then return "probably prime". Otherwise, return "composite". + */ + + return U==0 ? GA_IS_PROBABLY_PRIME : GA_IS_COMPOSITE; +} + int gaIIsPrime (uint64_t n){ - size_t i, j; int hasNoSmallFactors, hasSmallFactors; - uint64_t r, d; - const uint64_t WITNESSES[] = {2,3,5,7,11,13,17,19,23,29,31,37}; - const int NUMWITNESSES = sizeof(WITNESSES)/sizeof(WITNESSES[0]); - /** * Check if it is 2, the oddest prime. */ - if(n==2){return 1;} + if(n==2){return GA_IS_PRIME;} /** * Check if it is an even integer. */ - if((n&1) == 0){return 0;} + if((n&1) == 0){return GA_IS_COMPOSITE;} /** * For small integers, read directly the answer in a table. @@ -306,71 +641,35 @@ int gaIIsPrime (uint64_t n){ * Test small prime factors. */ - hasNoSmallFactors = n%3 && n%5 && n%7 && n%11 && n%13; + hasNoSmallFactors = n% 3 && n% 5 && n% 7 && n%11 && n%13 && n%17 && n%19 && + n%23 && n%29 && n%31 && n%37 && n%41 && n%43 && n%47 && + n%53 && n%59 && n%61 && n%67 && n%71 && n%73 && n%79; hasSmallFactors = !hasNoSmallFactors; if(hasSmallFactors){ - return 0; + return GA_IS_COMPOSITE; } /** - * Otherwise proceed to the Miller-Rabin test. + * We implement the Baillie-Pomerance-Selfridge-Wagstaff primality checker. + * 1) A Fermat base-2 strong probable prime that is also + * 2) A Lucas strong probable prime is + * 3) Prime. + * The BPSW test has no known failure cases and is proven to have no failures + * for all numbers under 2^64. 
It is expected to have failures (composites + * classified as "probably prime") but they are expected to be enormous. * - * The Miller-Rabin test uses integer "witnesses" in an attempt at - * proving the number composite. Should it fail to prove an integer - * composite, it reports the number as "probably prime". However, if - * the witnesses are chosen carefully, the Miller-Rabin test can be made - * deterministic below a chosen threshold. In our case, we use the primes - * 2 to 37 in order to ensure the correctness of the identifications for - * integers under 2^64. + * We begin with the Fermat base-2 strong primality test + * (Miller-Rabin test with one witness only, a=2). */ - r = gaICtz(n-1); - d = (n-1)>>r; - - /* For each witness... */ - for(i=0;i= n, then this must be the last * iteration, but perhaps a pure power of 3 is the best choice, so * check for this. */ - + if(nCurr >= n){ if(isBest2to64 || nBest >= nCurr){ isBest2to64 = 0; @@ -592,14 +891,14 @@ static int gaIFactorize3Smooth(uint64_t n, ga_factor_list* fl){ } break; } - + /** * Otherwise we have a pure power of 3, p3, less than n, and must * derive the least power of 2 such that p3 multiplied by that power of * 2 is greater than or equal to n. We then compute the product of * both. */ - + nCurr <<= gaIClz(nCurr) - nlz; if(nCurr= n. But is it the best factorization * so far? */ - + if(isBest2to64 || nBest >= nCurr){ isBest2to64 = 0; nBest = nCurr; i3Best = i3; - + if(nCurr == n){ break; } } } - - + + /** * Return the smallest n found above. - * + * * nBest and i3Best must be set. 
*/ - + gaIFLInit(fl); if(isBest2to64){ gaIFLAddFactors(fl, 2, 64); @@ -656,22 +955,22 @@ static int gaIFactorize3Smooth(uint64_t n, ga_factor_list* fl){ static int gaIFactorize5Smooth(uint64_t n, ga_factor_list* fl){ uint64_t nBest=-1, i3Best=0, i3, p3, i5Best=0, i5, p5, nCurr; int nlz = gaIClz(n), isBest2to64 = 1; - + /** * Iterate over all products of powers of 5 and 3, scaling them by the * least power-of-2 such that the result is greater than or equal to n. * Report the smallest nBest so obtained. */ - + for(i5=0, p5=1;i5<=27;i5++, p5*=5){ nCurr = p5; - + /** * If the current power of 5 is >= n, then this must be the last * iteration, but perhaps a pure power of 5 is the best choice, so * check for this. */ - + if(nCurr >= n){ if(isBest2to64 || nBest >= nCurr){ isBest2to64 = 0; @@ -681,16 +980,16 @@ static int gaIFactorize5Smooth(uint64_t n, ga_factor_list* fl){ } break; } - + for(i3=0, p3=1;i3<=40;i3++, p3*=3){ nCurr = p3*p5; - + /** * If the current product of powers of 3 and 5 is >= n, then this * must be the last iteration, but perhaps a pure power of 3 is the * best choice, so check for this. */ - + if(nCurr >= n){ if(isBest2to64 || nBest >= nCurr){ isBest2to64 = 0; @@ -700,14 +999,14 @@ static int gaIFactorize5Smooth(uint64_t n, ga_factor_list* fl){ } break; } - + /** * Otherwise we have a number nCurr, composed purely of factors 3 * and 5, that is less than n. We must derive the least power of 2 * such that nCurr multiplied by that power of 2 is greater than or * equal to n. We then compute the product of both. */ - + nCurr <<= gaIClz(nCurr) - nlz; if(nCurr= n. But is it the best factorization * so far? */ - + if(isBest2to64 || nBest >= nCurr){ isBest2to64 = 0; nBest = nCurr; i3Best = i3; i5Best = i5; - + if(nCurr == n){ goto exit; } } } } - - + + /** * Return the smallest n found above. - * + * * nBest and i3Best must be set. 
*/ - - exit: + + exit: gaIFLInit(fl); if(isBest2to64){ gaIFLAddFactors(fl, 2, 64); @@ -875,14 +1174,14 @@ uint64_t gaIFLGetProduct(const ga_factor_list* fl){ int gaIFLIsOverflowed(const ga_factor_list* fl){ uint64_t p = 1, MAX=-1; int i, j; - + if(gaIFLGetFactorPower(fl, 0) >= 1){ return 0; } if(gaIFLGetFactorPower(fl, 2) >= 64){ return 1; } - + for(i=0;id;i++){ for(j=0;jp[i];j++){ if(MAX/p < fl->f[i]){ @@ -891,7 +1190,7 @@ int gaIFLIsOverflowed(const ga_factor_list* fl){ p *= fl->f[i]; } } - + return 0; } @@ -954,22 +1253,60 @@ static uint64_t gaIFLGetSmallestFactorv(int n, const ga_factor_list* fl, int* id return hasFactors ? f : 1; } +int gaIFLsprintf(char* str, const ga_factor_list* fl){ + int i, j; + int total = 0; + char* ptr = str; + + /* Loop over all factors and spit them out. */ + for(i=0;id;i++){ + for(j=0;jp[i];j++){ + total += sprintf(ptr, "%llu*", (unsigned long long)fl->f[i]); + if(ptr){ + ptr += strlen(ptr); + } + } + } + + /* If no factors were printed, print 1. */ + if(total == 0){ + total += sprintf(ptr, "1*"); + if(ptr){ + ptr += strlen(ptr); + } + } + + /* Terminate buffer ('*' -> '\0') and deduct one character. */ + total--; + if(str){ + str[total] = '\0'; + } + + return total; +} + void gaIFLappend(strb *sb, const ga_factor_list* fl){ - int i, j; - /* Loop over all factors and spit them out. */ - for (i = 0; i < fl->d; i++) { - for (j = 0; j < fl->p[i]; j++) { - strb_appendf(sb, "%llu*", (unsigned long long)fl->f[i]); - } - } - - /* If no factors were printed, print 1. */ - if (i == 0 && j == 0) { - strb_appendf(sb, "1*"); - } - - /* Deduct final '*'. */ - sb->l -= 1; + int i, j; + int noFactorsPrinted = 1; + + /* Loop over all factors and spit them out. */ + for(i=0;id;i++){ + for(j=0;jp[i];j++){ + noFactorsPrinted = 0; + strb_appendf(sb, "%llu*", (unsigned long long)fl->f[i]); + } + } + + /** + * If no factors were printed, print 1. + * Otherwise, delete final '*'. 
+ */ + + if(noFactorsPrinted){ + strb_appendf(sb, "1"); + }else{ + sb->s[--sb->l] = '\0'; + } } void gaIFLSchedule(const int n, diff --git a/src/util/integerfactoring.h b/src/util/integerfactoring.h index 8d5f3dce38..48329336ab 100644 --- a/src/util/integerfactoring.h +++ b/src/util/integerfactoring.h @@ -201,6 +201,20 @@ uint64_t gaIFLGetGreatestFactor(const ga_factor_list* fl); uint64_t gaIFLGetSmallestFactor(const ga_factor_list* fl); +/** + * @brief Print out the factor list in a human-readable form, sprintf()-style. + * + * @param [out] str A string into which to print out the factor list. If the + * factor list is a result of gaIFactorize(), then the + * maximum length of buffer required is 128 bytes. + * If str is NULL, nothing is printed. + * @param [in] fl The factor list to be printed. + * @return The number of characters that would have been printed + * out, assuming an unbounded, non-NULL buffer. + */ + +int gaIFLsprintf(char* str, const ga_factor_list* fl); + /** * @brief Print out the factor list in a human-readable form. * @@ -210,7 +224,7 @@ uint64_t gaIFLGetSmallestFactor(const ga_factor_list* fl); * @param [in] fl The factor list to be printed. 
*/ -void gaIFLsnprintf(strb *sb, const ga_factor_list* fl); +void gaIFLappend(strb *sb, const ga_factor_list* fl); /** * @brief Schedule block size, grid size and what's left over that fits in diff --git a/tests/check_util_integerfactoring.c b/tests/check_util_integerfactoring.c index 7aa5afa966..ed86aff72a 100644 --- a/tests/check_util_integerfactoring.c +++ b/tests/check_util_integerfactoring.c @@ -57,6 +57,7 @@ START_TEST(test_primalitychecker){ ck_assert(!gaIIsPrime( 1905ULL)); ck_assert(!gaIIsPrime( 2047ULL)); ck_assert(!gaIIsPrime( 2465ULL)); + ck_assert(!gaIIsPrime( 486737ULL)); /* Strong Lucas pseudoprimes */ ck_assert(!gaIIsPrime( 5459ULL)); ck_assert(!gaIIsPrime( 5459ULL)); From db69d5f93a716f35fdef54e1e20b92f5bf4d6e35 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Thu, 13 Oct 2016 06:41:02 -0400 Subject: [PATCH 045/597] Added version of scheduler with integer arguments. --- src/util/integerfactoring.c | 87 ++++++++++++++++++++++++++++++++++++- src/util/integerfactoring.h | 27 ++++++++---- 2 files changed, 105 insertions(+), 9 deletions(-) diff --git a/src/util/integerfactoring.c b/src/util/integerfactoring.c index b453da463f..0894dace9d 100644 --- a/src/util/integerfactoring.c +++ b/src/util/integerfactoring.c @@ -1309,6 +1309,91 @@ void gaIFLappend(strb *sb, const ga_factor_list* fl){ } } +void gaISchedule(const int n, + const uint64_t maxBtot, + const uint64_t* maxBind, + const uint64_t maxGtot, + const uint64_t* maxGind, + uint64_t* bs, + uint64_t* gs, + uint64_t* cs){ + int i; + uint64_t kBS, kGS, k; + + /** + * Allocate a VLA or similar. + * + * C89 neither allows VLAs nor a check beforehand that n>0 to avoid UB + * (but malloc of 0 bytes is well-defined), so force VLA size to be >= 1 + * with a !n + n trick. 
+ */ + +#if GA_USING_MALLOC_FOR_VLA + ga_factor_list* factBS = malloc(n * sizeof(*factBS)); + ga_factor_list* factGS = malloc(n * sizeof(*factGS)); + ga_factor_list* factCS = malloc(n * sizeof(*factCS)); +#else + ga_factor_list factBS[!n + n]; + ga_factor_list factGS[!n + n]; + ga_factor_list factCS[!n + n]; +#endif + + + if(n<=0){return;} + + + /** + * Factorize the provided integers under their k-smoothness constraint. + * Use the strictest of either the block or grid constraints on each + * dimension. + */ + + for(i=0;i Date: Thu, 13 Oct 2016 06:44:48 -0400 Subject: [PATCH 046/597] Clean whitespace in check_util_integerfactoring.c --- tests/check_util_integerfactoring.c | 124 ++++++++++++++-------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/tests/check_util_integerfactoring.c b/tests/check_util_integerfactoring.c index ed86aff72a..08d1b17869 100644 --- a/tests/check_util_integerfactoring.c +++ b/tests/check_util_integerfactoring.c @@ -97,12 +97,12 @@ START_TEST(test_primalitychecker){ START_TEST(test_integerfactorization){ ga_factor_list fl; uint64_t n; - + /** * Attempt exact factorization for 2^64-1, no k-smoothness constraint. * Expected PASS with 3*5*17*257*641*65537*6700417 */ - + n = 18446744073709551615ULL; ck_assert_int_ne (gaIFactorize(n, 0, 0, &fl), 0); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 3ULL), 1); @@ -113,34 +113,34 @@ START_TEST(test_integerfactorization){ ck_assert_int_eq (gaIFLGetFactorPower(&fl, 65537ULL), 1); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 6700417ULL), 1); ck_assert_uint_eq(gaIFLGetProduct(&fl), n); - + /** * Attempt exact factorization for 2^64-1, 4096-smooth constraint. * Expected FAIL, because 2^64-1 possesses prime factors in excess of 4096. */ - + n = 18446744073709551615ULL; ck_assert_int_eq (gaIFactorize(n, 0, 4096, &fl), 0); - + /** * Attempt approximate factorization for 2^64-1, no k-smoothness constraint. * Unlimited growth permitted. 
* Expected PASS, since 2^64-1 rounds up to 2^64 and 2^64 trivially factorizes. */ - + n = 18446744073709551615ULL; ck_assert_int_ne (gaIFactorize(n, -1, 0, &fl), 0); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 2ULL), 64); ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 2); ck_assert_int_ne (gaIFLIsOverflowed(&fl), 0); - + /** * Attempt exact factorization for 2196095973992233039, no k-smoothness constraint. * 2196095973992233039 is a large, highly non-smooth number, with three enormous * factors. * Expected PASS *very quickly*, since it factorizes as 1299817*1299821*1299827 */ - + n = 2196095973992233039ULL; ck_assert_int_ne (gaIFactorize(n, 0, 0, &fl), 0); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 1299817ULL), 1); @@ -148,27 +148,27 @@ START_TEST(test_integerfactorization){ ck_assert_int_eq (gaIFLGetFactorPower(&fl, 1299827ULL), 1); ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 1299827); ck_assert_uint_eq(gaIFLGetProduct(&fl), n); - + /** * Attempt approximate factorization for 2196095973992233039, 16-smooth constraint. * 2196095973992233039 is a large, highly non-smooth number, with three enormous * factors. It is not 64-smooth, so code paths that attempt approximate * factorization within the growth limits (.005%) are exercised. - * + * * Expected PASS *relatively quickly*. */ - + n = 2196095973992233039ULL; ck_assert_int_ne (gaIFactorize(n, n*1.00005, 16, &fl), 0); ck_assert_uint_ge(gaIFLGetProduct(&fl), n); ck_assert_uint_le(gaIFLGetProduct(&fl), n*1.00005); - + /** * Attempt exact factorization of 7438473388800000000, 5-smooth constraint. * It is a large, 5-smooth number. This should exercise the 5-smooth * factorization path. 
*/ - + n = 7438473388800000000ULL; ck_assert_int_ne (gaIFactorize(n, 0, 5, &fl), 0); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 2ULL), 14); @@ -176,13 +176,13 @@ START_TEST(test_integerfactorization){ ck_assert_int_eq (gaIFLGetFactorPower(&fl, 5ULL), 8); ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 5); ck_assert_uint_eq(gaIFLGetProduct(&fl), n); - + /** * Attempt approximate factorization of 7438473388799999997, 2-smooth constraint. * It is a large, non-smooth number. This should exercise the optimal 2-smooth * factorizer in spite of the available, unlimited slack. */ - + n = 7438473388799999997ULL; ck_assert_int_ne (gaIFactorize(n, -1, 2, &fl), 0); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 2ULL), 63); @@ -190,13 +190,13 @@ START_TEST(test_integerfactorization){ ck_assert_int_eq (gaIFLGetFactorPower(&fl, 5ULL), 0); ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 2); ck_assert_uint_eq(gaIFLGetProduct(&fl), 9223372036854775808ULL); - + /** * Attempt approximate factorization of 7438473388799999997, 3-smooth constraint. * It is a large, non-smooth number. This should exercise the optimal 3-smooth * factorizer in spite of the available, unlimited slack. */ - + n = 7438473388799999997ULL; ck_assert_int_ne (gaIFactorize(n, -1, 3, &fl), 0); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 2ULL), 31); @@ -204,14 +204,14 @@ START_TEST(test_integerfactorization){ ck_assert_int_eq (gaIFLGetFactorPower(&fl, 5ULL), 0); ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 3); ck_assert_uint_eq(gaIFLGetProduct(&fl), 7487812485248974848ULL); - + /** * Attempt approximate factorization of 7438473388799999997, 5-smooth constraint. * It is a large, non-smooth number, but 3 integers above it is a 5-smooth * integer, 7438473388800000000. This should exercise the optimal 5-smooth * factorizer in spite of the available, unlimited slack. 
*/ - + n = 7438473388799999997ULL; ck_assert_int_ne (gaIFactorize(n, -1, 5, &fl), 0); ck_assert_int_eq (gaIFLGetFactorPower(&fl, 2ULL), 14); @@ -219,18 +219,18 @@ START_TEST(test_integerfactorization){ ck_assert_int_eq (gaIFLGetFactorPower(&fl, 5ULL), 8); ck_assert_uint_eq(gaIFLGetGreatestFactor(&fl), 5); ck_assert_uint_eq(gaIFLGetProduct(&fl), 7438473388800000000ULL); - + /** * Toughest challenge: Attempt very tight approximate factorization of * 9876543210987654321 with .01% slack and 43-smooth constraint. - * + * * This forces a bypass of the optimal 5-smooth factorizers and heavily * exercises the nextI:, subfactorize:, primetest: and newX jumps and * calculations. - * + * * Expected PASS, "reasonably fast". */ - + n = 9876543210987654321ULL; ck_assert_int_ne (gaIFactorize(n, n*1.0001, 43, &fl), 0); ck_assert_uint_ge(gaIFLGetProduct(&fl), n); @@ -243,36 +243,36 @@ START_TEST(test_scheduler){ uint64_t maxBTot = 1024, maxBInd[] = { 1024, 1024, 64}, maxGTot = 0xFFFFFFFF, maxGInd[] = {2147483647, 65535, 65535}, warpSize = 32; - + int warpAxis; uint64_t dims[3]; ga_factor_list factBS[3], factGS[3], factCS[3]; unsigned long long intbBS[3], intbGS[3], intbCS[3]; unsigned long long intaBS[3], intaGS[3], intaCS[3]; - + /** * NOTE: If you want to view befores-and-afters of scheduling, #define PRINT * to something non-0. */ #define PRINT 0 - + /** - * + * * Testcase: (895,1147,923) job, warpSize on axis 0. - * + * */ - + { warpAxis = 0; dims[0] = 895; dims[1] = 1141; dims[2] = 923; dims[warpAxis] = (dims[warpAxis]+warpSize-1) / warpSize; - + /** * Factorization job must be successful. 
*/ - + ck_assert(gaIFactorize(warpAxis==0?warpSize:1, 0, maxBInd[0], factBS+0)); ck_assert(gaIFactorize(warpAxis==1?warpSize:1, 0, maxBInd[1], factBS+1)); ck_assert(gaIFactorize(warpAxis==2?warpSize:1, 0, maxBInd[2], factBS+2)); @@ -282,7 +282,7 @@ START_TEST(test_scheduler){ ck_assert(gaIFactorize( dims[0], dims[0]*1.1, maxBInd[0], factCS+0)); ck_assert(gaIFactorize( dims[1], dims[1]*1.1, maxBInd[1], factCS+1)); ck_assert(gaIFactorize( dims[2], dims[2]*1.1, maxBInd[2], factCS+2)); - + intbBS[0] = gaIFLGetProduct(factBS+0); intbBS[1] = gaIFLGetProduct(factBS+1); intbBS[2] = gaIFLGetProduct(factBS+2); @@ -292,20 +292,20 @@ START_TEST(test_scheduler){ intbCS[0] = gaIFLGetProduct(factCS+0); intbCS[1] = gaIFLGetProduct(factCS+1); intbCS[2] = gaIFLGetProduct(factCS+2); - + /** * Ensure that factorization only *increases* the size of the problem. */ - + ck_assert_uint_ge(intbCS[0], dims[0]); ck_assert_uint_ge(intbCS[1], dims[1]); ck_assert_uint_ge(intbCS[2], dims[2]); - - + + /** * Run scheduler. */ - + #if PRINT printf("Before:\n"); printf("BS: (%6llu, %6llu, %6llu)\n", intbBS[0], intbBS[1], intbBS[2]); @@ -328,21 +328,21 @@ START_TEST(test_scheduler){ printf("GS: (%6llu, %6llu, %6llu)\n", intaGS[0], intaGS[1], intaGS[2]); printf("CS: (%6llu, %6llu, %6llu)\n", intaCS[0], intaCS[1], intaCS[2]); #endif - + /** * Scheduling is only about moving factors between block/grid/chunk factor * lists. Therefore, the three dimensions must not have changed size. */ - + ck_assert_uint_eq(intbBS[0]*intbGS[0]*intbCS[0], intaBS[0]*intaGS[0]*intaCS[0]); ck_assert_uint_eq(intbBS[1]*intbGS[1]*intbCS[1], intaBS[1]*intaGS[1]*intaCS[1]); ck_assert_uint_eq(intbBS[2]*intbGS[2]*intbCS[2], intaBS[2]*intaGS[2]*intaCS[2]); - + /** * Verify that the individual limits and global limits on threads in a * block and blocks in a grid are met. 
*/ - + ck_assert_uint_le(intaBS[0], maxBInd[0]); ck_assert_uint_le(intaBS[1], maxBInd[1]); ck_assert_uint_le(intaBS[2], maxBInd[2]); @@ -352,25 +352,25 @@ START_TEST(test_scheduler){ ck_assert_uint_le(intaBS[0]*intaBS[1]*intaBS[2], maxBTot); ck_assert_uint_le(intaGS[0]*intaGS[1]*intaGS[2], maxGTot); } - - + + /** - * + * * Testcase: (1,1,121632959) job, warpSize on axis 2. - * + * */ - + { warpAxis = 2; dims[0] = 1; dims[1] = 1; dims[2] = 121632959; dims[warpAxis] = (dims[warpAxis]+warpSize-1) / warpSize; - + /** * Factorization job must be successful. */ - + ck_assert(gaIFactorize(warpAxis==0?warpSize:1, 0, maxBInd[0], factBS+0)); ck_assert(gaIFactorize(warpAxis==1?warpSize:1, 0, maxBInd[1], factBS+1)); ck_assert(gaIFactorize(warpAxis==2?warpSize:1, 0, maxBInd[2], factBS+2)); @@ -380,7 +380,7 @@ START_TEST(test_scheduler){ ck_assert(gaIFactorize( dims[0], dims[0]*1.1, maxBInd[0], factCS+0)); ck_assert(gaIFactorize( dims[1], dims[1]*1.1, maxBInd[1], factCS+1)); ck_assert(gaIFactorize( dims[2], dims[2]*1.1, maxBInd[2], factCS+2)); - + intbBS[0] = gaIFLGetProduct(factBS+0); intbBS[1] = gaIFLGetProduct(factBS+1); intbBS[2] = gaIFLGetProduct(factBS+2); @@ -390,20 +390,20 @@ START_TEST(test_scheduler){ intbCS[0] = gaIFLGetProduct(factCS+0); intbCS[1] = gaIFLGetProduct(factCS+1); intbCS[2] = gaIFLGetProduct(factCS+2); - + /** * Ensure that factorization only *increases* the size of the problem. */ - + ck_assert_uint_ge(intbCS[0], dims[0]); ck_assert_uint_ge(intbCS[1], dims[1]); ck_assert_uint_ge(intbCS[2], dims[2]); - - + + /** * Run scheduler. */ - + #if PRINT printf("Before:\n"); printf("BS: (%6llu, %6llu, %6llu)\n", intbBS[0], intbBS[1], intbBS[2]); @@ -426,21 +426,21 @@ START_TEST(test_scheduler){ printf("GS: (%6llu, %6llu, %6llu)\n", intaGS[0], intaGS[1], intaGS[2]); printf("CS: (%6llu, %6llu, %6llu)\n", intaCS[0], intaCS[1], intaCS[2]); #endif - + /** * Scheduling is only about moving factors between block/grid/chunk factor * lists. 
Therefore, the three dimensions must not have changed size. */ - + ck_assert_uint_eq(intbBS[0]*intbGS[0]*intbCS[0], intaBS[0]*intaGS[0]*intaCS[0]); ck_assert_uint_eq(intbBS[1]*intbGS[1]*intbCS[1], intaBS[1]*intaGS[1]*intaCS[1]); ck_assert_uint_eq(intbBS[2]*intbGS[2]*intbCS[2], intaBS[2]*intaGS[2]*intaCS[2]); - + /** * Verify that the individual limits and global limits on threads in a * block and blocks in a grid are met. */ - + ck_assert_uint_le(intaBS[0], maxBInd[0]); ck_assert_uint_le(intaBS[1], maxBInd[1]); ck_assert_uint_le(intaBS[2], maxBInd[2]); @@ -457,15 +457,15 @@ START_TEST(test_scheduler){ Suite *get_suite(void){ Suite *s = suite_create("util_integerfactoring"); TCase *tc = tcase_create("All"); - + tcase_set_timeout(tc, 10.0); - + tcase_add_test(tc, test_primalitychecker); tcase_add_test(tc, test_integerfactorization); tcase_add_test(tc, test_scheduler); - + suite_add_tcase(s, tc); - + return s; } From 2ff4c527e398d8ec91d551f08af831188d280139 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Thu, 13 Oct 2016 06:45:40 -0400 Subject: [PATCH 047/597] Clean whitespace in check_reduction.c --- tests/check_reduction.c | 138 ++++++++++++++++++++-------------------- 1 file changed, 69 insertions(+), 69 deletions(-) diff --git a/tests/check_reduction.c b/tests/check_reduction.c index c83e5b9772..106f6f3fc5 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -69,84 +69,84 @@ static double pcgRand01(void){ START_TEST(test_reduction){ pcgSeed(1); - + /** * We test here a reduction of some random 3D tensor on the first and * third dimensions. 
*/ - + size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const unsigned reduxList[] = {0,2}; - + float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); float* pMax = calloc(1, sizeof(*pMax) * dims[1] ); size_t* pArgmax = calloc(1, sizeof(*pArgmax) * dims[1] ); - + ck_assert_ptr_ne(pSrc, NULL); ck_assert_ptr_ne(pMax, NULL); ck_assert_ptr_ne(pArgmax, NULL); - - + + /** * Initialize source data. */ - + for(i=0;i gtMax){ gtMax = v; gtArgmax = i*dims[2] + k; } } } - + ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); } - + /** * Deallocate. */ - + free(pSrc); free(pMax); free(pArgmax); @@ -157,88 +157,88 @@ START_TEST(test_reduction){ START_TEST(test_idxtranspose){ pcgSeed(1); - + /** * We test here the same reduction as test_reduction, except with a * reversed reduxList {2,0} instead of {0,2}. That should lead to a * transposition of the argmax "coordinates" and thus a change in its * "flattened" output version. */ - + size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; size_t rdxDims[1] = {50}; size_t rdxProdDims = rdxDims[0]; const unsigned reduxList[] = {2,0}; - + float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); float* pMax = calloc(1, sizeof(*pMax) * rdxProdDims); size_t* pArgmax = calloc(1, sizeof(*pArgmax) * rdxProdDims); - + ck_assert_ptr_ne(pSrc, NULL); ck_assert_ptr_ne(pMax, NULL); ck_assert_ptr_ne(pArgmax, NULL); - - + + /** * Initialize source data. */ - + for(i=0;i gtMax){ gtMax = v; gtArgmax = k*dims[0] + i; } } } - + ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); } - + /** * Deallocate. */ - + free(pSrc); free(pMax); free(pArgmax); @@ -249,75 +249,75 @@ START_TEST(test_idxtranspose){ START_TEST(test_veryhighrank){ pcgSeed(1); - + /** * Here we test a reduction of a random 8D tensor on four dimensions. 
*/ - + size_t i,j,k,l,m,n,o,p; size_t dims [8] = {1171,373,2,1,2,1,2,1}; size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; const unsigned reduxList[] = {2,4,7,5}; - + float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); float* pMax = calloc(1, sizeof(*pMax) * rdxProdDims); size_t* pArgmax = calloc(1, sizeof(*pArgmax) * rdxProdDims); - + ck_assert_ptr_ne(pSrc, NULL); ck_assert_ptr_ne(pMax, NULL); ck_assert_ptr_ne(pArgmax, NULL); - - + + /** * Initialize source data. */ - + for(i=0;i gtMax){ gtMax = v; gtArgmax = (((k)*dims[4] + m)*dims[7] + p)*dims[5] + n; @@ -326,7 +326,7 @@ START_TEST(test_veryhighrank){ } } } - + size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; ck_assert_msg(gtMax == pMax[dstIdx], "Max value mismatch!"); ck_assert_msg(gtArgmax == pArgmax[dstIdx], "Argmax value mismatch!"); @@ -334,12 +334,12 @@ START_TEST(test_veryhighrank){ } } } - - + + /** * Deallocate. */ - + free(pSrc); free(pMax); free(pArgmax); @@ -353,11 +353,11 @@ Suite *get_suite(void) { TCase *tc = tcase_create("basic"); tcase_add_checked_fixture(tc, setup, teardown); tcase_set_timeout(tc, 15.0); - + tcase_add_test(tc, test_reduction); tcase_add_test(tc, test_idxtranspose); tcase_add_test(tc, test_veryhighrank); - + suite_add_tcase(s, tc); return s; } From 3d0580a99b6a090c8da1bf792c7cd088b884bb2e Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Thu, 13 Oct 2016 16:13:06 -0400 Subject: [PATCH 048/597] Fix memory leak on non-VLA-supporting systems when n<=0. 
--- src/util/integerfactoring.c | 65 +++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 17 deletions(-) diff --git a/src/util/integerfactoring.c b/src/util/integerfactoring.c index 0894dace9d..4e7c60c2e3 100644 --- a/src/util/integerfactoring.c +++ b/src/util/integerfactoring.c @@ -174,6 +174,19 @@ static void gaIFLScheduleOpt(const int n, const uint64_t maxTot, const uint64_t* maxInd); +/** + * @brief Schedule block/grid/chunk size, integer version, n checked >= 0. + */ + +static void gaIScheduleChecked(const int n, + const uint64_t maxBtot, + const uint64_t* maxBind, + const uint64_t maxGtot, + const uint64_t* maxGind, + uint64_t* bs, + uint64_t* gs, + uint64_t* cs); + /** @@ -1309,37 +1322,35 @@ void gaIFLappend(strb *sb, const ga_factor_list* fl){ } } -void gaISchedule(const int n, - const uint64_t maxBtot, - const uint64_t* maxBind, - const uint64_t maxGtot, - const uint64_t* maxGind, - uint64_t* bs, - uint64_t* gs, - uint64_t* cs){ +static void gaIScheduleChecked(const int n, + const uint64_t maxBtot, + const uint64_t* maxBind, + const uint64_t maxGtot, + const uint64_t* maxGind, + uint64_t* bs, + uint64_t* gs, + uint64_t* cs){ int i; uint64_t kBS, kGS, k; /** * Allocate a VLA or similar. - * - * C89 neither allows VLAs nor a check beforehand that n>0 to avoid UB - * (but malloc of 0 bytes is well-defined), so force VLA size to be >= 1 - * with a !n + n trick. + * + * C89 neither allows VLAs nor a check beforehand that n>0 to avoid UB. The + * check for n>0 was thus done in our caller. 
*/ - + #if GA_USING_MALLOC_FOR_VLA ga_factor_list* factBS = malloc(n * sizeof(*factBS)); ga_factor_list* factGS = malloc(n * sizeof(*factGS)); ga_factor_list* factCS = malloc(n * sizeof(*factCS)); #else - ga_factor_list factBS[!n + n]; - ga_factor_list factGS[!n + n]; - ga_factor_list factCS[!n + n]; + ga_factor_list factBS[n]; + ga_factor_list factGS[n]; + ga_factor_list factCS[n]; #endif - if(n<=0){return;} /** @@ -1394,6 +1405,26 @@ void gaISchedule(const int n, #endif } +void gaISchedule(const int n, + const uint64_t maxBtot, + const uint64_t* maxBind, + const uint64_t maxGtot, + const uint64_t* maxGind, + uint64_t* bs, + uint64_t* gs, + uint64_t* cs){ + if(n<=0){return;} + + gaIScheduleChecked(n, + maxBtot, + maxBind, + maxGtot, + maxGind, + bs, + gs, + cs); +} + void gaIFLSchedule(const int n, const uint64_t maxBtot, const uint64_t* maxBind, From d0a01fc7d2558ddf502c0152ffb63c4aed6b99b8 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Thu, 13 Oct 2016 16:14:59 -0400 Subject: [PATCH 049/597] Silence warning about signed-unsigned integer comparison warning given by gcc -Wall -Wextra. --- src/util/integerfactoring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/util/integerfactoring.c b/src/util/integerfactoring.c index 4e7c60c2e3..b0e2b4628d 100644 --- a/src/util/integerfactoring.c +++ b/src/util/integerfactoring.c @@ -715,8 +715,8 @@ int gaIFactorize (uint64_t n, uint64_t maxN, uint64_t k, ga_factor_list* fl * Magic-value arguments interpreted and canonicalized. */ - exactFactoring = (maxN == 0); - infiniteSlack = (maxN == -1); + exactFactoring = (maxN == (uint64_t) 0); + infiniteSlack = (maxN == (uint64_t)-1); noKSmoothness = (k == 0) || (k >= n); finiteSlack = !infiniteSlack; kSmoothness = !noKSmoothness; From 2b09821ea331ad509249506ecd1adb1898d2a3df Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sun, 16 Oct 2016 03:21:49 -0400 Subject: [PATCH 050/597] Permit compilation under strict ANSI C mode. 
This requires testing the __STRICT_ANSI__ macro. --- src/util/integerfactoring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/integerfactoring.c b/src/util/integerfactoring.c index b0e2b4628d..32c6cc50e0 100644 --- a/src/util/integerfactoring.c +++ b/src/util/integerfactoring.c @@ -254,7 +254,7 @@ static uint64_t gaIAvgMod (uint64_t a, uint64_t b, uint64_t m){ } static uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m){ -#if (__GNUC__ >= 4) && defined(__x86_64__) +#if (__GNUC__ >= 4) && defined(__x86_64__) && !defined(__STRICT_ANSI__) uint64_t r; asm( From f0a4abd8c6d0fa09e1577273ba68506e26ab8c58 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sun, 16 Oct 2016 04:47:41 -0400 Subject: [PATCH 051/597] Detect 2^64 overflow while computing 5-smooth factorization for very- high-value numbers. The numbers for which overflow would occur are those >= 0.2*2^64. --- src/util/integerfactoring.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/util/integerfactoring.c b/src/util/integerfactoring.c index 32c6cc50e0..fdd7c76875 100644 --- a/src/util/integerfactoring.c +++ b/src/util/integerfactoring.c @@ -995,12 +995,22 @@ static int gaIFactorize5Smooth(uint64_t n, ga_factor_list* fl){ } for(i3=0, p3=1;i3<=40;i3++, p3*=3){ + /** + * Detect when the product p3*p5 would overflow 2^64. + */ + + if(i3){ + nCurr = (p3/3)*p5; + if(nCurr+nCurr < nCurr || nCurr+nCurr+nCurr < nCurr+nCurr){ + break; + } + } nCurr = p3*p5; /** * If the current product of powers of 3 and 5 is >= n, then this - * must be the last iteration, but perhaps a pure power of 3 is the - * best choice, so check for this. + * must be the last iteration, but perhaps a pure product of powers + * of 3 and 5 is the best choice, so check for this. 
*/ if(nCurr >= n){ From f13cb65acaac1a970ca1c1a442497d3828bdd38f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 3 Nov 2016 16:13:50 -0400 Subject: [PATCH 052/597] Add some documentation and remove unused declaration. --- pygpu/gpuarray.pyx | 7 ++++++- src/private_cuda.h | 1 - 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 194cab35e0..99332007a0 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -789,7 +789,7 @@ def from_gpudata(size_t data, offset, dtype, shape, GpuContext context=None, :type shape: iterable of ints :param context: context of the gpudata :type context: GpuContext - :param strides: strides for the results + :param strides: strides for the results (C contiguous if not specified) :type strides: iterable of ints :param writable: is the data writable? :type writeable: bool @@ -1433,6 +1433,11 @@ cuda_open_ipc_handle = Date: Thu, 3 Nov 2016 19:32:07 -0400 Subject: [PATCH 053/597] Convert internally from unsigned to signed axis numbers. This avoids an infinite loop when counting ndX downto 0 with unsigned integers, as the condition `unsigned >= 0` always holds. --- src/gpuarray_reduction.c | 42 +++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 0c05e14397..9ca69d491e 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -27,27 +27,27 @@ struct maxandargmax_ctx{ GpuArray* dstMax; GpuArray* dstArgmax; const GpuArray* src; - unsigned reduxLen; - const unsigned* reduxList; + int reduxLen; + const int* reduxList; /* General. */ int ret; - unsigned* axisList; + int* axisList; gpucontext* gpuCtx; /* Source code Generator. 
*/ const char* dstMaxType; const char* dstArgmaxType; - unsigned ndd; - unsigned ndr; - unsigned nds; - unsigned ndh; + int ndd; + int ndr; + int nds; + int ndh; strb s; char* sourceCode; GpuKernel kernel; /* Scheduler */ - unsigned hwAxisList[3]; + int hwAxisList[3]; size_t blockSize [3]; size_t gridSize [3]; size_t chunkSize [3]; @@ -64,8 +64,8 @@ typedef struct maxandargmax_ctx maxandargmax_ctx; /* Function prototypes */ -static int axisInSet (unsigned v, - const unsigned* set, +static int axisInSet (int v, + const int* set, size_t setLen, size_t* where); static void appendIdxes (strb* s, @@ -102,7 +102,8 @@ GPUARRAY_PUBLIC int GpuArray_maxandargmax (GpuArray* dstMax, const GpuArray* src, unsigned reduxLen, const unsigned* reduxList){ - maxandargmax_ctx ctxSTACK = {dstMax, dstArgmax, src, reduxLen, reduxList}, + maxandargmax_ctx ctxSTACK = {dstMax, dstArgmax, src, + (int)reduxLen, (const int*)reduxList}, *ctx = &ctxSTACK; if(maxandargmaxCheckargs (ctx) == GA_NO_ERROR && @@ -127,8 +128,8 @@ GPUARRAY_PUBLIC int GpuArray_maxandargmax (GpuArray* dstMax, * @return Non-zero if the set is non-empty and v is in it; Zero otherwise. */ -static int axisInSet (unsigned v, - const unsigned* set, +static int axisInSet (int v, + const int* set, size_t setLen, size_t* where){ size_t i; @@ -190,7 +191,7 @@ static void appendIdxes (strb* s, */ static int maxandargmaxCheckargs (maxandargmax_ctx* ctx){ - unsigned i; + int i; /** * We initialize certain parts of the context. @@ -216,13 +217,14 @@ static int maxandargmaxCheckargs (maxandargmax_ctx* ctx){ /* Insane src or reduxLen? */ if(!ctx->dstMax || !ctx->dstArgmax || !ctx->src || ctx->src->nd == 0 || - ctx->reduxLen == 0 || ctx->reduxLen >= ctx->src->nd){ + ctx->reduxLen == 0 || ctx->reduxLen > (int)ctx->src->nd){ return ctx->ret=GA_INVALID_ERROR; } /* Insane or duplicate list entry? 
*/ for(i=0;ireduxLen;i++){ - if(ctx->reduxList[i] >= ctx->src->nd || + if(ctx->reduxList[i] < 0 || + ctx->reduxList[i] >= (int)ctx->src->nd || axisInSet(ctx->reduxList[i], ctx->reduxList, i, 0)){ return ctx->ret=GA_INVALID_ERROR; } @@ -260,8 +262,8 @@ static int maxandargmaxCheckargs (maxandargmax_ctx* ctx){ */ static int maxandargmaxSelectHwAxes (maxandargmax_ctx* ctx){ - unsigned i, j, maxI = 0; - size_t maxV; + int i, j, maxI = 0; + size_t maxV; ctx->ndh = ctx->ndd<3 ? ctx->ndd : 3; @@ -355,7 +357,7 @@ static void maxandargmaxAppendOffsets (maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "\t\n"); } static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx){ - unsigned i; + int i; strb_appends(&ctx->s, "\t/* GPU kernel coordinates. Always 3D. */\n"); strb_appends(&ctx->s, "\tX bi0 = GID_0, bi1 = GID_1, bi2 = GID_2;\n"); @@ -605,7 +607,7 @@ static void maxandargmaxAppendLoopMacroUndefs (maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "#undef DSTAINDEXER\n"); } static void maxandargmaxComputeAxisList (maxandargmax_ctx* ctx){ - unsigned i, f=0; + int i, f=0; for(i=0;inds;i++){ if(axisInSet(i, ctx->reduxList, ctx->ndr, 0)){ From cf702b565f4dcdbad7e3255f7b585003f687b2f6 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Fri, 4 Nov 2016 12:14:49 -0400 Subject: [PATCH 054/597] Make all-dims-reduced usecase work. All-dims-reduced will be slow but does work now without errors. Added testcase to ensure this remains the case. 
--- src/gpuarray_reduction.c | 34 +++++++++------- tests/check_reduction.c | 88 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 15 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 9ca69d491e..0e6ba09749 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -364,24 +364,26 @@ static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n"); strb_appends(&ctx->s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n"); strb_appends(&ctx->s, "\tX gi0 = bi0*bd0+ti0, gi1 = bi1*bd1+ti1, gi2 = bi2*bd2+ti2;\n"); - strb_appends(&ctx->s, "\tX "); - for(i=0;indh;i++){ - strb_appendf(&ctx->s, "ci%u = chunkSize[%u]%s", - i, i, (i==ctx->ndh-1) ? ";\n" : ", "); + if(ctx->ndh>0){ + strb_appends(&ctx->s, "\tX "); + for(i=0;indh;i++){ + strb_appendf(&ctx->s, "ci%u = chunkSize[%u]%s", + i, i, (i==ctx->ndh-1) ? ";\n" : ", "); + } } strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t/* Free indices & Reduction indices */\n"); - appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "", ";\n"); - appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Dim", ";\n"); - appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Start", ";\n"); - appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "End", ";\n"); - appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "SStep", ";\n"); - appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "MStep", ";\n"); - appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "AStep", ";\n"); - appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim", ";\n"); + if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "", ";\n");} + if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Dim", ";\n");} + if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Start", ";\n");} + if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "End", ";\n");} + if(ctx->nds > 
0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "SStep", ";\n");} + if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "MStep", ";\n");} + if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "AStep", ";\n");} + if(ctx->nds > ctx->ndd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim", ";\n");} strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); @@ -725,8 +727,10 @@ static int maxandargmaxSchedule (maxandargmax_ctx* ctx){ } } - dims[bestWarpAxis] = (dims[bestWarpAxis] + warpSize - 1)/warpSize; - gaIFactorize(warpSize, 0, 0, &factBS[bestWarpAxis]); + if(ctx->ndh > 0){ + dims[bestWarpAxis] = (dims[bestWarpAxis] + warpSize - 1)/warpSize; + gaIFactorize(warpSize, 0, 0, &factBS[bestWarpAxis]); + } /** * Factorization job. We'll steadily increase the slack in case of failure @@ -806,7 +810,7 @@ static int maxandargmaxInvoke (maxandargmax_ctx* ctx){ ctx->dstMaxStepsGD && ctx->dstArgmaxStepsGD){ ctx->ret = GpuKernel_call(&ctx->kernel, - ctx->ndh, + ctx->ndh>0 ? ctx->ndh : 1, ctx->blockSize, ctx->gridSize, 0, diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 106f6f3fc5..5138e5c02d 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -348,6 +348,93 @@ START_TEST(test_veryhighrank){ GpuArray_clear(&gaArgmax); }END_TEST +START_TEST(test_alldimsreduced){ + pcgSeed(1); + + /** + * We test here a reduction of some random 3D tensor on all dimensions. + */ + + size_t i,j,k; + size_t dims[3] = {32,50,79}; + size_t prodDims = dims[0]*dims[1]*dims[2]; + const unsigned reduxList[] = {0,1,2}; + + float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); + float* pMax = calloc(1, sizeof(*pMax) ); + size_t* pArgmax = calloc(1, sizeof(*pArgmax) ); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pMax, NULL); + ck_assert_ptr_ne(pArgmax, NULL); + + + /** + * Initialize source data. 
+ */ + + for(i=0;i gtMax){ + gtMax = v; + gtArgmax = (i*dims[1] + j)*dims[2] + k; + } + } + } + } + + ck_assert_msg(gtMax == pMax[0], "Max value mismatch!"); + ck_assert_msg(gtArgmax == pArgmax[0], "Argmax value mismatch!"); + + /** + * Deallocate. + */ + + free(pSrc); + free(pMax); + free(pArgmax); + GpuArray_clear(&gaSrc); + GpuArray_clear(&gaMax); + GpuArray_clear(&gaArgmax); +}END_TEST + Suite *get_suite(void) { Suite *s = suite_create("reduction"); TCase *tc = tcase_create("basic"); @@ -357,6 +444,7 @@ Suite *get_suite(void) { tcase_add_test(tc, test_reduction); tcase_add_test(tc, test_idxtranspose); tcase_add_test(tc, test_veryhighrank); + tcase_add_test(tc, test_alldimsreduced); suite_add_tcase(s, tc); return s; From 6e8fbdd9ea07f13a5368e9e1a98947c3f0a0037c Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Fri, 4 Nov 2016 21:18:58 -0400 Subject: [PATCH 055/597] Silence -Wsign-compare in src/gpuarray_array.c --- src/gpuarray_array.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index dd29487fae..0ceb07d97e 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -280,7 +280,7 @@ int GpuArray_index_inplace(GpuArray *a, const ssize_t *starts, return GA_VALUE_ERROR; } if (steps[i] == 0 && - (starts[i] == -1 || starts[i] >= a->dimensions[i])) { + (starts[i] == -1 || starts[i] >= (ssize_t)a->dimensions[i])) { free(newdims); free(newstrs); return GA_VALUE_ERROR; @@ -663,10 +663,10 @@ int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd, for (ok = oi; ok < oj - 1; ok++) { if (ord == GA_F_ORDER) { - if (a->strides[ok+1] != a->dimensions[ok]*a->strides[ok]) + if (a->strides[ok+1] != (ssize_t)a->dimensions[ok]*a->strides[ok]) goto need_copy; } else { - if (a->strides[ok] != a->dimensions[ok+1]*a->strides[ok+1]) + if (a->strides[ok] != (ssize_t)a->dimensions[ok+1]*a->strides[ok+1]) goto need_copy; } } @@ -1125,7 +1125,7 @@ int GpuArray_is_c_contiguous(const GpuArray *a) { int i; for 
(i = a->nd - 1; i >= 0; i--) { - if (a->strides[i] != size) return 0; + if (a->strides[i] != (ssize_t)size) return 0; // We suppose that overflow will not happen since data has to fit in memory size *= a->dimensions[i]; } @@ -1137,7 +1137,7 @@ int GpuArray_is_f_contiguous(const GpuArray *a) { unsigned int i; for (i = 0; i < a->nd; i++) { - if (a->strides[i] != size) return 0; + if (a->strides[i] != (ssize_t)size) return 0; // We suppose that overflow will not happen since data has to fit in memory size *= a->dimensions[i]; } From 8bd4dddb8fe6f0ce9fe4cc0fe6f2ccc5aee7bb94 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Fri, 4 Nov 2016 21:19:43 -0400 Subject: [PATCH 056/597] Silence -Wsign-compare in src/gpuarray_array_blas.c --- src/gpuarray_array_blas.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index b8f0909be5..9fb6216054 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -370,7 +370,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph int err; gpudata **A_datas = NULL, **B_datas = NULL, **C_datas = NULL; size_t *A_offsets = NULL, *B_offsets = NULL, *C_offsets = NULL; - int i; + size_t i; if (A->typecode != GA_FLOAT && A->typecode != GA_DOUBLE) return GA_INVALID_ERROR; From 2acc38b6bcb05ed827cfc6884764db988a896481 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Fri, 4 Nov 2016 21:20:23 -0400 Subject: [PATCH 057/597] Silence -Wsign-compare in src/gpuarray_buffer_cuda.c --- src/gpuarray_buffer_cuda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 0d1b5cb23a..70dbbc2a9b 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -871,7 +871,7 @@ static int detect_arch(const char *prefix, char *ret, CUresult *err) { *err = get_cc(dev, &major, &minor); if (*err != CUDA_SUCCESS) return GA_IMPL_ERROR; res = snprintf(ret, sz, "%s%d%d", 
prefix, major, minor); - if (res == -1 || res > sz) return GA_UNSUPPORTED_ERROR; + if (res == -1 || res > (ssize_t)sz) return GA_UNSUPPORTED_ERROR; return GA_NO_ERROR; } From 2503b29e5e3d628384f551de06820846d41eaec1 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Fri, 4 Nov 2016 21:21:33 -0400 Subject: [PATCH 058/597] Silence -Wsign-compare in src/gpuarray_util.c --- src/gpuarray_util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_util.c b/src/gpuarray_util.c index 177c632663..d0e134a592 100644 --- a/src/gpuarray_util.c +++ b/src/gpuarray_util.c @@ -173,7 +173,7 @@ void gpuarray_elemwise_collapse(unsigned int n, unsigned int *_nd, int collapse = 1; for (k = 0; k < n; k++) { collapse &= (strs[k] == NULL || - strs[k][i - 1] == dims[i] * strs[k][i]); + strs[k][i - 1] == (ssize_t)dims[i] * strs[k][i]); } if (collapse) { dims[i-1] *= dims[i]; From f30aa31456b140136acc2c278f90b09f2acf701a Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Fri, 4 Nov 2016 21:22:26 -0400 Subject: [PATCH 059/597] Silence -Wempty-body and possible logic fault in src/ gpuarray_buffer_opencl.c --- src/gpuarray_buffer_opencl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index c8917d3778..2738621c1f 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -1453,11 +1453,12 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, static const char *cl_error(gpucontext *c) { cl_ctx *ctx = (cl_ctx *)c; - if (ctx == NULL) + if (ctx == NULL){ return get_error_string(err); - else + }else{ ASSERT_CTX(ctx); return get_error_string(ctx->err); + } } GPUARRAY_LOCAL From 111819fe5f3d2f8d820c2d2626264e4b65497ff7 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sat, 5 Nov 2016 00:52:40 -0400 Subject: [PATCH 060/597] Reverse signedness of comparison in sec/gpuarray_array.c --- src/gpuarray_array.c | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index 0ceb07d97e..b91078f05f 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -280,7 +280,7 @@ int GpuArray_index_inplace(GpuArray *a, const ssize_t *starts, return GA_VALUE_ERROR; } if (steps[i] == 0 && - (starts[i] == -1 || starts[i] >= (ssize_t)a->dimensions[i])) { + (starts[i] == -1 || (size_t)starts[i] >= a->dimensions[i])) { free(newdims); free(newstrs); return GA_VALUE_ERROR; From 9f6b6dfd424dcddac52e893c427a22180ed002c4 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 7 Nov 2016 15:51:09 -0500 Subject: [PATCH 061/597] Handle errors in repr for GpuArrays. --- pygpu/gpuarray.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 99332007a0..968e8a367f 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -1978,7 +1978,10 @@ cdef class GpuArray: return str(numpy.asarray(self)) def __repr__(self): - return 'gpuarray.' + repr(numpy.asarray(self)) + try: + return 'gpuarray.' + repr(numpy.asarray(self)) + except Exception: + return 'gpuarray.array()' From e17f4929d2ef7076f00e107c613af6fde5813e5c Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 7 Nov 2016 16:41:18 -0500 Subject: [PATCH 062/597] Check that we don't overflow int arguments in cublas wrapper. 
--- src/gpuarray/error.h | 1 + src/gpuarray_blas_cuda_cublas.c | 42 ++++++++++++++++++++++++++++++++- src/gpuarray_error.c | 1 + 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/src/gpuarray/error.h b/src/gpuarray/error.h index 1572145c0d..2c25bcd475 100644 --- a/src/gpuarray/error.h +++ b/src/gpuarray/error.h @@ -34,6 +34,7 @@ enum ga_error { GA_NODEV_ERROR, GA_MISC_ERROR, GA_COMM_ERROR, + GA_XLARGE_ERROR, /* Add more error types if needed, but at the end */ /* Don't forget to sync with Gpu_error() */ }; diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index d6a5d55f6e..3ebf8c3502 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -5,7 +5,9 @@ #include "gpuarray/kernel.h" #include "gpuarray/error.h" -#include "cublas_v2.h" +#include + +#include extern const gpuarray_buffer_ops cuda_ops; @@ -33,6 +35,8 @@ typedef struct _blas_handle { cublasStatus_t err; } blas_handle; +#define LARGE_VAL(v) (v >= INT_MAX) + static const char *code_sgemvBH_N_a1_b1_small = \ "extern \"C\"__global__ void sgemv(const float *A[], size_t lda, " \ " const float *x[], size_t incx, " \ @@ -326,6 +330,10 @@ static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB, ASSERT_BUF(B); ASSERT_BUF(C); + if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || + LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc)) + return GA_XLARGE_ERROR; + if (order == cb_c) { /* swap A and B */ t = N; @@ -386,6 +394,10 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, ASSERT_BUF(B); ASSERT_BUF(C); + if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || + LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc)) + return GA_XLARGE_ERROR; + if (order == cb_c) { /* swap A and B */ t = N; @@ -450,6 +462,10 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, ASSERT_BUF(B); ASSERT_BUF(C); + if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || + LARGE_VAL(lda) || LARGE_VAL(ldb) || 
LARGE_VAL(ldc)) + return GA_XLARGE_ERROR; + if (order == cb_c) { /* swap A and B */ t = N; @@ -539,6 +555,10 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, if (batchCount == 0) return GA_NO_ERROR; + if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || + LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc)) + return GA_XLARGE_ERROR; + ASSERT_BUF(A[0]); ctx = A[0]->ctx; h = (blas_handle *)ctx->blas_handle; @@ -659,6 +679,10 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, if (batchCount == 0) return GA_NO_ERROR; + if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || + LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc)) + return GA_XLARGE_ERROR; + ASSERT_BUF(A[0]); ctx = A[0]->ctx; h = (blas_handle *)ctx->blas_handle; @@ -782,6 +806,10 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, ASSERT_BUF(X); ASSERT_BUF(Y); + if (LARGE_VAL(M) || LARGE_VAL(N) || + LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) + return GA_XLARGE_ERROR; + if (order == cb_c) { t = N; N = M; @@ -833,6 +861,10 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, ASSERT_BUF(X); ASSERT_BUF(Y); + if (LARGE_VAL(M) || LARGE_VAL(N) || + LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) + return GA_XLARGE_ERROR; + if (order == cb_c) { t = N; N = M; @@ -1149,6 +1181,10 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, ASSERT_BUF(Y); ASSERT_BUF(A); + if (LARGE_VAL(M) || LARGE_VAL(N) || + LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) + return GA_XLARGE_ERROR; + if (order == cb_c) { t = M; M = N; @@ -1202,6 +1238,10 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, ASSERT_BUF(Y); ASSERT_BUF(A); + if (LARGE_VAL(M) || LARGE_VAL(N) || + LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) + return GA_XLARGE_ERROR; + if (order == cb_c) { t = M; M = N; diff --git a/src/gpuarray_error.c b/src/gpuarray_error.c index 
5194a2af03..a01a640ce7 100644 --- a/src/gpuarray_error.c +++ b/src/gpuarray_error.c @@ -23,6 +23,7 @@ const char *gpuarray_error_str(int err) { case GA_NODEV_ERROR: return "No devices are available"; case GA_MISC_ERROR: return "Undeterminate error"; case GA_COMM_ERROR: return "Error in collectives call"; + case GA_XLARGE_ERROR: return "Input size too large for operation"; default: return "Unknown GA error"; } } From ce8c98d2469fb35eaa58b18b9cba4ca68dce5c14 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 7 Nov 2016 16:55:30 -0500 Subject: [PATCH 063/597] Also check products to protect cublas which doesn't do this check. --- src/gpuarray_blas_cuda_cublas.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 3ebf8c3502..f756c20651 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -331,7 +331,8 @@ static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB, ASSERT_BUF(C); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || - LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc)) + LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || + LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) return GA_XLARGE_ERROR; if (order == cb_c) { @@ -395,7 +396,8 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, ASSERT_BUF(C); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || - LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc)) + LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || + LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) return GA_XLARGE_ERROR; if (order == cb_c) { @@ -463,7 +465,8 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, ASSERT_BUF(C); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || - LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc)) + LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || + LARGE_VAL(M * N) || 
LARGE_VAL(M * K) || LARGE_VAL(K * N)) return GA_XLARGE_ERROR; if (order == cb_c) { @@ -556,7 +559,8 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, if (batchCount == 0) return GA_NO_ERROR; if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || - LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc)) + LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || + LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) return GA_XLARGE_ERROR; ASSERT_BUF(A[0]); @@ -680,7 +684,8 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, if (batchCount == 0) return GA_NO_ERROR; if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || - LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc)) + LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || + LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) return GA_XLARGE_ERROR; ASSERT_BUF(A[0]); @@ -806,7 +811,7 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, ASSERT_BUF(X); ASSERT_BUF(Y); - if (LARGE_VAL(M) || LARGE_VAL(N) || + if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) || LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) return GA_XLARGE_ERROR; @@ -861,7 +866,7 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, ASSERT_BUF(X); ASSERT_BUF(Y); - if (LARGE_VAL(M) || LARGE_VAL(N) || + if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) || LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) return GA_XLARGE_ERROR; @@ -1181,7 +1186,7 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, ASSERT_BUF(Y); ASSERT_BUF(A); - if (LARGE_VAL(M) || LARGE_VAL(N) || + if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) || LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) return GA_XLARGE_ERROR; @@ -1238,7 +1243,7 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, ASSERT_BUF(Y); ASSERT_BUF(A); - if (LARGE_VAL(M) || LARGE_VAL(N) || + if (LARGE_VAL(M) || LARGE_VAL(N) || 
LARGE_VAL(M * N) || LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) return GA_XLARGE_ERROR; From 3c504835e8ef55eadbc47f6c70ba8d37ec52f0ee Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 8 Nov 2016 22:15:48 -0500 Subject: [PATCH 064/597] Fix for scheduling in some corner cases. --- src/gpuarray_array.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index b91078f05f..fbd1aa777a 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -499,8 +499,11 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i, pl = ls[0]; ls[0] = ls[1]; ls[1] = pl; + gs[0] = 1; + } else { + gs[0] = gs[1]; + gs[1] = 1; } - gs[0] = 1; argp = 0; GpuKernel_setarg(&k, argp++, a->data); From 2bcb60592b92debfd681e22a11861f5d4d4955d3 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Thu, 10 Nov 2016 10:54:58 -0500 Subject: [PATCH 065/597] Code added to get PCI Bus ID from a CUDA device. Some workaround has been done to get the same thing from OpenCL, but both OpenCL and CUDA don't print the same info. So for the moment, getting PCI Bus ID is available from CUDA and "not supported" for OpenCL (see commented code (will be removed after reviewing!)). 
Recall: PCI Bus ID is printed as: domain:bus:device.function where domain, bus, device and function are hexadecimal strings: domain: 16 bits info (4 characters) bus: 8 bits info (2 characters) device: 5 bits info (2 characters) function: 3 bits info (1 character) Reference: http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1gea264dad3d8c4898e0b82213c0253def http://www.makelinux.net/ldd3/chp-12-sect-1 --- .gitignore | 1 + pygpu/gpuarray.pxd | 1 + pygpu/gpuarray.pyx | 13 ++++++++++ src/gpuarray/buffer.h | 9 +++++++ src/gpuarray_buffer_cuda.c | 24 ++++++++++++++++++ src/gpuarray_buffer_opencl.c | 48 ++++++++++++++++++++++++++++++++++++ src/private_cuda.h | 2 ++ 7 files changed, 98 insertions(+) diff --git a/.gitignore b/.gitignore index 4261cf0d30..1bc8dee8c7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +Build build Debug Release diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd index dd09afcbb2..c22504d4f3 100644 --- a/pygpu/gpuarray.pxd +++ b/pygpu/gpuarray.pxd @@ -87,6 +87,7 @@ cdef extern from "gpuarray/buffer.h": int GA_CTX_DISABLE_ALLOCATION_CACHE int GA_CTX_PROP_DEVNAME + int GA_CTX_PROP_PCIBUSID int GA_CTX_PROP_MAXLSIZE int GA_CTX_PROP_LMEMSIZE int GA_CTX_PROP_NUMPROCS diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 968e8a367f..8e45f1f227 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -1016,6 +1016,19 @@ cdef class GpuContext: free(tmp) return res + property pcibusid: + "Device PCI Bus ID for this context" + def __get__(self): + cdef char *tmp + cdef unicode res + + ctx_property(self, GA_CTX_PROP_PCIBUSID, &tmp) + try: + res = tmp.decode('ascii') + finally: + free(tmp) + return res + property maxlsize: "Maximum size of thread block (local size) for this context" def __get__(self): diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index ee83a80f67..bd33e9f69c 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -680,6 +680,15 @@ GPUARRAY_PUBLIC gpucontext 
*gpukernel_context(gpukernel *k); */ #define GA_CTX_PROP_COMM_OPS 18 +/** + * Get the device PCI Bus ID for the context. + * + * \note The returned string is allocated and must be freed by the caller. + * + * Type: `char *` + */ +#define GA_CTX_PROP_PCIBUSID 19 + /* Start at 512 for GA_BUFFER_PROP_ */ #define GA_BUFFER_PROP_START 512 diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 70dbbc2a9b..174f1de5b3 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1502,6 +1502,30 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, *((char **)res) = s; cuda_exit(ctx); return GA_NO_ERROR; + + case GA_CTX_PROP_PCIBUSID: + cuda_enter(ctx); + ctx->err = cuCtxGetDevice(&id); + if (ctx->err != CUDA_SUCCESS) { + cuda_exit(ctx); + return GA_IMPL_ERROR; + } + s = malloc(13); + if (s == NULL) { + cuda_exit(ctx); + return GA_MEMORY_ERROR; + } + ctx->err = cudaDeviceGetPCIBusId(s, 13, id); + if (ctx->err != CUDA_SUCCESS) { + /* PS: in GA_CTX_PROP_DEVNAME above, s is not freed here. + * I think it should be freed, isn't it ? */ + free(s); + cuda_exit(ctx); + return GA_IMPL_ERROR; + } + *((char **)res) = s; + cuda_exit(ctx); + return GA_NO_ERROR; case GA_CTX_PROP_MAXLSIZE: cuda_enter(ctx); diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 2738621c1f..3f3c105fda 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -1152,6 +1152,14 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, size_t *psz; cl_device_id id; cl_uint ui; + /* For GA_CTX_PROP_PCIBUSID (currently desactivated). + * According to http://www.makelinux.net/ldd3/chp-12-sect-1 + * (accessed on 2016/11/09:15h41 EST): + * domain: 16 bits, bus: 8 bits, device: 5 bits, function: 3 bits. 
*/ + /* + uint32_t* busid; + uint32_t domain = 0, bus = 0, device = 0, function = 0; + */ case GA_CTX_PROP_DEVNAME: ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), @@ -1172,6 +1180,46 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, *((char **)res) = s; return GA_NO_ERROR; + case GA_CTX_PROP_PCIBUSID: + /* PS: Currently desactivated. This does not print the same + * Bus ID as cuda and nvidia-smi. For the moment, I don't find + * which info will display the correct PCI Bus ID with OpenCL. */ + /* + ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), + &id, NULL); + if (ctx->err != CL_SUCCESS) + return GA_IMPL_ERROR; + ctx->err = clGetDeviceInfo(id, CL_DEVICE_VENDOR_ID, 0, NULL, &sz); + if (ctx->err != CL_SUCCESS) + return GA_IMPL_ERROR; + if(sz != 4) + return GA_IMPL_ERROR; + busid = malloc(sz); + if (busid == NULL) + return GA_MEMORY_ERROR; + ctx->err = clGetDeviceInfo(id, CL_DEVICE_VENDOR_ID, sz, busid, NULL); + if (ctx->err != CL_SUCCESS) { + free(busid); + return GA_IMPL_ERROR; + } + domain = *busid >> (32-16); + bus = *busid << 16 >> (32-8); + device = *busid << 24 >> (32-5); + function = *busid << 29 >> (32-3); + free(busid); + s = malloc(13); + sprintf(s, "%04x", domain); + sprintf(s + 5, "%02x", bus); + sprintf(s + 8, "%02x", device); + sprintf(s + 11, "%01x", function); + s[4] = s[7] = ':'; + s[10] = '.'; + *((char **)res) = s; + return GA_NO_ERROR; + */ + *((void **)res) = NULL; + return GA_DEVSUP_ERROR; + case GA_CTX_PROP_MAXLSIZE: ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id, NULL); diff --git a/src/private_cuda.h b/src/private_cuda.h index a85bd45956..481142c122 100644 --- a/src/private_cuda.h +++ b/src/private_cuda.h @@ -3,8 +3,10 @@ #ifdef __APPLE__ #include +#include #else #include +#include #endif #include From 2d6ee2a8fc735271a42d7b0c7bbed4bbfd8073aa Mon Sep 17 00:00:00 2001 From: notoraptor Date: Thu, 10 Nov 2016 13:08:37 -0500 Subject: [PATCH 
066/597] Correction: using driver API instead of runtime API. Recompiled and retested successfully. --- src/gpuarray_buffer_cuda.c | 2 +- src/gpuarray_buffer_opencl.c | 45 +----------------------------------- src/private_cuda.h | 2 -- 3 files changed, 2 insertions(+), 47 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 174f1de5b3..77b04bbf07 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1515,7 +1515,7 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, cuda_exit(ctx); return GA_MEMORY_ERROR; } - ctx->err = cudaDeviceGetPCIBusId(s, 13, id); + ctx->err = cuDeviceGetPCIBusId(s, 13, id); if (ctx->err != CUDA_SUCCESS) { /* PS: in GA_CTX_PROP_DEVNAME above, s is not freed here. * I think it should be freed, isn't it ? */ diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 3f3c105fda..f48909e663 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -1152,14 +1152,6 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, size_t *psz; cl_device_id id; cl_uint ui; - /* For GA_CTX_PROP_PCIBUSID (currently desactivated). - * According to http://www.makelinux.net/ldd3/chp-12-sect-1 - * (accessed on 2016/11/09:15h41 EST): - * domain: 16 bits, bus: 8 bits, device: 5 bits, function: 3 bits. */ - /* - uint32_t* busid; - uint32_t domain = 0, bus = 0, device = 0, function = 0; - */ case GA_CTX_PROP_DEVNAME: ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), @@ -1181,42 +1173,7 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, return GA_NO_ERROR; case GA_CTX_PROP_PCIBUSID: - /* PS: Currently desactivated. This does not print the same - * Bus ID as cuda and nvidia-smi. For the moment, I don't find - * which info will display the correct PCI Bus ID with OpenCL. 
*/ - /* - ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), - &id, NULL); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; - ctx->err = clGetDeviceInfo(id, CL_DEVICE_VENDOR_ID, 0, NULL, &sz); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; - if(sz != 4) - return GA_IMPL_ERROR; - busid = malloc(sz); - if (busid == NULL) - return GA_MEMORY_ERROR; - ctx->err = clGetDeviceInfo(id, CL_DEVICE_VENDOR_ID, sz, busid, NULL); - if (ctx->err != CL_SUCCESS) { - free(busid); - return GA_IMPL_ERROR; - } - domain = *busid >> (32-16); - bus = *busid << 16 >> (32-8); - device = *busid << 24 >> (32-5); - function = *busid << 29 >> (32-3); - free(busid); - s = malloc(13); - sprintf(s, "%04x", domain); - sprintf(s + 5, "%02x", bus); - sprintf(s + 8, "%02x", device); - sprintf(s + 11, "%01x", function); - s[4] = s[7] = ':'; - s[10] = '.'; - *((char **)res) = s; - return GA_NO_ERROR; - */ + /* For the moment, PCI Bus ID is not supported for OpenCL. */ *((void **)res) = NULL; return GA_DEVSUP_ERROR; diff --git a/src/private_cuda.h b/src/private_cuda.h index 481142c122..a85bd45956 100644 --- a/src/private_cuda.h +++ b/src/private_cuda.h @@ -3,10 +3,8 @@ #ifdef __APPLE__ #include -#include #else #include -#include #endif #include From c9ceee1cbf339b7548912217070fc0ba6092dc96 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 21 Oct 2016 13:50:07 -0400 Subject: [PATCH 067/597] Add thin wrapper around library loading functions. 
--- src/util/dyn_load.c | 26 ++++++++++++++++++++++++++ src/util/dyn_load.h | 7 +++++++ 2 files changed, 33 insertions(+) create mode 100644 src/util/dyn_load.c create mode 100644 src/util/dyn_load.h diff --git a/src/util/dyn_load.c b/src/util/dyn_load.c new file mode 100644 index 0000000000..bc098ceaf5 --- /dev/null +++ b/src/util/dyn_load.c @@ -0,0 +1,26 @@ +#include "util/dyn_load.h" + +#ifdef __unix__ + +#include + +void *ga_load_library(const char *name) { + return dlopen(name, RTLD_LAZY|RTLD_LOCAL); +} + +void *ga_func_ptr(void *h, const char *name) { + return dlsym(h, name); +} + +#else +/* Should be windows */ + +void *ga_load_library(const char *name) { + return LoadLibrary(name); +} + +void *ga_func_ptr(void *h, const char *name) { + return (void *)GetProcAddress(h, name); +} + +#endif diff --git a/src/util/dyn_load.h b/src/util/dyn_load.h new file mode 100644 index 0000000000..73fea5d69f --- /dev/null +++ b/src/util/dyn_load.h @@ -0,0 +1,7 @@ +#ifndef UTIL_DYN_LOAD_H +#define UTIL_DYN_LOAD_H + +void *ga_load_library(const char *name); +void *ga_func_ptr(void *h, const char *name); + +#endif From 9776c9d37b399b834bb38d071bdb2157cbfbb1d7 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 9 Nov 2016 14:24:59 -0500 Subject: [PATCH 068/597] Bump requirements to cuda 7.0 and require nvrtc. 
--- src/CMakeLists.txt | 25 +--- src/gpuarray_buffer_cuda.c | 188 +------------------------------ src/{util => loaders}/dyn_load.c | 0 src/{util => loaders}/dyn_load.h | 0 src/private_cuda.h | 4 - 5 files changed, 11 insertions(+), 206 deletions(-) rename src/{util => loaders}/dyn_load.c (100%) rename src/{util => loaders}/dyn_load.h (100%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1a8855d74a..ee908642d9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -78,18 +78,10 @@ if(NOT HAVE_MKSTEMP) endif() if (CUDA_FOUND) - if(NCCL_FOUND) - if (CUDA_VERSION_MAJOR LESS 7) - message( WARNING "This package requires CUDA 7.0 or more (building with NCCL). Found version ${CUDA_VERSION_STRING}") - set(CUDA_FOUND 0) - endif() - else(NCCL_FOUND) - if (CUDA_VERSION_MAJOR LESS 6 OR - (CUDA_VERSION_MAJOR EQUAL 6 AND CUDA_VERSION_MINOR EQUAL 0)) - message( WARNING "This package requires CUDA 6.5 or more. Found version ${CUDA_VERSION_STRING}") - set(CUDA_FOUND 0) - endif() - endif(NCCL_FOUND) + if (CUDA_VERSION_MAJOR LESS 7) + message( WARNING "This package requires CUDA 7.0 or more (building with NCCL). 
Found version ${CUDA_VERSION_STRING}") + set(CUDA_FOUND 0) + endif() endif() if (CUDA_FOUND) @@ -106,14 +98,7 @@ if (CUDA_FOUND) find_cuda_helper_libs(nvrtc) - if(CUDA_nvrtc_LIBRARY) - message(STATUS "Building with NVRTC") - add_definitions(-DWITH_NVRTC) - set(CUDADRV_LIBRARY ${CUDADRV_LIBRARY} ${CUDA_nvrtc_LIBRARY}) - else() - add_definitions(-DNVCC_BIN=${CUDA_NVCC_EXECUTABLE}) - endif() - + set(CUDADRV_LIBRARY ${CUDADRV_LIBRARY} ${CUDA_nvrtc_LIBRARY}) list(APPEND _GPUARRAY_SRC gpuarray_buffer_cuda.c) add_definitions(-DWITH_CUDA) include_directories(${CUDADRV_INCLUDE}) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 77b04bbf07..b54be3bdf3 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -2,6 +2,7 @@ #include "private.h" #include "private_cuda.h" +#include #include @@ -28,8 +29,11 @@ STATIC_ASSERT(sizeof(GpuArrayIpcMemHandle) == sizeof(CUipcMemHandle), cuda_ipcme /* Allocations will be made in blocks of at least this size */ #define BLOCK_SIZE (4 * 1024 * 1024) -/* No returned allocations will be smaller than this size. - Also, they will be aligned to this size. */ +/* No returned allocations will be smaller than this size. Also, they + * will be aligned to this size. + * + * Some libraries depend on this value and will crash if it's smaller. 
+ */ #define FRAG_SIZE (64) static CUresult err; @@ -150,9 +154,7 @@ static void deallocate(gpudata *); static void cuda_free_ctx(cuda_context *ctx) { gpuarray_blas_ops *blas_ops; gpudata *next, *curr; -#if CUDA_VERSION >= 7000 CUdevice dev; -#endif ASSERT_CTX(ctx); ctx->refcnt--; @@ -179,14 +181,10 @@ static void cuda_free_ctx(cuda_context *ctx) { cache_destroy(ctx->kernel_cache); if (!(ctx->flags & DONTFREE)) { -#if CUDA_VERSION < 7000 - cuCtxDestroy(ctx->ctx); -#else cuCtxPushCurrent(ctx->ctx); cuCtxGetDevice(&dev); cuCtxPopCurrent(NULL); cuDevicePrimaryCtxRelease(dev); -#endif } CLEAR(ctx); free(ctx); @@ -324,10 +322,8 @@ static cuda_context *do_init(CUdevice dev, int flags, int *ret) { cuda_context *res; CUcontext ctx; unsigned int fl = CU_CTX_SCHED_AUTO; -#if CUDA_VERSION >= 7000 unsigned int cur_fl; int act; -#endif int i; CHKFAIL(NULL); @@ -339,10 +335,6 @@ static cuda_context *do_init(CUdevice dev, int flags, int *ret) { CHKFAIL(NULL); if (i != 1) FAIL(NULL, GA_UNSUPPORTED_ERROR); -#if CUDA_VERSION < 7000 - err = cuCtxCreate(&ctx, fl, dev); - CHKFAIL(NULL); -#else err = cuDevicePrimaryCtxGetState(dev, &cur_fl, &act); CHKFAIL(NULL); if (act == 1) { @@ -356,14 +348,9 @@ static cuda_context *do_init(CUdevice dev, int flags, int *ret) { CHKFAIL(NULL); err = cuCtxPushCurrent(ctx); CHKFAIL(NULL); -#endif res = cuda_make_ctx(ctx, flags); if (res == NULL) { -#if CUDA_VERSION < 7000 - cuCtxDestroy(ctx); -#else cuDevicePrimaryCtxRelease(dev); -#endif FAIL(NULL, GA_IMPL_ERROR); } /* Don't leave the context on the thread stack */ @@ -875,10 +862,6 @@ static int detect_arch(const char *prefix, char *ret, CUresult *err) { return GA_NO_ERROR; } -#ifdef WITH_NVRTC - -#include - static void *call_compiler(const char *src, size_t len, const char *arch_arg, size_t *bin_len, char **log, size_t *log_len, int *ret) { @@ -940,165 +923,6 @@ static void *call_compiler(const char *src, size_t len, const char *arch_arg, return buf; } -#else /* WITH_NVRTC */ - -#include - -#include 
-#include - -#ifdef _WIN32 -#include -/* I am really tired of hunting through online docs - * to find where the define is. 256 seem to be the - * consensus for the value so there it is. - */ -#define PATH_MAX 256 -#else -#include -#include -#endif - -#ifdef _MSC_VER -#include -#define read _read -#define write _write -#define close _close -#define unlink _unlink -#define fstat _fstat -#define open _open -#else -#include -#endif - -static const char *TMP_VAR_NAMES[] = {"GPUARRAY_TMPDIR", "TMPDIR", "TMP", - "TEMP", "USERPROFILE"}; - - -static void *call_compiler(const char *src, size_t len, const char *arch_arg, - size_t *bin_len, char **log, size_t *log_len, - int *ret) { - char namebuf[PATH_MAX]; - char outbuf[PATH_MAX]; - char *tmpdir; - struct stat st; - ssize_t s; -#ifndef _WIN32 - pid_t p; -#endif - unsigned int i; - int sys_err; - int fd; - char *buf; - - for (i = 0; i < sizeof(TMP_VAR_NAMES)/sizeof(TMP_VAR_NAMES[0]); i++) { - tmpdir = getenv(TMP_VAR_NAMES[i]); - if (tmpdir != NULL) break; - } - if (tmpdir == NULL) { -#ifdef _WIN32 - tmpdir = "."; -#else - tmpdir = "/tmp"; -#endif - } - - strlcpy(namebuf, tmpdir, sizeof(namebuf)); - strlcat(namebuf, "/gpuarray.cuda.XXXXXXXX", sizeof(namebuf)); - - fd = mkstemp(namebuf); - if (fd == -1) FAIL(NULL, GA_SYS_ERROR); - - strlcpy(outbuf, namebuf, sizeof(outbuf)); - strlcat(outbuf, ".cubin", sizeof(outbuf)); - - /* Don't want to write the final NUL */ - s = write(fd, src, len-1); - close(fd); - /* fd is not non-blocking; should have complete write */ - if (s == -1) { - unlink(namebuf); - FAIL(NULL, GA_SYS_ERROR); - } - - /* This block executes nvcc on the written-out file */ -#ifdef DEBUG -#define NVCC_ARGS NVCC_BIN, "-g", "-G", "-arch", arch_arg, "-x", "cu", \ - "--cubin", namebuf, "-o", outbuf -#else -#define NVCC_ARGS NVCC_BIN, "-arch", arch_arg, "-x", "cu", \ - "--cubin", namebuf, "-o", outbuf -#endif -#ifdef _WIN32 - sys_err = _spawnl(_P_WAIT, NVCC_BIN, NVCC_ARGS, NULL); - unlink(namebuf); - if (sys_err == -1) 
FAIL(NULL, GA_SYS_ERROR); - if (sys_err != 0) FAIL(NULL, GA_RUN_ERROR); -#else - p = fork(); - if (p == 0) { - execl(NVCC_BIN, NVCC_ARGS, NULL); - exit(1); - } - if (p == -1) { - unlink(namebuf); - FAIL(NULL, GA_SYS_ERROR); - } - - /* We need to wait until after the waitpid for the unlink because otherwise - we might delete the input file before nvcc is finished with it. */ - if (waitpid(p, &sys_err, 0) == -1) { - unlink(namebuf); - unlink(outbuf); - FAIL(NULL, GA_SYS_ERROR); - } else { -#ifdef DEBUG - /* Only cleanup if GPUARRAY_NOCLEANUP is not set */ - if (getenv("GPUARRAY_NOCLEANUP") == NULL) -#endif - unlink(namebuf); - } - - if (WIFSIGNALED(sys_err) || WEXITSTATUS(sys_err) != 0) { - unlink(outbuf); - FAIL(NULL, GA_RUN_ERROR); - } -#endif - - fd = open(outbuf, O_RDONLY); - if (fd == -1) { - unlink(outbuf); - FAIL(NULL, GA_SYS_ERROR); - } - - if (fstat(fd, &st) == -1) { - close(fd); - unlink(outbuf); - FAIL(NULL, GA_SYS_ERROR); - } - - buf = malloc((size_t)st.st_size); - if (buf == NULL) { - close(fd); - unlink(outbuf); - FAIL(NULL, GA_SYS_ERROR); - } - - s = read(fd, buf, (size_t)st.st_size); - close(fd); - unlink(outbuf); - /* fd is blocking; should have complete read */ - if (s == -1) { - free(buf); - FAIL(NULL, GA_SYS_ERROR); - } - - *bin_len = (size_t)st.st_size; - return buf; -} - -#endif /* WITH_NVRTC */ - static void _cuda_freekernel(gpukernel *k) { k->refcnt--; if (k->refcnt == 0) { diff --git a/src/util/dyn_load.c b/src/loaders/dyn_load.c similarity index 100% rename from src/util/dyn_load.c rename to src/loaders/dyn_load.c diff --git a/src/util/dyn_load.h b/src/loaders/dyn_load.h similarity index 100% rename from src/util/dyn_load.h rename to src/loaders/dyn_load.h diff --git a/src/private_cuda.h b/src/private_cuda.h index a85bd45956..2b59db6837 100644 --- a/src/private_cuda.h +++ b/src/private_cuda.h @@ -93,11 +93,7 @@ STATIC_ASSERT(sizeof(cuda_context) <= sizeof(gpucontext), * flag. 
*/ -#ifdef WITH_NVRTC #define ARCH_PREFIX "compute_" -#else -#define ARCH_PREFIX "sm_" -#endif GPUARRAY_LOCAL cuda_context *cuda_make_ctx(CUcontext ctx, int flags); GPUARRAY_LOCAL CUstream cuda_get_stream(cuda_context *ctx); From eea7132a2b5a40121f40d08631aa7689c6e3dc08 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 9 Nov 2016 18:18:13 -0500 Subject: [PATCH 069/597] Stop allocating memory directly in the blas bindings. --- src/gpuarray_blas_cuda_cublas.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index f756c20651..b26ee117b3 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -621,7 +621,8 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, const float **A_l = (const float **)T_l; const float **B_l = (const float **)T_l + batchCount; float **C_l = T_l + (batchCount * 2); - CUdeviceptr Ta, Aa, Ba, Ca; + gpudata *Ta; + CUdeviceptr Aa, Ba, Ca; for (i = 0; i < batchCount; i++) { ASSERT_BUF(A[i]); @@ -635,12 +636,13 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, C_l[i] = ((float *)C[i]->ptr) + offC[i]; } - cuMemAlloc(&Ta, sizeof(float *) * batchCount * 3); - Aa = Ta; - Ba = Ta + (batchCount * sizeof(float *)); - Ca = Ta + (batchCount * sizeof(float *) * 2); + Ta = gpudata_alloc((gpucontext *)ctx, sizeof(float *) * batchCount * 3, + NULL, 0, NULL); + Aa = *(CUdeviceptr *)Ta; + Ba = Aa + (batchCount * sizeof(float *)); + Ca = Aa + (batchCount * sizeof(float *) * 2); - cuMemcpyHtoD(Ta, T_l, sizeof(float *) * batchCount * 3); + gpudata_write(Ta, 0, T_l, sizeof(float *) * batchCount * 3); h->err = cublasSgemmBatched(h->h, convT(transA), convT(transB), @@ -648,7 +650,7 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, (const float **)Aa, lda, (const float **)Ba, ldb, &beta, (float **)Ca, ldc, batchCount); - 
cuMemFree(Ta); + gpudata_release(Ta); if (h->err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) @@ -746,7 +748,8 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, const double **A_l = (const double **)T_l; const double **B_l = (const double **)T_l + batchCount; double **C_l = T_l + (batchCount * 2); - CUdeviceptr Ta, Aa, Ba, Ca; + gpudata *Ta; + CUdeviceptr Aa, Ba, Ca; for (i = 0; i < batchCount; i++) { ASSERT_BUF(A[i]); @@ -760,12 +763,13 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, C_l[i] = ((double *)C[i]->ptr) + offC[i]; } - cuMemAlloc(&Ta, sizeof(double *) * batchCount * 3); - Aa = Ta; - Ba = Ta + (batchCount * sizeof(double *)); - Ca = Ta + (batchCount * sizeof(double *) * 2); + Ta = gpudata_alloc((gpucontext *)ctx, sizeof(double *) * batchCount * 3, + NULL, 0, NULL); + Aa = *(CUdeviceptr *)Ta; + Ba = Aa + (batchCount * sizeof(double *)); + Ca = Aa + (batchCount * sizeof(double *) * 2); - cuMemcpyHtoD(Ta, T_l, sizeof(double *) * batchCount * 3); + gpudata_write(Ta, 0, T_l, sizeof(double *) * batchCount * 3); h->err = cublasDgemmBatched(h->h, convT(transA), convT(transB), @@ -773,7 +777,7 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, (const double **)Aa, lda, (const double **)Ba, ldb, &beta, (double **)Ca, ldc, batchCount); - cuMemFree(Ta); + gpudata_release(Ta); if (h->err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) From 34397f752c228e877ea5f9bd16f814a7179a33f5 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 9 Nov 2016 17:36:32 -0500 Subject: [PATCH 070/597] Loader for libcuda (driver). 
--- src/CMakeLists.txt | 12 ++- src/gpuarray/error.h | 1 + src/gpuarray_buffer_cuda.c | 43 +++++---- src/gpuarray_error.c | 1 + src/loaders/CMakeLists.txt | 4 + src/loaders/dyn_load.c | 4 +- src/loaders/libcuda.c | 52 +++++++++++ src/loaders/libcuda.fn | 48 ++++++++++ src/loaders/libcuda.h | 176 +++++++++++++++++++++++++++++++++++++ src/private_cuda.h | 6 +- 10 files changed, 316 insertions(+), 31 deletions(-) create mode 100644 src/loaders/CMakeLists.txt create mode 100644 src/loaders/libcuda.c create mode 100644 src/loaders/libcuda.fn create mode 100644 src/loaders/libcuda.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ee908642d9..83e6dbd501 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -86,19 +86,16 @@ endif() if (CUDA_FOUND) if (APPLE) - FIND_LIBRARY(CUDADRV_LIBRARY CUDA) FIND_PATH(CUDADRV_INCLUDE CUDA/cuda.h) # this is somewhat a hack, but otherwise cublas_v2.h isn't found set(CUDADRV_INCLUDE ${CUDADRV_INCLUDE} ${CUDA_TOOLKIT_INCLUDE}) endif() - if(NOT CUDADRV_LIBRARY) - set(CUDADRV_LIBRARY ${CUDA_CUDA_LIBRARY}) + if(NOT CUDADRV_INCLUDE) set(CUDADRV_INCLUDE ${CUDA_TOOLKIT_INCLUDE}) endif() find_cuda_helper_libs(nvrtc) - set(CUDADRV_LIBRARY ${CUDADRV_LIBRARY} ${CUDA_nvrtc_LIBRARY}) list(APPEND _GPUARRAY_SRC gpuarray_buffer_cuda.c) add_definitions(-DWITH_CUDA) include_directories(${CUDADRV_INCLUDE}) @@ -146,9 +143,10 @@ configure_file( ) add_subdirectory(util) +add_subdirectory(loaders) set_rel(GPUARRAY_SRC ${_GPUARRAY_SRC}) -list(APPEND GPUARRAY_SRC ${UTIL_SRC}) +list(APPEND GPUARRAY_SRC ${UTIL_SRC} ${LOADERS_SRC}) add_library(gpuarray SHARED ${GPUARRAY_SRC}) set_target_properties(gpuarray PROPERTIES @@ -160,8 +158,8 @@ set_target_properties(gpuarray PROPERTIES add_library(gpuarray-static STATIC ${GPUARRAY_SRC}) if(CUDA_FOUND) - target_link_libraries(gpuarray ${CUDADRV_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) - target_link_libraries(gpuarray-static ${CUDADRV_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) + target_link_libraries(gpuarray 
${CUDA_nvrtc_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) + target_link_libraries(gpuarray-static ${CUDA_nvrtc_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) if (NCCL_FOUND) target_link_libraries(gpuarray ${NCCL_LIBRARY}) target_link_libraries(gpuarray-static ${NCCL_LIBRARY}) diff --git a/src/gpuarray/error.h b/src/gpuarray/error.h index 2c25bcd475..af963c1531 100644 --- a/src/gpuarray/error.h +++ b/src/gpuarray/error.h @@ -35,6 +35,7 @@ enum ga_error { GA_MISC_ERROR, GA_COMM_ERROR, GA_XLARGE_ERROR, + GA_LOAD_ERROR, /* Add more error types if needed, but at the end */ /* Don't forget to sync with Gpu_error() */ }; diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index b54be3bdf3..1052d76fde 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -37,7 +37,6 @@ STATIC_ASSERT(sizeof(GpuArrayIpcMemHandle) == sizeof(CUipcMemHandle), cuda_ipcme #define FRAG_SIZE (64) static CUresult err; -static int init_done = 0; GPUARRAY_LOCAL const gpuarray_buffer_ops cuda_ops; @@ -61,6 +60,21 @@ static uint32_t strb_hash(void *_k) { return XXH32(k->s, k->l, 42); } +static int setup_done = 0; +static int setup_lib(void) { + int res; + if (!setup_done) { + res = load_libcuda(); + if (res != GA_NO_ERROR) + return err; + err = cuInit(0); + if (err != CUDA_SUCCESS) + return GA_IMPL_ERROR; + setup_done = 1; + } + return GA_NO_ERROR; +} + static int cuda_get_platform_count(unsigned int* platcount) { *platcount = 1; // CUDA works on NVIDIA's GPUs return GA_NO_ERROR; @@ -70,12 +84,7 @@ static int cuda_get_device_count(unsigned int platform, unsigned int* devcount) { int dv; // platform number gets ignored in CUDA implementation - if (!init_done) { - err = cuInit(0); - if (err != CUDA_SUCCESS) - return GA_IMPL_ERROR; - init_done = 1; - } + GA_CHECK(setup_lib()); err = cuDeviceGetCount(&dv); if (err != CUDA_SUCCESS) return GA_IMPL_ERROR; @@ -86,6 +95,11 @@ static int cuda_get_device_count(unsigned int platform, cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { 
cuda_context *res; void *p; + int e; + + e = setup_lib(); + if (e != GA_NO_ERROR) + return NULL; res = calloc(1, sizeof(*res)); if (res == NULL) @@ -361,12 +375,11 @@ static cuda_context *do_init(CUdevice dev, int flags, int *ret) { static gpucontext *cuda_init(int ord, int flags, int *ret) { CUdevice dev; cuda_context *res; + int r; - if (!init_done) { - err = cuInit(0); - CHKFAIL(NULL); - init_done = 1; - } + r = setup_lib(); + if (r != GA_NO_ERROR) + return NULL; if (ord == -1) { int i, c; @@ -833,9 +846,6 @@ static int cuda_memset(gpudata *dst, size_t dstoff, int data) { } static CUresult get_cc(CUdevice dev, int *maj, int *min) { -#if CUDA_VERSION < 6500 - return cuDeviceComputeCapability(maj, min, dev); -#else CUresult lerr; lerr = cuDeviceGetAttribute(maj, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, @@ -845,7 +855,6 @@ static CUresult get_cc(CUdevice dev, int *maj, int *min) { return cuDeviceGetAttribute(min, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev); -#endif } static int detect_arch(const char *prefix, char *ret, CUresult *err) { @@ -976,7 +985,7 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } - ctx->err = cuDeviceComputeCapability(&major, &minor, dev); + ctx->err = get_cc(dev, &major, &minor); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); diff --git a/src/gpuarray_error.c b/src/gpuarray_error.c index a01a640ce7..b7d5011f5b 100644 --- a/src/gpuarray_error.c +++ b/src/gpuarray_error.c @@ -24,6 +24,7 @@ const char *gpuarray_error_str(int err) { case GA_MISC_ERROR: return "Undeterminate error"; case GA_COMM_ERROR: return "Error in collectives call"; case GA_XLARGE_ERROR: return "Input size too large for operation"; + case GA_LOAD_ERROR: return "Error loading library"; default: return "Unknown GA error"; } } diff --git a/src/loaders/CMakeLists.txt b/src/loaders/CMakeLists.txt new file mode 100644 index 0000000000..4d1553406a --- /dev/null +++ 
b/src/loaders/CMakeLists.txt @@ -0,0 +1,4 @@ +set_rel(LOADERS_SRC +dyn_load.c +libcuda.c +) \ No newline at end of file diff --git a/src/loaders/dyn_load.c b/src/loaders/dyn_load.c index bc098ceaf5..6742ace487 100644 --- a/src/loaders/dyn_load.c +++ b/src/loaders/dyn_load.c @@ -1,8 +1,8 @@ -#include "util/dyn_load.h" +#include "dyn_load.h" #ifdef __unix__ -#include +#include void *ga_load_library(const char *name) { return dlopen(name, RTLD_LAZY|RTLD_LOCAL); diff --git a/src/loaders/libcuda.c b/src/loaders/libcuda.c new file mode 100644 index 0000000000..6cf7798c20 --- /dev/null +++ b/src/loaders/libcuda.c @@ -0,0 +1,52 @@ +#include + +#include "libcuda.h" +#include "dyn_load.h" +#include "gpuarray/error.h" +/* This code is strongly inspired from the dynamic loading code in the + * samples */ +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +static char libname[] = "nvcuda.dll"; +#else /* Unix */ +static char libname[] = "libcuda.so"; +#endif + +#define DEF_PROC(name, args) t##name *name +#define DEF_PROC_V2(name, args) DEF_PROC(name, args) + +#include "libcuda.fn" + +#undef DEF_PROC_V2 +#undef DEF_PROC + +#define STRINGIFY(X) #X + +#define DEF_PROC(name, args) \ + name = (t##name *)ga_func_ptr(lib, #name); \ + if (name == NULL) { \ + return GA_LOAD_ERROR; \ + } + +#define DEF_PROC_V2(name, args) \ + name = (t##name *)ga_func_ptr(lib, STRINGIFY(name##_v2)); \ + if (name == NULL) { \ + return GA_LOAD_ERROR; \ + } + +static int loaded = 0; + +int load_libcuda(void) { + void *lib; + + if (loaded) + return GA_NO_ERROR; + + lib = ga_load_library(libname); + if (lib == NULL) + return GA_LOAD_ERROR; + + #include "libcuda.fn" + + loaded = 1; + return GA_NO_ERROR; +} diff --git a/src/loaders/libcuda.fn b/src/loaders/libcuda.fn new file mode 100644 index 0000000000..05d3e84ad0 --- /dev/null +++ b/src/loaders/libcuda.fn @@ -0,0 +1,48 @@ +DEF_PROC(cuInit, (int flags)); +DEF_PROC(cuDriverGetVersion, (int *driverVersion)); +DEF_PROC(cuGetErrorString, 
(CUresult error, const char **pStr)); + +DEF_PROC(cuDeviceGet, (CUdevice *device, int ordinal)); +DEF_PROC(cuDeviceGetCount, (int *count)); +DEF_PROC(cuDeviceGetName, (char *name, int len, CUdevice dev)); +DEF_PROC(cuDeviceGetAttribute, (int *pi, CUdevice_attribute attrib, CUdevice dev)); + +DEF_PROC(cuDevicePrimaryCtxGetState, (CUdevice dev, unsigned int *flags, int *active)); +DEF_PROC(cuDevicePrimaryCtxSetFlags, (CUdevice dev, unsigned int flags)); +DEF_PROC(cuDevicePrimaryCtxRelease, (CUdevice dev)); +DEF_PROC(cuDevicePrimaryCtxRetain, (CUcontext *pctx, CUdevice dev)); + +DEF_PROC(cuCtxGetDevice, (CUdevice *device)); +DEF_PROC_V2(cuCtxPushCurrent, (CUcontext ctx)); +DEF_PROC_V2(cuCtxPopCurrent, (CUcontext *pctx)); + +DEF_PROC(cuModuleLoadData, (CUmodule *module, const void *image)); +DEF_PROC(cuModuleUnload, (CUmodule hmod)); +DEF_PROC(cuModuleGetFunction, (CUfunction *hfunc, CUmodule hmod, const char *name)); + +DEF_PROC_V2(cuMemGetInfo, (size_t *free, size_t *total)); +DEF_PROC_V2(cuMemAlloc, (CUdeviceptr *dptr, size_t bytesize)); +DEF_PROC_V2(cuMemFree, (CUdeviceptr dptr)); +DEF_PROC_V2(cuMemAllocHost, (void **pp, size_t bytesize)); +DEF_PROC(cuMemFreeHost, (void *p)); + +DEF_PROC_V2(cuMemcpyHtoDAsync, (CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream)); +DEF_PROC_V2(cuMemcpyHtoD, (CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount)); +DEF_PROC_V2(cuMemcpyDtoHAsync, (void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream)); +DEF_PROC_V2(cuMemcpyDtoDAsync, (CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream)); +DEF_PROC(cuMemcpyPeerAsync, (CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream)); +DEF_PROC(cuMemsetD8Async, (CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream)); + +DEF_PROC(cuLaunchKernel, (CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, 
unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra)); + +DEF_PROC(cuFuncGetAttribute, (int *pi, CUfunction_attribute attrib, CUfunction hfunc)); + +DEF_PROC(cuEventCreate, (CUevent *phEvent, unsigned int Flags)); +DEF_PROC(cuEventRecord, (CUevent hEvent, CUstream hStream)); +DEF_PROC(cuEventSynchronize, (CUevent hEvent)); +DEF_PROC_V2(cuEventDestroy, (CUevent hEvent)); + +DEF_PROC(cuStreamCreate, (CUstream *phStream, unsigned int Flags)); +DEF_PROC(cuStreamWaitEvent, (CUstream hStream, CUevent hEvent, unsigned int Flags)); +DEF_PROC(cuStreamSynchronize, (CUstream hStream)); +DEF_PROC_V2(cuStreamDestroy, (CUstream hStream)); diff --git a/src/loaders/libcuda.h b/src/loaders/libcuda.h new file mode 100644 index 0000000000..0fdae34e53 --- /dev/null +++ b/src/loaders/libcuda.h @@ -0,0 +1,176 @@ +#ifndef LOADER_LIBCUDA_H +#define LOADER_LIBCUDA_H + +#ifdef _WIN32 +#define CUDAAPI __stdcall +#else +#define CUDAAPI +#endif + +typedef enum { + CUDA_SUCCESS = 0 +} CUresult; + +#if defined(_WIN64) || defined(__LP64__) +typedef unsigned long long CUdeviceptr; +#else +typedef unsigned int CUdeviceptr; +#endif + +typedef int CUdevice; +typedef struct CUctx_st *CUcontext; +typedef struct CUmod_st *CUmodule; +typedef struct CUfunc_st *CUfunction; +typedef struct CUevent_st *CUevent; +typedef struct CUstream_st *CUstream; + +typedef enum CUdevice_attribute_enum CUdevice_attribute; +typedef enum CUfunction_attribute_enum CUfunction_attribute; +typedef enum CUevent_flags_enum CUevent_flags; +typedef enum CUctx_flags_enum CUctx_flags; + +int load_libcuda(void); + +#define DEF_PROC(name, args) typedef CUresult CUDAAPI t##name args +#define DEF_PROC_V2(name, args) DEF_PROC(name, args) + +#include "libcuda.fn" + +#undef DEF_PROC_V2 +#undef DEF_PROC + +#define DEF_PROC(name, args) extern t##name *name +#define DEF_PROC_V2(name, args) DEF_PROC(name, args) + +#include "libcuda.fn" + +#undef 
DEF_PROC_V2 +#undef DEF_PROC + +enum CUdevice_attribute_enum { + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, + CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, + CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, + CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, + CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, + CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, + CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, + CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, + CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, + CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, + CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, + CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, + CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, + CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, + CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, + CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, + CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, + CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, + 
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, + CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, + CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, + CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, + CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, + CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, + CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, + 
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, + CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, + CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, + CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, + CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, + CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, + CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, + CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, + CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, + CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, + CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91 +}; + +enum CUfunction_attribute_enum { + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, + CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, + CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, + CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, + CU_FUNC_ATTRIBUTE_NUM_REGS = 4, + CU_FUNC_ATTRIBUTE_PTX_VERSION = 5, + CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, + CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7 +}; + +enum CUevent_flags_enum { + CU_EVENT_DEFAULT = 0x0, + CU_EVENT_BLOCKING_SYNC = 0x1, + CU_EVENT_DISABLE_TIMING = 0x2, + CU_EVENT_INTERPROCESS = 0x4 +}; + +enum CUctx_flags_enum { + CU_CTX_SCHED_AUTO = 0x00, + CU_CTX_SCHED_SPIN = 0x01, + CU_CTX_SCHED_YIELD = 0x02, + CU_CTX_SCHED_BLOCKING_SYNC = 0x04, + CU_CTX_BLOCKING_SYNC = 0x04, + CU_CTX_MAP_HOST = 0x08, +}; + +#endif diff --git a/src/private_cuda.h b/src/private_cuda.h index 
2b59db6837..0fd9d138eb 100644 --- a/src/private_cuda.h +++ b/src/private_cuda.h @@ -1,11 +1,7 @@ #ifndef _PRIVATE_CUDA_H #define _PRIVATE_CUDA_H -#ifdef __APPLE__ -#include -#else -#include -#endif +#include "loaders/libcuda.h" #include From d70ce00f2fb1ab19e1eccdeb3d068d0c14158463 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 9 Nov 2016 19:35:26 -0500 Subject: [PATCH 071/597] Add loader for nvrtc. --- src/CMakeLists.txt | 6 ++--- src/gpuarray_buffer_cuda.c | 21 ++++++++++++++-- src/loaders/CMakeLists.txt | 1 + src/loaders/libnvrtc.c | 50 ++++++++++++++++++++++++++++++++++++++ src/loaders/libnvrtc.fn | 7 ++++++ src/loaders/libnvrtc.h | 24 ++++++++++++++++++ 6 files changed, 103 insertions(+), 6 deletions(-) create mode 100644 src/loaders/libnvrtc.c create mode 100644 src/loaders/libnvrtc.fn create mode 100644 src/loaders/libnvrtc.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 83e6dbd501..606113b7c5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -94,8 +94,6 @@ if (CUDA_FOUND) set(CUDADRV_INCLUDE ${CUDA_TOOLKIT_INCLUDE}) endif() - find_cuda_helper_libs(nvrtc) - list(APPEND _GPUARRAY_SRC gpuarray_buffer_cuda.c) add_definitions(-DWITH_CUDA) include_directories(${CUDADRV_INCLUDE}) @@ -158,8 +156,8 @@ set_target_properties(gpuarray PROPERTIES add_library(gpuarray-static STATIC ${GPUARRAY_SRC}) if(CUDA_FOUND) - target_link_libraries(gpuarray ${CUDA_nvrtc_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) - target_link_libraries(gpuarray-static ${CUDA_nvrtc_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) + target_link_libraries(gpuarray ${CUDA_CUBLAS_LIBRARIES}) + target_link_libraries(gpuarray-static ${CUDA_CUBLAS_LIBRARIES}) if (NCCL_FOUND) target_link_libraries(gpuarray ${NCCL_LIBRARY}) target_link_libraries(gpuarray-static ${NCCL_LIBRARY}) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 1052d76fde..b0188f52ab 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -2,7 +2,7 @@ #include "private.h" #include 
"private_cuda.h" -#include +#include "loaders/libnvrtc.h" #include @@ -62,7 +62,8 @@ static uint32_t strb_hash(void *_k) { static int setup_done = 0; static int setup_lib(void) { - int res; + int res, major, minor, tmp; + const char *ver; if (!setup_done) { res = load_libcuda(); if (res != GA_NO_ERROR) @@ -70,6 +71,22 @@ static int setup_lib(void) { err = cuInit(0); if (err != CUDA_SUCCESS) return GA_IMPL_ERROR; + ver = getenv("GPUARRAY_CUDA_VERSION"); + if (ver == NULL || strlen(ver) != 2) { + err = gcuDriverGetVersion(&tmp); + if (err != CUDA_SUCCESS) + return GA_IMPL_ERROR; + major = tmp / 1000; + minor = (tmp / 10) % 10; + } else { + major = ver[0] - '0'; + minor = ver[1] - '0'; + } + if (major > 9 || major < 0 || minor > 9 || minor < 0) + return GA_VALUE_ERROR; + res = load_libnvrtc(major, minor); + if (res != GA_NO_ERROR) + return err; setup_done = 1; } return GA_NO_ERROR; diff --git a/src/loaders/CMakeLists.txt b/src/loaders/CMakeLists.txt index 4d1553406a..66661c4408 100644 --- a/src/loaders/CMakeLists.txt +++ b/src/loaders/CMakeLists.txt @@ -1,4 +1,5 @@ set_rel(LOADERS_SRC dyn_load.c libcuda.c +libnvrtc.c ) \ No newline at end of file diff --git a/src/loaders/libnvrtc.c b/src/loaders/libnvrtc.c new file mode 100644 index 0000000000..5fdf24f4e2 --- /dev/null +++ b/src/loaders/libnvrtc.c @@ -0,0 +1,50 @@ +#include + +#include "libcuda.h" +#include "libnvrtc.h" +#include "dyn_load.h" +#include "gpuarray/error.h" + +/* This code is strongly inspired from the dynamic loading code in the + * samples */ + +#define DEF_PROC(name, args) t##name *name + +#include "libnvrtc.fn" + +#undef DEF_PROC + +#define DEF_PROC(name, args) \ + name = (t##name *)ga_func_ptr(lib, #name); \ + if (name == NULL) { \ + return GA_LOAD_ERROR; \ + } + +static int loaded = 0; + +int load_libnvrtc(int major, int minor) { + void *lib; + + if (loaded) + return GA_NO_ERROR; + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + { + char libname[] = "nvrtc64_??.dll"; + 
+ libname[8] = DIGITS[major]; + libname[9] = DIGITS[minor]; + + lib = ga_load_library(libname); + } +#else /* Unix */ + lib = ga_load_library("libnvrtc.so"); +#endif + if (lib == NULL) + return GA_LOAD_ERROR; + + #include "libnvrtc.fn" + + loaded = 1; + return GA_NO_ERROR; +} diff --git a/src/loaders/libnvrtc.fn b/src/loaders/libnvrtc.fn new file mode 100644 index 0000000000..9ebda14112 --- /dev/null +++ b/src/loaders/libnvrtc.fn @@ -0,0 +1,7 @@ +DEF_PROC(nvrtcCreateProgram, (nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames)); +DEF_PROC(nvrtcCompileProgram, (nvrtcProgram prog, int numOptions, const char **options)); +DEF_PROC(nvrtcDestroyProgram, (nvrtcProgram *prog)); +DEF_PROC(nvrtcGetProgramLog, (nvrtcProgram prog, char *log)); +DEF_PROC(nvrtcGetProgramLogSize, (nvrtcProgram prog, size_t *logSizeRet)); +DEF_PROC(nvrtcGetPTX, (nvrtcProgram prog, char *ptx)); +DEF_PROC(nvrtcGetPTXSize, (nvrtcProgram prog, size_t *ptxSizeRet)); \ No newline at end of file diff --git a/src/loaders/libnvrtc.h b/src/loaders/libnvrtc.h new file mode 100644 index 0000000000..e06aa45042 --- /dev/null +++ b/src/loaders/libnvrtc.h @@ -0,0 +1,24 @@ +#ifndef LOADER_LIBNVRTC_H +#define LOADER_LIBNVRTC_H + +typedef enum { + NVRTC_SUCCESS = 0, +} nvrtcResult; + +typedef struct _nvrtcProgram *nvrtcProgram; + +int load_libnvrtc(int major, int minor); + +#define DEF_PROC(name, args) typedef nvrtcResult t##name args + +#include "libnvrtc.fn" + +#undef DEF_PROC + +#define DEF_PROC(name, args) extern t##name *name + +#include "libnvrtc.fn" + +#undef DEF_PROC + +#endif From 230e9ed9d7c3e56df961ebfe141d3d55aca38fd5 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 10 Nov 2016 15:03:42 -0500 Subject: [PATCH 072/597] Add a loader for cublas. 
--- src/CMakeLists.txt | 10 ---- src/gpuarray_blas_cuda_cublas.c | 37 ++++++--------- src/gpuarray_buffer_cuda.c | 17 +++---- src/loaders/CMakeLists.txt | 3 +- src/loaders/libcublas.c | 63 +++++++++++++++++++++++++ src/loaders/libcublas.fn | 21 +++++++++ src/loaders/libcublas.h | 84 +++++++++++++++++++++++++++++++++ src/private_cuda.h | 2 + 8 files changed, 193 insertions(+), 44 deletions(-) create mode 100644 src/loaders/libcublas.c create mode 100644 src/loaders/libcublas.fn create mode 100644 src/loaders/libcublas.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 606113b7c5..76bd12f959 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -99,14 +99,6 @@ if (CUDA_FOUND) include_directories(${CUDADRV_INCLUDE}) list(APPEND _GPUARRAY_SRC gpuarray_blas_cuda_cublas.c) - add_definitions(-DWITH_CUDA_CUBLAS) - - set(CMAKE_REQUIRED_LIBRARIES ${CUDA_CUBLAS_LIBRARIES}) - - check_function_exists(cublasSgemmEx CUBLAS_SGEMMEX) - if (CUBLAS_SGEMMEX) - add_definitions(-DHAVE_CUBLAS_SGEMMEX) - endif() if(NCCL_FOUND) message(STATUS "Building with NCCL") @@ -156,8 +148,6 @@ set_target_properties(gpuarray PROPERTIES add_library(gpuarray-static STATIC ${GPUARRAY_SRC}) if(CUDA_FOUND) - target_link_libraries(gpuarray ${CUDA_CUBLAS_LIBRARIES}) - target_link_libraries(gpuarray-static ${CUDA_CUBLAS_LIBRARIES}) if (NCCL_FOUND) target_link_libraries(gpuarray ${NCCL_LIBRARY}) target_link_libraries(gpuarray-static ${NCCL_LIBRARY}) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index b26ee117b3..134c7438d3 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -7,7 +7,7 @@ #include -#include +#include "loaders/libcublas.h" extern const gpuarray_buffer_ops cuda_ops; @@ -176,6 +176,10 @@ static int setup(gpucontext *c) { if (ctx->blas_handle != NULL) return GA_NO_ERROR; + e = load_libcublas(ctx->minor, ctx->major); + if (e != GA_NO_ERROR) + return e; + handle = calloc(1, sizeof(*handle)); if (handle == NULL) return 
GA_MEMORY_ERROR; @@ -450,7 +454,6 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, float beta, gpudata *C, size_t offC, size_t ldc) { -#ifdef HAVE_CUBLAS_SGEMMEX /* This will use float32 for computation as it's the best we can * have right now. In the future when native float16 support will be * there we will switch to that. */ @@ -464,6 +467,9 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, ASSERT_BUF(B); ASSERT_BUF(C); + if (cublasSgemmEx == NULL) + return GA_DEVSUP_ERROR; + if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) @@ -497,23 +503,11 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, h->err = cublasSgemmEx(h->h, convT(transA), convT(transB), M, N, K, &alpha, ((uint16_t *)A->ptr) + offA, -#if CUDA_VERSION >= 8000 CUDA_R_16F, -#else - CUBLAS_DATA_HALF, -#endif lda, ((uint16_t *)B->ptr) + offB, -#if CUDA_VERSION >= 8000 CUDA_R_16F, -#else - CUBLAS_DATA_HALF, -#endif ldb, &beta, ((uint16_t *)C->ptr) + offC, -#if CUDA_VERSION >= 8000 CUDA_R_16F, -#else - CUBLAS_DATA_HALF, -#endif ldc); if (h->err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); @@ -528,9 +522,6 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, cuda_exit(ctx); return GA_NO_ERROR; -#else - return GA_DEVSUP_ERROR; -#endif } static int hgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, @@ -1216,9 +1207,9 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_ALL)); h->err = cublasSger(h->h, M, N, &alpha, - ((float *)X->ptr) + offX, incX, - ((float *)Y->ptr) + offY, incY, - ((float *)A->ptr) + offA, lda); + ((float *)X->ptr) + offX, incX, + ((float *)Y->ptr) + offY, incY, + ((float *)A->ptr) + offA, lda); if 
(h->err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) @@ -1273,9 +1264,9 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_ALL)); h->err = cublasDger(h->h, M, N, &alpha, - ((double *)X->ptr) + offX, incX, - ((double *)Y->ptr) + offY, incY, - ((double *)A->ptr) + offA, lda); + ((double *)X->ptr) + offX, incX, + ((double *)Y->ptr) + offY, incY, + ((double *)A->ptr) + offA, lda); if (h->err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index b0188f52ab..d2354f831b 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -61,8 +61,10 @@ static uint32_t strb_hash(void *_k) { } static int setup_done = 0; +static int major = -1; +static int minor = -1; static int setup_lib(void) { - int res, major, minor, tmp; + int res, tmp; const char *ver; if (!setup_done) { res = load_libcuda(); @@ -127,6 +129,8 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { res->refcnt = 1; res->flags = flags; res->enter = 0; + res->major = major; + res->minor = minor; res->freeblocks = NULL; if (detect_arch(ARCH_PREFIX, res->bin_id, &err)) { goto fail_stream; @@ -192,8 +196,8 @@ static void cuda_free_ctx(cuda_context *ctx) { if (ctx->refcnt == 0) { assert(ctx->enter == 0 && "Context was active when freed!"); if (ctx->blas_handle != NULL) { - ctx->err = cuda_property((gpucontext *)ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS, - &blas_ops); + cuda_property((gpucontext *)ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS, + &blas_ops); blas_ops->teardown((gpucontext *)ctx); } cuMemFreeHost((void *)ctx->errbuf->ptr); @@ -1293,9 +1297,7 @@ static int cuda_transfer(gpudata *dst, size_t dstoff, return GA_NO_ERROR; } -#ifdef WITH_CUDA_CUBLAS extern gpuarray_blas_ops cublas_ops; -#endif // WITH_CUDA_CUBLAS #ifdef WITH_CUDA_NCCL extern gpuarray_comm_ops nccl_ops; #endif 
// WITH_CUDA_NCCL @@ -1447,13 +1449,8 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, return GA_NO_ERROR; case GA_CTX_PROP_BLAS_OPS: -#ifdef WITH_CUDA_CUBLAS *((gpuarray_blas_ops **)res) = &cublas_ops; return GA_NO_ERROR; -#else - *((void **)res) = NULL; - return GA_DEVSUP_ERROR; -#endif // WITH_CUDA_CUBLAS case GA_CTX_PROP_COMM_OPS: #ifdef WITH_CUDA_NCCL diff --git a/src/loaders/CMakeLists.txt b/src/loaders/CMakeLists.txt index 66661c4408..58b7ddca0c 100644 --- a/src/loaders/CMakeLists.txt +++ b/src/loaders/CMakeLists.txt @@ -2,4 +2,5 @@ set_rel(LOADERS_SRC dyn_load.c libcuda.c libnvrtc.c -) \ No newline at end of file +libcublas.c +) diff --git a/src/loaders/libcublas.c b/src/loaders/libcublas.c new file mode 100644 index 0000000000..afa81f440f --- /dev/null +++ b/src/loaders/libcublas.c @@ -0,0 +1,63 @@ +#include + +#include "libcublas.h" +#include "dyn_load.h" +#include "gpuarray/error.h" +/* This code is strongly inspired from the dynamic loading code in the + * samples */ + +#define DEF_PROC(name, args) t##name *name +#define DEF_PROC_V2(name, args) DEF_PROC(name, args) +#define DEF_PROC_OPT(name, args) DEF_PROC(name, args) + +#include "libcublas.fn" + +#undef DEF_PROC_OPT +#undef DEF_PROC_V2 +#undef DEF_PROC + +#define STRINGIFY(X) #X + +#define DEF_PROC(name, args) \ + name = (t##name *)ga_func_ptr(lib, #name); \ + if (name == NULL) { \ + return GA_LOAD_ERROR; \ + } + +#define DEF_PROC_OPT(name, args) \ + name = (t##name *)ga_func_ptr(lib, #name); + +#define DEF_PROC_V2(name, args) \ + name = (t##name *)ga_func_ptr(lib, STRINGIFY(name##_v2)); \ + if (name == NULL) { \ + return GA_LOAD_ERROR; \ + } + +static int loaded = 0; + +int load_libcublas(int major, int minor) { + void *lib; + + if (loaded) + return GA_NO_ERROR; + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + { + char libname[] = "cublas64_??.dll"; + + libname[9] = DIGITS[major]; + libname[10] = DIGITS[minor]; + + lib = 
ga_load_library(libname); + } +#else /* Unix */ + lib = ga_load_library("libcublas.so"); +#endif + if (lib == NULL) + return GA_LOAD_ERROR; + +#include "libcublas.fn" + + loaded = 1; + return GA_NO_ERROR; +} diff --git a/src/loaders/libcublas.fn b/src/loaders/libcublas.fn new file mode 100644 index 0000000000..04b0290800 --- /dev/null +++ b/src/loaders/libcublas.fn @@ -0,0 +1,21 @@ +DEF_PROC_V2(cublasCreate, (cublasHandle_t *handle)); +DEF_PROC_V2(cublasDestroy, (cublasHandle_t handle)); + +DEF_PROC_V2(cublasSetStream, (cublasHandle_t handle, cudaStream_t streamId)); +DEF_PROC_V2(cublasSetPointerMode, (cublasHandle_t handle, cublasPointerMode_t mode)); +DEF_PROC(cublasSetAtomicsMode, (cublasHandle_t handle, cublasAtomicsMode_t mode)); + + +DEF_PROC_V2(cublasSgemm, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, const float *B, int ldb, const float *beta, float *C, int ldc)); +DEF_PROC_V2(cublasDgemm, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double *alpha, const double *A, int lda, const double *B, int ldb, const double *beta, double *C, int ldc)); + +DEF_PROC_V2(cublasSgemv, (cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float *alpha, const float *A, int lda, const float *x, int incx, const float *beta, float *y, int incy)); +DEF_PROC_V2(cublasDgemv, (cublasHandle_t handle, cublasOperation_t trans, int m, int n, const double *alpha, const double *A, int lda, const double *x, int incx, const double *beta, double *y, int incy)); + +DEF_PROC_V2(cublasSger, (cublasHandle_t handle, int m, int n, const float *alpha, const float *x, int incx, const float *y, int incy, float *A, int lda)); +DEF_PROC_V2(cublasDger, (cublasHandle_t handle, int m, int n, const double *alpha, const double *x, int incx, const double *y, int incy, double *A, int lda)); + +DEF_PROC_OPT(cublasSgemmEx, (cublasHandle_t 
handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const void *A, cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb, const float *beta, void *C, cudaDataType Ctype, int ldc)); + +DEF_PROC(cublasSgemmBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *Aarray[], int lda, const float *Barray[], int ldb, const float *beta, float *Carray[], int ldc, int batchCount)); +DEF_PROC(cublasDgemmBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double *alpha, const double *Aarray[], int lda, const double *Barray[], int ldb, const double *beta, double *Carray[], int ldc, int batchCount)); diff --git a/src/loaders/libcublas.h b/src/loaders/libcublas.h new file mode 100644 index 0000000000..71955e3797 --- /dev/null +++ b/src/loaders/libcublas.h @@ -0,0 +1,84 @@ +#ifndef LOADER_LIBCUBLAS_H +#define LOADER_LIBCUBLAS_H + +#ifdef _WIN32 +#define CUBLASWINAPI __stdcall +#else +#define CUBLASWINAPI +#endif + +typedef enum cudaDataType_t +{ + CUDA_R_16F= 2, // real as a half + CUDA_C_16F= 6, // complex as a pair of half numbers + CUDA_R_32F= 0, // real as a float + CUDA_C_32F= 4, // complex as a pair of float numbers + CUDA_R_64F= 1, // real as a double + CUDA_C_64F= 5, // complex as a pair of double numbers + CUDA_R_8I= 3, // real as a signed char + CUDA_C_8I= 7, // complex as a pair of signed char numbers + CUDA_R_8U= 8, // real as a unsigned char + CUDA_C_8U= 9, // complex as a pair of unsigned char numbers + CUDA_R_32I= 10, // real as a signed int + CUDA_C_32I= 11, // complex as a pair of signed int numbers + CUDA_R_32U= 12, // real as a unsigned int + CUDA_C_32U= 13 // complex as a pair of unsigned int numbers +} cudaDataType; + +typedef struct CUstream_st *cudaStream_t; + +typedef enum { + CUBLAS_STATUS_SUCCESS =0, + CUBLAS_STATUS_NOT_INITIALIZED =1, + 
CUBLAS_STATUS_ALLOC_FAILED =3, + CUBLAS_STATUS_INVALID_VALUE =7, + CUBLAS_STATUS_ARCH_MISMATCH =8, + CUBLAS_STATUS_MAPPING_ERROR =11, + CUBLAS_STATUS_EXECUTION_FAILED=13, + CUBLAS_STATUS_INTERNAL_ERROR =14, + CUBLAS_STATUS_NOT_SUPPORTED =15, + CUBLAS_STATUS_LICENSE_ERROR =16 +} cublasStatus_t; + +typedef enum { + CUBLAS_OP_N=0, + CUBLAS_OP_T=1, + CUBLAS_OP_C=2 +} cublasOperation_t; + +typedef enum { + CUBLAS_POINTER_MODE_HOST = 0, + CUBLAS_POINTER_MODE_DEVICE = 1 +} cublasPointerMode_t; + +typedef enum { + CUBLAS_ATOMICS_NOT_ALLOWED = 0, + CUBLAS_ATOMICS_ALLOWED = 1 +} cublasAtomicsMode_t; + +typedef struct cublasContext *cublasHandle_t; + + +int load_libcublas(int major, int minor); + +#define DEF_PROC(name, args) typedef cublasStatus_t CUBLASWINAPI t##name args +#define DEF_PROC_V2(name, args) DEF_PROC(name, args) +#define DEF_PROC_OPT(name, args) DEF_PROC(name, args) + +#include "libcublas.fn" + +#undef DEF_PROC_OPT +#undef DEF_PROC_V2 +#undef DEF_PROC + +#define DEF_PROC(name, args) extern t##name *name +#define DEF_PROC_V2(name, args) DEF_PROC(name, args) +#define DEF_PROC_OPT(name, args) DEF_PROC(name, args) + +#include "libcublas.fn" + +#undef DEF_PROC_OPT +#undef DEF_PROC_V2 +#undef DEF_PROC + +#endif diff --git a/src/private_cuda.h b/src/private_cuda.h index 0fd9d138eb..6fab1597ac 100644 --- a/src/private_cuda.h +++ b/src/private_cuda.h @@ -69,6 +69,8 @@ typedef struct _cuda_context { gpudata *freeblocks; cache *kernel_cache; unsigned int enter; + unsigned char major; + unsigned char minor; } cuda_context; STATIC_ASSERT(sizeof(cuda_context) <= sizeof(gpucontext), From 398d3460199344eb9f48f1c3d581481c603ef910 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 10 Nov 2016 15:26:14 -0500 Subject: [PATCH 073/597] Make cuda required to build the package. 
--- src/CMakeLists.txt | 59 +++++++++++++++++----------------------- src/gpuarray_buffer.c | 4 --- src/gpuarray_extension.c | 4 --- 3 files changed, 25 insertions(+), 42 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 76bd12f959..bf7b3c3bf0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,7 +5,9 @@ if(CMAKE_COMPILER_IS_GNUCC) add_definitions(-Wdeclaration-after-statement) endif() -find_package(CUDA) +find_package(CUDA REQUIRED) +find_package(NCCL) + find_package(OpenCL) if(OpenCL_FOUND) find_package(clBLAS) @@ -13,9 +15,6 @@ if(NOT CLBLAS_FOUND) find_package(CLBlast) endif() endif() -if(CUDA_FOUND) -find_package(NCCL) -endif() include_directories("${CMAKE_CURRENT_SOURCE_DIR}") @@ -60,6 +59,8 @@ gpuarray_kernel.c gpuarray_extension.c gpuarray_elemwise.c gpuarray_reduction.c +gpuarray_buffer_cuda.c +gpuarray_blas_cuda_cublas.c ) check_function_exists(strlcat HAVE_STRL) @@ -77,36 +78,28 @@ if(NOT HAVE_MKSTEMP) list(APPEND _GPUARRAY_SRC gpuarray_mkstemp.c) endif() -if (CUDA_FOUND) - if (CUDA_VERSION_MAJOR LESS 7) - message( WARNING "This package requires CUDA 7.0 or more (building with NCCL). Found version ${CUDA_VERSION_STRING}") - set(CUDA_FOUND 0) - endif() +if (CUDA_VERSION_MAJOR LESS 7) + message( FATAL_ERROR "This package requires CUDA 7.0 or more. 
Found version ${CUDA_VERSION_STRING}") endif() -if (CUDA_FOUND) - if (APPLE) - FIND_PATH(CUDADRV_INCLUDE CUDA/cuda.h) - # this is somewhat a hack, but otherwise cublas_v2.h isn't found - set(CUDADRV_INCLUDE ${CUDADRV_INCLUDE} ${CUDA_TOOLKIT_INCLUDE}) - endif() - if(NOT CUDADRV_INCLUDE) - set(CUDADRV_INCLUDE ${CUDA_TOOLKIT_INCLUDE}) - endif() +if (APPLE) + FIND_PATH(CUDADRV_INCLUDE CUDA/cuda.h) + # this is somewhat a hack, but otherwise cublas_v2.h isn't found + set(CUDADRV_INCLUDE ${CUDADRV_INCLUDE} ${CUDA_TOOLKIT_INCLUDE}) +endif() +if(NOT CUDADRV_INCLUDE) + set(CUDADRV_INCLUDE ${CUDA_TOOLKIT_INCLUDE}) +endif() - list(APPEND _GPUARRAY_SRC gpuarray_buffer_cuda.c) - add_definitions(-DWITH_CUDA) - include_directories(${CUDADRV_INCLUDE}) +include_directories(${CUDADRV_INCLUDE}) - list(APPEND _GPUARRAY_SRC gpuarray_blas_cuda_cublas.c) - if(NCCL_FOUND) - message(STATUS "Building with NCCL") - set(BUILD_WITH_COLLECTIVES 1 PARENT_SCOPE) - add_definitions(-DWITH_CUDA_NCCL) - list(APPEND _GPUARRAY_SRC gpuarray_collectives_cuda_nccl.c) - include_directories(${NCCL_INCLUDE_DIR}) - endif() +if(NCCL_FOUND) + message(STATUS "Building with NCCL") + set(BUILD_WITH_COLLECTIVES 1 PARENT_SCOPE) + add_definitions(-DWITH_CUDA_NCCL) + list(APPEND _GPUARRAY_SRC gpuarray_collectives_cuda_nccl.c) + include_directories(${NCCL_INCLUDE_DIR}) endif() if(OpenCL_FOUND) @@ -147,11 +140,9 @@ set_target_properties(gpuarray PROPERTIES add_library(gpuarray-static STATIC ${GPUARRAY_SRC}) -if(CUDA_FOUND) - if (NCCL_FOUND) - target_link_libraries(gpuarray ${NCCL_LIBRARY}) - target_link_libraries(gpuarray-static ${NCCL_LIBRARY}) - endif() +if (NCCL_FOUND) + target_link_libraries(gpuarray ${NCCL_LIBRARY}) + target_link_libraries(gpuarray-static ${NCCL_LIBRARY}) endif() if(OpenCL_FOUND) diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index e1c2e12200..c5b5c8905f 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -8,17 +8,13 @@ #include "private.h" -#ifdef WITH_CUDA extern const 
gpuarray_buffer_ops cuda_ops; -#endif #ifdef WITH_OPENCL extern const gpuarray_buffer_ops opencl_ops; #endif const gpuarray_buffer_ops *gpuarray_get_ops(const char *name) { -#ifdef WITH_CUDA if (strcmp("cuda", name) == 0) return &cuda_ops; -#endif #ifdef WITH_OPENCL if (strcmp("opencl", name) == 0) return &opencl_ops; #endif diff --git a/src/gpuarray_extension.c b/src/gpuarray_extension.c index 1904c3a8f4..1fde904395 100644 --- a/src/gpuarray_extension.c +++ b/src/gpuarray_extension.c @@ -7,7 +7,6 @@ typedef struct _ext { void *val; } ext; -#ifdef WITH_CUDA extern void cuda_enter(void); extern void cuda_exit(void); extern void *cuda_make_ctx(void); @@ -18,7 +17,6 @@ extern void *cuda_wait(void); extern void *cuda_record(void); extern void *cuda_get_ipc_handle(void); extern void *cuda_open_ipc_handle(void); -#endif #ifdef WITH_OPENCL extern void *cl_make_ctx(void); extern void *cl_get_stream(void); @@ -27,7 +25,6 @@ extern void *cl_get_buf(void); #endif static ext ext_list[] = { -#ifdef WITH_CUDA {"cuda_enter", cuda_enter}, {"cuda_exit", cuda_exit}, {"cuda_make_ctx", cuda_make_ctx}, @@ -38,7 +35,6 @@ static ext ext_list[] = { {"cuda_record", cuda_record}, {"cuda_get_ipc_handle", cuda_get_ipc_handle}, {"cuda_open_ipc_handle", cuda_open_ipc_handle}, -#endif #ifdef WITH_OPENCL {"cl_make_ctx", cl_make_ctx}, {"cl_get_stream", cl_get_stream}, From ee0bb7f129ba0f92165919b56db400b5f6305de6 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 10 Nov 2016 17:07:50 -0500 Subject: [PATCH 074/597] Make a loader for OpenCL, always enable OpenCL. 
--- src/CMakeLists.txt | 49 +++--- src/gpuarray_buffer.c | 4 - src/gpuarray_buffer_opencl.c | 78 ++++----- src/gpuarray_extension.c | 6 +- src/loaders/CMakeLists.txt | 1 + src/loaders/libopencl.c | 42 +++++ src/loaders/libopencl.fn | 31 ++++ src/loaders/libopencl.h | 323 +++++++++++++++++++++++++++++++++++ src/private_opencl.h | 6 +- 9 files changed, 459 insertions(+), 81 deletions(-) create mode 100644 src/loaders/libopencl.c create mode 100644 src/loaders/libopencl.fn create mode 100644 src/loaders/libopencl.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bf7b3c3bf0..9050eb97a6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,14 +8,13 @@ endif() find_package(CUDA REQUIRED) find_package(NCCL) -find_package(OpenCL) -if(OpenCL_FOUND) +find_package(OpenCL REQUIRED) find_package(clBLAS) if(NOT CLBLAS_FOUND) -find_package(CLBlast) -endif() + find_package(CLBlast) endif() + include_directories("${CMAKE_CURRENT_SOURCE_DIR}") add_custom_command( @@ -61,6 +60,7 @@ gpuarray_elemwise.c gpuarray_reduction.c gpuarray_buffer_cuda.c gpuarray_blas_cuda_cublas.c +gpuarray_buffer_opencl.c ) check_function_exists(strlcat HAVE_STRL) @@ -102,22 +102,18 @@ if(NCCL_FOUND) include_directories(${NCCL_INCLUDE_DIR}) endif() -if(OpenCL_FOUND) - list(APPEND _GPUARRAY_SRC gpuarray_buffer_opencl.c) - add_definitions(-DWITH_OPENCL) - include_directories(${OpenCL_INCLUDE_DIRS}) - - if(CLBLAS_FOUND) - message(STATUS "Building with CLBLAS") - list(APPEND _GPUARRAY_SRC gpuarray_blas_opencl_clblas.c) - add_definitions(-DWITH_OPENCL_CLBLAS) - include_directories(${CLBLAS_INCLUDE_DIRS}) - elseif(CLBLAS_FOUND) - message(STATUS "Building with CLBLAST") - list(APPEND _GPUARRAY_SRC gpuarray_blas_opencl_clblast.c) - add_definitions(-DWITH_OPENCL_CLBLAST) - include_directories(${CLBLAST_INCLUDE_DIRS}) - endif() +include_directories(${OpenCL_INCLUDE_DIRS}) + +if(CLBLAS_FOUND) + message(STATUS "Building with CLBLAS") + list(APPEND _GPUARRAY_SRC gpuarray_blas_opencl_clblas.c) + 
add_definitions(-DWITH_OPENCL_CLBLAS) + include_directories(${CLBLAS_INCLUDE_DIRS}) +elseif(CLBLAS_FOUND) + message(STATUS "Building with CLBLAST") + list(APPEND _GPUARRAY_SRC gpuarray_blas_opencl_clblast.c) + add_definitions(-DWITH_OPENCL_CLBLAST) + include_directories(${CLBLAST_INCLUDE_DIRS}) endif() configure_file( @@ -140,18 +136,17 @@ set_target_properties(gpuarray PROPERTIES add_library(gpuarray-static STATIC ${GPUARRAY_SRC}) +target_link_libraries(gpuarray ${CMAKE_DL_LIBS}) +target_link_libraries(gpuarray-static ${CMAKE_DL_LIBS}) + if (NCCL_FOUND) target_link_libraries(gpuarray ${NCCL_LIBRARY}) target_link_libraries(gpuarray-static ${NCCL_LIBRARY}) endif() -if(OpenCL_FOUND) - target_link_libraries(gpuarray ${OpenCL_LIBRARIES}) - target_link_libraries(gpuarray-static ${OpenCL_LIBRARIES}) - if (CLBLAS_FOUND) - target_link_libraries(gpuarray ${CLBLAS_LIBRARIES}) - target_link_libraries(gpuarray-static ${CLBLAS_LIBRARIES}) - endif() +if (CLBLAS_FOUND) + target_link_libraries(gpuarray ${CLBLAS_LIBRARIES}) + target_link_libraries(gpuarray-static ${CLBLAS_LIBRARIES}) endif() set(headers diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index c5b5c8905f..14f792e453 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -9,15 +9,11 @@ #include "private.h" extern const gpuarray_buffer_ops cuda_ops; -#ifdef WITH_OPENCL extern const gpuarray_buffer_ops opencl_ops; -#endif const gpuarray_buffer_ops *gpuarray_get_ops(const char *name) { if (strcmp("cuda", name) == 0) return &cuda_ops; -#ifdef WITH_OPENCL if (strcmp("opencl", name) == 0) return &opencl_ops; -#endif return NULL; } diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index f48909e663..f7e2d4d8ab 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -40,8 +40,19 @@ static gpukernel *cl_newkernel(gpucontext *ctx, unsigned int count, static const char CL_CONTEXT_PREAMBLE[] = "#define GA_WARP_SIZE %lu\n"; // to be filled by cl_make_ctx() -static 
inline int cl_get_platform_count(unsigned int* platcount) { +static int setup_done = 0; +static int setup_lib(void) { + if (setup_done) + return GA_NO_ERROR; + GA_CHECK(load_libopencl()); + setup_done = 1; + return GA_NO_ERROR; +} + +static int cl_get_platform_count(unsigned int* platcount) { cl_uint nump; + + GA_CHECK(setup_lib()); err = clGetPlatformIDs(0, NULL, &nump); if (err != CL_SUCCESS) return GA_IMPL_ERROR; @@ -54,6 +65,8 @@ static int cl_get_device_count(unsigned int platform, unsigned int* devcount) { cl_platform_id p; cl_uint numd; unsigned int platcount; + + /* This will load the library if needed */ GA_CHECK(cl_get_platform_count(&platcount)); ps = calloc(sizeof(*ps), platcount); @@ -110,21 +123,24 @@ cl_ctx *cl_make_ctx(cl_context ctx, int flags) { const char *rlk[1]; gpukernel *m; + e = setup_lib(); + if (e != GA_NO_ERROR) + return NULL; id = get_dev(ctx, NULL); if (id == NULL) return NULL; err = clGetDeviceInfo(id, CL_DEVICE_QUEUE_PROPERTIES, sizeof(qprop), - &qprop, NULL); + &qprop, NULL); if (err != CL_SUCCESS) return NULL; err = clGetDeviceInfo(id, CL_DEVICE_VENDOR, sizeof(vendor), vendor, NULL); if (err != CL_SUCCESS) return NULL; err = clGetDeviceInfo(id, CL_DEVICE_VENDOR_ID, sizeof(vendor_id), &vendor_id, - NULL); + NULL); if (err != CL_SUCCESS) return NULL; err = clGetDeviceInfo(id, CL_DRIVER_VERSION, sizeof(driver_version), - driver_version, NULL); + driver_version, NULL); if (err != CL_SUCCESS) return NULL; @@ -223,7 +239,7 @@ gpudata *cl_make_buf(gpucontext *c, cl_mem buf) { ASSERT_CTX(ctx); ctx->err = clGetMemObjectInfo(buf, CL_MEM_CONTEXT, sizeof(buf_ctx), - &buf_ctx, NULL); + &buf_ctx, NULL); if (ctx->err != CL_SUCCESS) return NULL; if (buf_ctx != ctx->ctx) return NULL; @@ -319,10 +335,8 @@ static const char *get_error_string(cl_int err) { case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported"; case CL_BUILD_PROGRAM_FAILURE: return "Program build failure"; case CL_MAP_FAILURE: return "Map failure"; -#ifdef 
CL_VERSION_1_1 case CL_MISALIGNED_SUB_BUFFER_OFFSET: return "Buffer offset improperly aligned"; case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: return "Event in wait list has an error status"; -#endif case CL_INVALID_VALUE: return "Invalid value"; case CL_INVALID_DEVICE_TYPE: return "Invalid device type"; case CL_INVALID_PLATFORM: return "Invalid platform"; @@ -357,9 +371,7 @@ static const char *get_error_string(cl_int err) { case CL_INVALID_BUFFER_SIZE: return "Invalid buffer size"; case CL_INVALID_MIP_LEVEL: return "Invalid mip-map level"; case CL_INVALID_GLOBAL_WORK_SIZE: return "Invalid global work size"; -#ifdef CL_VERSION_1_1 case CL_INVALID_PROPERTY: return "Invalid property"; -#endif default: return "Unknown error"; } } @@ -398,7 +410,6 @@ errcb(const char *errinfo, const void *pi, size_t cb, void *u) { } static gpucontext *cl_init(int devno, int flags, int *ret) { - int platno; cl_device_id *ds; cl_device_id d; cl_platform_id *ps; @@ -410,10 +421,16 @@ static gpucontext *cl_init(int devno, int flags, int *ret) { }; cl_context ctx; cl_ctx *res; + int platno; + int e; platno = devno >> 16; devno &= 0xFFFF; + e = setup_lib(); + if (e != GA_NO_ERROR) + return NULL; + err = clGetPlatformIDs(0, NULL, &nump); CHKFAIL(NULL); @@ -528,27 +545,23 @@ static void cl_release(gpudata *b) { } static int cl_share(gpudata *a, gpudata *b, int *ret) { -#ifdef CL_VERSION_1_1 cl_ctx *ctx; cl_mem aa, bb; -#endif ASSERT_BUF(a); ASSERT_BUF(b); if (a->buf == b->buf) return 1; -#ifdef CL_VERSION_1_1 if (a->ctx != b->ctx) return 0; ctx = a->ctx; ASSERT_CTX(ctx); ctx->err = clGetMemObjectInfo(a->buf, CL_MEM_ASSOCIATED_MEMOBJECT, - sizeof(aa), &aa, NULL); + sizeof(aa), &aa, NULL); CHKFAIL(-1); ctx->err = clGetMemObjectInfo(b->buf, CL_MEM_ASSOCIATED_MEMOBJECT, - sizeof(bb), &bb, NULL); + sizeof(bb), &bb, NULL); CHKFAIL(-1); if (aa == NULL) aa = a->buf; if (bb == NULL) bb = b->buf; if (aa == bb) return 1; -#endif return 0; } @@ -579,7 +592,7 @@ static int cl_move(gpudata *dst, size_t 
dstoff, gpudata *src, size_t srcoff, evl = evw; ctx->err = clEnqueueCopyBuffer(ctx->q, src->buf, dst->buf, srcoff, dstoff, - sz, num_ev, evl, &ev); + sz, num_ev, evl, &ev); if (ctx->err != CL_SUCCESS) { return GA_IMPL_ERROR; } @@ -613,7 +626,7 @@ static int cl_read(void *dst, gpudata *src, size_t srcoff, size_t sz) { } ctx->err = clEnqueueReadBuffer(ctx->q, src->buf, CL_TRUE, srcoff, sz, dst, - num_ev, evl, NULL); + num_ev, evl, NULL); if (ctx->err != CL_SUCCESS) return GA_IMPL_ERROR; if (src->ev != NULL) clReleaseEvent(src->ev); src->ev = NULL; @@ -639,7 +652,7 @@ static int cl_write(gpudata *dst, size_t dstoff, const void *src, size_t sz) { } ctx->err = clEnqueueWriteBuffer(ctx->q, dst->buf, CL_TRUE, dstoff, sz, src, - num_ev, evl, NULL); + num_ev, evl, NULL); if (err != CL_SUCCESS) return GA_IMPL_ERROR; if (dst->ev != NULL) clReleaseEvent(dst->ev); dst->ev = NULL; @@ -671,7 +684,7 @@ static int cl_memset(gpudata *dst, size_t offset, int data) { if (fl & CL_MEM_READ_ONLY) return GA_READONLY_ERROR; ctx->err = clGetMemObjectInfo(dst->buf, CL_MEM_SIZE, sizeof(bytes), &bytes, - NULL); + NULL); if (ctx->err != CL_SUCCESS) return GA_IMPL_ERROR; bytes -= offset; @@ -811,7 +824,6 @@ static gpukernel *cl_newkernel(gpucontext *c, unsigned int count, FAIL(NULL, GA_VALUE_ERROR); p = clCreateProgramWithBinary(ctx->ctx, 1, &dev, lengths, (const unsigned char **)strings, NULL, &ctx->err); if (ctx->err != CL_SUCCESS) { - clReleaseProgram(p); FAIL(NULL, GA_IMPL_ERROR); } } else { @@ -862,7 +874,7 @@ static gpukernel *cl_newkernel(gpucontext *c, unsigned int count, // Determine the size of the log clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - if(strb_ensure(&debug_msg, log_size)!=-1 && log_size>=1) { // Checks strb has enough space + if (strb_ensure(&debug_msg, log_size)!=-1 && log_size>=1) { // Checks strb has enough space // Get the log directly into the debug_msg clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size, debug_msg.s+debug_msg.l, 
NULL); debug_msg.l += (log_size-1); // Back off to before final '\0' @@ -1042,7 +1054,7 @@ static int cl_callkernel(gpukernel *k, unsigned int n, _gs[0] = gs[0] * ls[0]; } ctx->err = clEnqueueNDRangeKernel(ctx->q, k->k, n, NULL, _gs, ls, - num_ev, evw, &ev); + num_ev, evw, &ev); free(evw); if (ctx->err != CL_SUCCESS) return GA_IMPL_ERROR; @@ -1419,27 +1431,11 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, &id, NULL); if (ctx->err != GA_NO_ERROR) return GA_IMPL_ERROR; -#ifdef CL_VERSION_1_1 ctx->err = clGetKernelWorkGroupInfo(k->k, id, - CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, - sizeof(sz), &sz, NULL); + CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, + sizeof(sz), &sz, NULL); if (ctx->err != GA_NO_ERROR) return GA_IMPL_ERROR; -#else - ctx->err = clGetKernelWorkGroupInfo(k->k, id, CL_KERNEL_WORK_GROUP_SIZE, - sizeof(sz), &sz, NULL); - if (ctx->err != GA_NO_ERROR) - return GA_IMPL_ERROR; - /* - This is sort of a guess, AMD generally has 64 and NVIDIA has 32. - Since this is a multiple, it would not hurt a lot to overestimate - unless we go over the maximum. However underestimating may hurt - performance due to the way we do the automatic allocation. - - Also OpenCL 1.0 kind of sucks and this is only used for that. - */ - sz = (sz < 64) ? 
sz : 64; -#endif *((size_t *)res) = sz; return GA_NO_ERROR; diff --git a/src/gpuarray_extension.c b/src/gpuarray_extension.c index 1fde904395..e120d83b88 100644 --- a/src/gpuarray_extension.c +++ b/src/gpuarray_extension.c @@ -17,12 +17,11 @@ extern void *cuda_wait(void); extern void *cuda_record(void); extern void *cuda_get_ipc_handle(void); extern void *cuda_open_ipc_handle(void); -#ifdef WITH_OPENCL + extern void *cl_make_ctx(void); extern void *cl_get_stream(void); extern void *cl_make_buf(void); extern void *cl_get_buf(void); -#endif static ext ext_list[] = { {"cuda_enter", cuda_enter}, @@ -35,12 +34,11 @@ static ext ext_list[] = { {"cuda_record", cuda_record}, {"cuda_get_ipc_handle", cuda_get_ipc_handle}, {"cuda_open_ipc_handle", cuda_open_ipc_handle}, -#ifdef WITH_OPENCL + {"cl_make_ctx", cl_make_ctx}, {"cl_get_stream", cl_get_stream}, {"cl_make_buf", cl_make_buf}, {"cl_get_buf", cl_get_buf}, -#endif }; #define N_EXT (sizeof(ext_list)/sizeof(ext_list[0])) diff --git a/src/loaders/CMakeLists.txt b/src/loaders/CMakeLists.txt index 58b7ddca0c..86cc9ed7ea 100644 --- a/src/loaders/CMakeLists.txt +++ b/src/loaders/CMakeLists.txt @@ -3,4 +3,5 @@ dyn_load.c libcuda.c libnvrtc.c libcublas.c +libopencl.c ) diff --git a/src/loaders/libopencl.c b/src/loaders/libopencl.c new file mode 100644 index 0000000000..ce0e067de4 --- /dev/null +++ b/src/loaders/libopencl.c @@ -0,0 +1,42 @@ +#include + +#include "libopencl.h" +#include "dyn_load.h" +#include "gpuarray/error.h" +/* This code is strongly inspired from the dynamic loading code in the + * samples */ +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +static char libname[] = "OpenCL.dll"; +#else /* Unix */ +static char libname[] = "libOpenCL.so"; +#endif + +#define DEF_PROC(ret, name, args) t##name *name + +#include "libopencl.fn" + +#undef DEF_PROC + +#define DEF_PROC(ret, name, args) \ + name = (t##name *)ga_func_ptr(lib, #name); \ + if (name == NULL) { \ + return GA_LOAD_ERROR; \ + } + 
+static int loaded = 0; + +int load_libopencl(void) { + void *lib; + + if (loaded) + return GA_NO_ERROR; + + lib = ga_load_library(libname); + if (lib == NULL) + return GA_LOAD_ERROR; + + #include "libopencl.fn" + + loaded = 1; + return GA_NO_ERROR; +} diff --git a/src/loaders/libopencl.fn b/src/loaders/libopencl.fn new file mode 100644 index 0000000000..03293ac502 --- /dev/null +++ b/src/loaders/libopencl.fn @@ -0,0 +1,31 @@ +DEF_PROC(cl_context, clCreateContext, (const cl_context_properties *, cl_uint, const cl_device_id *, void (CL_CALLBACK *)(const char *, const void *, size_t, void *), void *, cl_int *)); +DEF_PROC(cl_int, clBuildProgram, (cl_program, cl_uint, const cl_device_id *, const char *, void (CL_CALLBACK *)(cl_program, void *), void *)); +DEF_PROC(cl_mem, clCreateBuffer, (cl_context, cl_mem_flags, size_t, void *, cl_int *)); +DEF_PROC(cl_command_queue, clCreateCommandQueue, (cl_context, cl_device_id, cl_command_queue_properties, cl_int *)); +DEF_PROC(cl_kernel, clCreateKernel, (cl_program, const char *, cl_int *)); +DEF_PROC(cl_program, clCreateProgramWithBinary, (cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *)); +DEF_PROC(cl_program, clCreateProgramWithSource, (cl_context, cl_uint, const char **, const size_t *, cl_int *)); +DEF_PROC(cl_int, clEnqueueReadBuffer, (cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *)); +DEF_PROC(cl_int, clEnqueueWriteBuffer, (cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *)); +DEF_PROC(cl_int, clEnqueueCopyBuffer, (cl_command_queue, cl_mem, cl_mem, size_t, size_t, size_t, cl_uint, const cl_event *, cl_event *)); +DEF_PROC(cl_int, clEnqueueNDRangeKernel, (cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *)); +DEF_PROC(cl_int, clGetContextInfo, (cl_context, cl_context_info, size_t, void *, size_t 
*)); +DEF_PROC(cl_int, clGetDeviceIDs, (cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *)); +DEF_PROC(cl_int, clGetDeviceInfo, (cl_device_id, cl_device_info, size_t, void *, size_t *)); +DEF_PROC(cl_int, clGetKernelInfo, (cl_kernel, cl_kernel_info, size_t, void *, size_t *)); +DEF_PROC(cl_int, clGetKernelWorkGroupInfo, (cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *)); +DEF_PROC(cl_int, clGetMemObjectInfo, (cl_mem, cl_mem_info, size_t, void *, size_t *)); +DEF_PROC(cl_int, clGetPlatformIDs, (cl_uint, cl_platform_id *, cl_uint *)); +DEF_PROC(cl_int, clGetProgramBuildInfo, (cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *)); +DEF_PROC(cl_int, clGetProgramInfo, (cl_program, cl_program_info, size_t, void *, size_t *)); +DEF_PROC(cl_int, clReleaseCommandQueue, (cl_command_queue)); +DEF_PROC(cl_int, clReleaseContext, (cl_context)); +DEF_PROC(cl_int, clReleaseEvent, (cl_event)); +DEF_PROC(cl_int, clReleaseKernel, (cl_kernel)); +DEF_PROC(cl_int, clReleaseMemObject, (cl_mem)); +DEF_PROC(cl_int, clReleaseProgram, (cl_program)); +DEF_PROC(cl_int, clRetainContext, (cl_context)); +DEF_PROC(cl_int, clRetainEvent, (cl_event)); +DEF_PROC(cl_int, clRetainMemObject, (cl_mem)); +DEF_PROC(cl_int, clSetKernelArg, (cl_kernel, cl_uint, size_t, const void *)); +DEF_PROC(cl_int, clWaitForEvents, (cl_uint, const cl_event *)); \ No newline at end of file diff --git a/src/loaders/libopencl.h b/src/loaders/libopencl.h new file mode 100644 index 0000000000..8aacb77af6 --- /dev/null +++ b/src/loaders/libopencl.h @@ -0,0 +1,323 @@ +#ifndef LOADER_LIBOPENCL_H +#define LOADER_LIBOPENCL_H + +#if defined(_WIN32) + #define CL_API_CALL __stdcall + #define CL_CALLBACK __stdcall +#else + #define CL_API_CALL + #define CL_CALLBACK +#endif + +typedef struct _cl_platform_id * cl_platform_id; +typedef struct _cl_device_id * cl_device_id; +typedef struct _cl_context * cl_context; +typedef struct _cl_command_queue * cl_command_queue; 
+typedef struct _cl_mem * cl_mem; +typedef struct _cl_program * cl_program; +typedef struct _cl_kernel * cl_kernel; +typedef struct _cl_event * cl_event; + +#if (defined (_WIN32) && defined(_MSC_VER)) +typedef signed __int32 cl_int; +typedef unsigned __int32 cl_uint; +typedef signed __int64 cl_long; +typedef unsigned __int64 cl_ulong; +typedef float cl_float; +typedef double cl_double; +#else +#include +typedef int32_t cl_int __attribute__((aligned(4))); +typedef uint32_t cl_uint __attribute__((aligned(4))); +typedef int64_t cl_long __attribute__((aligned(8))); +typedef uint64_t cl_ulong __attribute__((aligned(8))); +typedef float cl_float __attribute__((aligned(4))); +typedef double cl_double __attribute__((aligned(8))); +#endif + +typedef cl_uint cl_bool; +typedef cl_ulong cl_bitfield; +typedef cl_uint cl_device_info; +typedef cl_bitfield cl_device_type; +typedef cl_bitfield cl_command_queue_properties; + +typedef intptr_t cl_context_properties; +typedef cl_uint cl_context_info; +typedef cl_uint cl_mem_info; +typedef cl_bitfield cl_mem_flags; +typedef cl_uint cl_program_info; +typedef cl_uint cl_program_build_info; +typedef cl_uint cl_kernel_info; +typedef cl_uint cl_kernel_work_group_info; + +int load_libopencl(void); + +#define DEF_PROC(ret, name, args) typedef ret CL_API_CALL t##name args + +#include "libopencl.fn" + +#undef DEF_PROC + +#define DEF_PROC(ret, name, args) extern t##name *name + +#include "libopencl.fn" + +#undef DEF_PROC + +/* What follows is a bunch of defines from the official OpenCL spec. + * This allows us to build even if there are no OpenCL implementation + * present. 
*/ + +/* Error codes */ +#define CL_SUCCESS 0 +#define CL_DEVICE_NOT_FOUND -1 +#define CL_DEVICE_NOT_AVAILABLE -2 +#define CL_COMPILER_NOT_AVAILABLE -3 +#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 +#define CL_OUT_OF_RESOURCES -5 +#define CL_OUT_OF_HOST_MEMORY -6 +#define CL_PROFILING_INFO_NOT_AVAILABLE -7 +#define CL_MEM_COPY_OVERLAP -8 +#define CL_IMAGE_FORMAT_MISMATCH -9 +#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 +#define CL_BUILD_PROGRAM_FAILURE -11 +#define CL_MAP_FAILURE -12 +#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 +#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14 +#define CL_COMPILE_PROGRAM_FAILURE -15 +#define CL_LINKER_NOT_AVAILABLE -16 +#define CL_LINK_PROGRAM_FAILURE -17 +#define CL_DEVICE_PARTITION_FAILED -18 +#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19 + +#define CL_INVALID_VALUE -30 +#define CL_INVALID_DEVICE_TYPE -31 +#define CL_INVALID_PLATFORM -32 +#define CL_INVALID_DEVICE -33 +#define CL_INVALID_CONTEXT -34 +#define CL_INVALID_QUEUE_PROPERTIES -35 +#define CL_INVALID_COMMAND_QUEUE -36 +#define CL_INVALID_HOST_PTR -37 +#define CL_INVALID_MEM_OBJECT -38 +#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 +#define CL_INVALID_IMAGE_SIZE -40 +#define CL_INVALID_SAMPLER -41 +#define CL_INVALID_BINARY -42 +#define CL_INVALID_BUILD_OPTIONS -43 +#define CL_INVALID_PROGRAM -44 +#define CL_INVALID_PROGRAM_EXECUTABLE -45 +#define CL_INVALID_KERNEL_NAME -46 +#define CL_INVALID_KERNEL_DEFINITION -47 +#define CL_INVALID_KERNEL -48 +#define CL_INVALID_ARG_INDEX -49 +#define CL_INVALID_ARG_VALUE -50 +#define CL_INVALID_ARG_SIZE -51 +#define CL_INVALID_KERNEL_ARGS -52 +#define CL_INVALID_WORK_DIMENSION -53 +#define CL_INVALID_WORK_GROUP_SIZE -54 +#define CL_INVALID_WORK_ITEM_SIZE -55 +#define CL_INVALID_GLOBAL_OFFSET -56 +#define CL_INVALID_EVENT_WAIT_LIST -57 +#define CL_INVALID_EVENT -58 +#define CL_INVALID_OPERATION -59 +#define CL_INVALID_GL_OBJECT -60 +#define CL_INVALID_BUFFER_SIZE -61 +#define CL_INVALID_MIP_LEVEL -62 +#define 
CL_INVALID_GLOBAL_WORK_SIZE -63 +#define CL_INVALID_PROPERTY -64 +#define CL_INVALID_IMAGE_DESCRIPTOR -65 +#define CL_INVALID_COMPILER_OPTIONS -66 +#define CL_INVALID_LINKER_OPTIONS -67 +#define CL_INVALID_DEVICE_PARTITION_COUNT -68 +#define CL_INVALID_PIPE_SIZE -69 +#define CL_INVALID_DEVICE_QUEUE -70 + +#define CL_FALSE 0 +#define CL_TRUE 1 + +/* cl_device_type - bitfield */ +#define CL_DEVICE_TYPE_DEFAULT (1 << 0) +#define CL_DEVICE_TYPE_CPU (1 << 1) +#define CL_DEVICE_TYPE_GPU (1 << 2) +#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) +#define CL_DEVICE_TYPE_CUSTOM (1 << 4) +#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF + +/* cl_context_info */ +#define CL_CONTEXT_REFERENCE_COUNT 0x1080 +#define CL_CONTEXT_DEVICES 0x1081 +#define CL_CONTEXT_PROPERTIES 0x1082 +#define CL_CONTEXT_NUM_DEVICES 0x1083 + +/* cl_device_info */ +#define CL_DEVICE_TYPE 0x1000 +#define CL_DEVICE_VENDOR_ID 0x1001 +#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 +#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003 +#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004 +#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B +#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C +#define CL_DEVICE_ADDRESS_BITS 0x100D +#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E +#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F +#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010 +#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011 +#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012 +#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013 +#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014 +#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015 +#define CL_DEVICE_IMAGE_SUPPORT 0x1016 +#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017 +#define CL_DEVICE_MAX_SAMPLERS 0x1018 +#define 
CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019 +#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A +#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B +#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C +#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D +#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E +#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F +#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020 +#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021 +#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022 +#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023 +#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 +#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025 +#define CL_DEVICE_ENDIAN_LITTLE 0x1026 +#define CL_DEVICE_AVAILABLE 0x1027 +#define CL_DEVICE_COMPILER_AVAILABLE 0x1028 +#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029 +#define CL_DEVICE_QUEUE_PROPERTIES 0x102A /* deprecated */ +#define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES 0x102A +#define CL_DEVICE_NAME 0x102B +#define CL_DEVICE_VENDOR 0x102C +#define CL_DRIVER_VERSION 0x102D +#define CL_DEVICE_PROFILE 0x102E +#define CL_DEVICE_VERSION 0x102F +#define CL_DEVICE_EXTENSIONS 0x1030 +#define CL_DEVICE_PLATFORM 0x1031 +#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 +#define CL_DEVICE_HALF_FP_CONFIG 0x1033 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034 +#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 /* deprecated */ +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C +#define CL_DEVICE_OPENCL_C_VERSION 0x103D +#define CL_DEVICE_LINKER_AVAILABLE 0x103E +#define CL_DEVICE_BUILT_IN_KERNELS 0x103F +#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040 +#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041 +#define CL_DEVICE_PARENT_DEVICE 0x1042 +#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043 
+#define CL_DEVICE_PARTITION_PROPERTIES 0x1044 +#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045 +#define CL_DEVICE_PARTITION_TYPE 0x1046 +#define CL_DEVICE_REFERENCE_COUNT 0x1047 +#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048 +#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049 +#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A +#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B +#define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS 0x104C +#define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE 0x104D +#define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES 0x104E +#define CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE 0x104F +#define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE 0x1050 +#define CL_DEVICE_MAX_ON_DEVICE_QUEUES 0x1051 +#define CL_DEVICE_MAX_ON_DEVICE_EVENTS 0x1052 +#define CL_DEVICE_SVM_CAPABILITIES 0x1053 +#define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE 0x1054 +#define CL_DEVICE_MAX_PIPE_ARGS 0x1055 +#define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS 0x1056 +#define CL_DEVICE_PIPE_MAX_PACKET_SIZE 0x1057 +#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT 0x1058 +#define CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT 0x1059 +#define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT 0x105A +#define CL_DEVICE_IL_VERSION 0x105B +#define CL_DEVICE_MAX_NUM_SUB_GROUPS 0x105C +#define CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS 0x105D + +/* cl_command_queue_properties - bitfield */ +#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0) +#define CL_QUEUE_PROFILING_ENABLE (1 << 1) +#define CL_QUEUE_ON_DEVICE (1 << 2) +#define CL_QUEUE_ON_DEVICE_DEFAULT (1 << 3) + +/* cl_mem_info */ +#define CL_MEM_TYPE 0x1100 +#define CL_MEM_FLAGS 0x1101 +#define CL_MEM_SIZE 0x1102 +#define CL_MEM_HOST_PTR 0x1103 +#define CL_MEM_MAP_COUNT 0x1104 +#define CL_MEM_REFERENCE_COUNT 0x1105 +#define CL_MEM_CONTEXT 0x1106 +#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107 +#define CL_MEM_OFFSET 0x1108 +#define CL_MEM_USES_SVM_POINTER 0x1109 + +/* cl_context_properties */ +#define CL_CONTEXT_PLATFORM 0x1084 +#define 
CL_CONTEXT_INTEROP_USER_SYNC 0x1085 + +/* cl_mem_flags and cl_svm_mem_flags - bitfield */ +#define CL_MEM_READ_WRITE (1 << 0) +#define CL_MEM_WRITE_ONLY (1 << 1) +#define CL_MEM_READ_ONLY (1 << 2) +#define CL_MEM_USE_HOST_PTR (1 << 3) +#define CL_MEM_ALLOC_HOST_PTR (1 << 4) +#define CL_MEM_COPY_HOST_PTR (1 << 5) +/* reserved (1 << 6) */ +#define CL_MEM_HOST_WRITE_ONLY (1 << 7) +#define CL_MEM_HOST_READ_ONLY (1 << 8) +#define CL_MEM_HOST_NO_ACCESS (1 << 9) +#define CL_MEM_SVM_FINE_GRAIN_BUFFER (1 << 10) /* used by cl_svm_mem_flags only */ +#define CL_MEM_SVM_ATOMICS (1 << 11) /* used by cl_svm_mem_flags only */ +#define CL_MEM_KERNEL_READ_AND_WRITE (1 << 12) + +/* cl_program_build_info */ +#define CL_PROGRAM_BUILD_STATUS 0x1181 +#define CL_PROGRAM_BUILD_OPTIONS 0x1182 +#define CL_PROGRAM_BUILD_LOG 0x1183 +#define CL_PROGRAM_BINARY_TYPE 0x1184 +#define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185 + +/* cl_kernel_info */ +#define CL_KERNEL_FUNCTION_NAME 0x1190 +#define CL_KERNEL_NUM_ARGS 0x1191 +#define CL_KERNEL_REFERENCE_COUNT 0x1192 +#define CL_KERNEL_CONTEXT 0x1193 +#define CL_KERNEL_PROGRAM 0x1194 +#define CL_KERNEL_ATTRIBUTES 0x1195 +#define CL_KERNEL_MAX_NUM_SUB_GROUPS 0x11B9 +#define CL_KERNEL_COMPILE_NUM_SUB_GROUPS 0x11BA + +/* cl_program_info */ +#define CL_PROGRAM_REFERENCE_COUNT 0x1160 +#define CL_PROGRAM_CONTEXT 0x1161 +#define CL_PROGRAM_NUM_DEVICES 0x1162 +#define CL_PROGRAM_DEVICES 0x1163 +#define CL_PROGRAM_SOURCE 0x1164 +#define CL_PROGRAM_BINARY_SIZES 0x1165 +#define CL_PROGRAM_BINARIES 0x1166 +#define CL_PROGRAM_NUM_KERNELS 0x1167 +#define CL_PROGRAM_KERNEL_NAMES 0x1168 +#define CL_PROGRAM_IL 0x1169 + +/* cl_kernel_work_group_info */ +#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0 +#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1 +#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2 +#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3 +#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4 +#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5 + +#endif diff --git 
a/src/private_opencl.h b/src/private_opencl.h index 34ae92906d..e40242d57e 100644 --- a/src/private_opencl.h +++ b/src/private_opencl.h @@ -3,11 +3,7 @@ #include "private.h" -#ifdef __APPLE__ -#include -#else -#include -#endif +#include "loaders/libopencl.h" #ifdef DEBUG #include From b2505428b1942407caa624d7f7a8c170318ef943 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 11 Nov 2016 17:13:12 -0500 Subject: [PATCH 075/597] Add loader for nccl --- src/CMakeLists.txt | 56 +--------------------------- src/gpuarray_buffer_cuda.c | 7 ---- src/gpuarray_collectives_cuda_nccl.c | 20 +++++++++- src/loaders/CMakeLists.txt | 1 + src/loaders/libnccl.c | 41 ++++++++++++++++++++ src/loaders/libnccl.fn | 12 ++++++ src/loaders/libnccl.h | 42 +++++++++++++++++++++ 7 files changed, 117 insertions(+), 62 deletions(-) create mode 100644 src/loaders/libnccl.c create mode 100644 src/loaders/libnccl.fn create mode 100644 src/loaders/libnccl.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9050eb97a6..cb802ad675 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,16 +5,6 @@ if(CMAKE_COMPILER_IS_GNUCC) add_definitions(-Wdeclaration-after-statement) endif() -find_package(CUDA REQUIRED) -find_package(NCCL) - -find_package(OpenCL REQUIRED) -find_package(clBLAS) -if(NOT CLBLAS_FOUND) - find_package(CLBlast) -endif() - - include_directories("${CMAKE_CURRENT_SOURCE_DIR}") add_custom_command( @@ -60,6 +50,7 @@ gpuarray_elemwise.c gpuarray_reduction.c gpuarray_buffer_cuda.c gpuarray_blas_cuda_cublas.c +gpuarray_collectives_cuda_nccl.c gpuarray_buffer_opencl.c ) @@ -78,44 +69,6 @@ if(NOT HAVE_MKSTEMP) list(APPEND _GPUARRAY_SRC gpuarray_mkstemp.c) endif() -if (CUDA_VERSION_MAJOR LESS 7) - message( FATAL_ERROR "This package requires CUDA 7.0 or more. 
Found version ${CUDA_VERSION_STRING}") -endif() - -if (APPLE) - FIND_PATH(CUDADRV_INCLUDE CUDA/cuda.h) - # this is somewhat a hack, but otherwise cublas_v2.h isn't found - set(CUDADRV_INCLUDE ${CUDADRV_INCLUDE} ${CUDA_TOOLKIT_INCLUDE}) -endif() -if(NOT CUDADRV_INCLUDE) - set(CUDADRV_INCLUDE ${CUDA_TOOLKIT_INCLUDE}) -endif() - -include_directories(${CUDADRV_INCLUDE}) - - -if(NCCL_FOUND) - message(STATUS "Building with NCCL") - set(BUILD_WITH_COLLECTIVES 1 PARENT_SCOPE) - add_definitions(-DWITH_CUDA_NCCL) - list(APPEND _GPUARRAY_SRC gpuarray_collectives_cuda_nccl.c) - include_directories(${NCCL_INCLUDE_DIR}) -endif() - -include_directories(${OpenCL_INCLUDE_DIRS}) - -if(CLBLAS_FOUND) - message(STATUS "Building with CLBLAS") - list(APPEND _GPUARRAY_SRC gpuarray_blas_opencl_clblas.c) - add_definitions(-DWITH_OPENCL_CLBLAS) - include_directories(${CLBLAS_INCLUDE_DIRS}) -elseif(CLBLAS_FOUND) - message(STATUS "Building with CLBLAST") - list(APPEND _GPUARRAY_SRC gpuarray_blas_opencl_clblast.c) - add_definitions(-DWITH_OPENCL_CLBLAST) - include_directories(${CLBLAST_INCLUDE_DIRS}) -endif() - configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/private_config.h.in ${CMAKE_CURRENT_SOURCE_DIR}/private_config.h @@ -139,12 +92,7 @@ add_library(gpuarray-static STATIC ${GPUARRAY_SRC}) target_link_libraries(gpuarray ${CMAKE_DL_LIBS}) target_link_libraries(gpuarray-static ${CMAKE_DL_LIBS}) -if (NCCL_FOUND) - target_link_libraries(gpuarray ${NCCL_LIBRARY}) - target_link_libraries(gpuarray-static ${NCCL_LIBRARY}) -endif() - -if (CLBLAS_FOUND) +if (BUILD_WITH_CLBLAS) target_link_libraries(gpuarray ${CLBLAS_LIBRARIES}) target_link_libraries(gpuarray-static ${CLBLAS_LIBRARIES}) endif() diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index d2354f831b..aaf2c66ce1 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1298,9 +1298,7 @@ static int cuda_transfer(gpudata *dst, size_t dstoff, } extern gpuarray_blas_ops cublas_ops; -#ifdef WITH_CUDA_NCCL extern 
gpuarray_comm_ops nccl_ops; -#endif // WITH_CUDA_NCCL static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, void *res) { @@ -1453,13 +1451,8 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, return GA_NO_ERROR; case GA_CTX_PROP_COMM_OPS: -#ifdef WITH_CUDA_NCCL *((gpuarray_comm_ops**)res) = &nccl_ops; return GA_NO_ERROR; -#else - *((void**) res) = NULL; - return GA_DEVSUP_ERROR; -#endif // WITH_CUDA_NCCL case GA_CTX_PROP_BIN_ID: *((const char **)res) = ctx->bin_id; diff --git a/src/gpuarray_collectives_cuda_nccl.c b/src/gpuarray_collectives_cuda_nccl.c index 3a15156c82..a0f6d12060 100644 --- a/src/gpuarray_collectives_cuda_nccl.c +++ b/src/gpuarray_collectives_cuda_nccl.c @@ -2,7 +2,7 @@ #include #include -#include +#include "loaders/libnccl.h" #include "gpuarray/buffer_collectives.h" #include "gpuarray/config.h" @@ -59,6 +59,19 @@ struct _gpucomm { #endif }; +static int setup_done = 0; + +static int setup_lib(void) { + int err; + if (setup_done) + return GA_NO_ERROR; + err = load_libnccl(); + if (err != GA_NO_ERROR) + return err; + setup_done = 1; + return GA_NO_ERROR; +} + /** * \brief Helper function to dereference a `comm`'s context and free memory */ @@ -77,6 +90,9 @@ static int comm_new(gpucomm** comm_ptr, gpucontext* ctx, ncclResult_t nccl_err; ASSERT_CTX(ctx); + + GA_CHECK(setup_lib()); + comm = calloc(1, sizeof(*comm)); // Allocate memory if (comm == NULL) { *comm_ptr = NULL; // Set to NULL if failed @@ -115,6 +131,8 @@ static void comm_free(gpucomm* comm) { */ static int generate_clique_id(gpucontext* c, gpucommCliqueId* comm_id) { ASSERT_CTX(c); + + GA_CHECK(setup_lib()); NCCL_CHKFAIL(c, ncclGetUniqueId((ncclUniqueId*)comm_id)); } diff --git a/src/loaders/CMakeLists.txt b/src/loaders/CMakeLists.txt index 86cc9ed7ea..5f6aa9231b 100644 --- a/src/loaders/CMakeLists.txt +++ b/src/loaders/CMakeLists.txt @@ -3,5 +3,6 @@ dyn_load.c libcuda.c libnvrtc.c libcublas.c +libnccl.c libopencl.c ) diff --git 
a/src/loaders/libnccl.c b/src/loaders/libnccl.c new file mode 100644 index 0000000000..0b68759844 --- /dev/null +++ b/src/loaders/libnccl.c @@ -0,0 +1,41 @@ +#include + +#include "libnccl.h" +#include "dyn_load.h" +#include "gpuarray/error.h" + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +static const char libname[] = "nccl.dll"; +#else /* Unix */ +static const char libname[] = "libnccl.so"; +#endif + +#define DEF_PROC(ret, name, args) t##name *name + +#include "libnccl.fn" + +#undef DEF_PROC + +#define DEF_PROC(ret, name, args) \ + name = (t##name *)ga_func_ptr(lib, #name); \ + if (name == NULL) { \ + return GA_LOAD_ERROR; \ + } + +static int loaded = 0; + +int load_libnccl(void) { + void *lib; + + if (loaded) + return GA_NO_ERROR; + + lib = ga_load_library(libname); + if (lib == NULL) + return GA_LOAD_ERROR; + + #include "libnccl.fn" + + loaded = 1; + return GA_NO_ERROR; +} diff --git a/src/loaders/libnccl.fn b/src/loaders/libnccl.fn new file mode 100644 index 0000000000..64de5dd88e --- /dev/null +++ b/src/loaders/libnccl.fn @@ -0,0 +1,12 @@ +DEF_PROC(ncclResult_t, ncclGetUniqueId, (ncclUniqueId* uniqueId)); +DEF_PROC(ncclResult_t, ncclCommInitRank, (ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank)); +DEF_PROC(void, ncclCommDestroy, (ncclComm_t comm)); +DEF_PROC(ncclResult_t, ncclCommCount, (const ncclComm_t comm, int* count)); +DEF_PROC(ncclResult_t, ncclCommUserRank, (const ncclComm_t comm, int* rank)); +DEF_PROC(const char*, ncclGetErrorString, (ncclResult_t result)); +DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)); +DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, int count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream)); +DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, int recvcount, ncclDataType_t 
datatype, ncclRedOp_t op, ncclComm_t comm, +cudaStream_t stream)); +DEF_PROC(ncclResult_t, ncclBcast, (void* buff, int count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream)); +DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, int count, ncclDataType_t datatype, void* recvbuff, ncclComm_t comm, cudaStream_t stream)); \ No newline at end of file diff --git a/src/loaders/libnccl.h b/src/loaders/libnccl.h new file mode 100644 index 0000000000..d8aac387e3 --- /dev/null +++ b/src/loaders/libnccl.h @@ -0,0 +1,42 @@ +#ifndef LOADER_LIBNCCL_H +#define LOADER_LIBNCCL_H + +typedef struct CUstream_st *cudaStream_t; +typedef struct ncclComm* ncclComm_t; + +#define NCCL_UNIQUE_ID_BYTES 128 +typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; + +typedef enum { ncclSuccess = 0 } ncclResult_t; + +typedef enum { ncclSum = 0, + ncclProd = 1, + ncclMax = 2, + ncclMin = 3, + nccl_NUM_OPS = 4 } ncclRedOp_t; + +/* Data types */ +typedef enum { ncclChar = 0, + ncclInt = 1, + ncclHalf = 2, + ncclFloat = 3, + ncclDouble = 4, + ncclInt64 = 5, + ncclUint64 = 6, + nccl_NUM_TYPES = 7 } ncclDataType_t; + +int load_libnccl(void); + +#define DEF_PROC(ret, name, args) typedef ret t##name args + +#include "libnccl.fn" + +#undef DEF_PROC + +#define DEF_PROC(ret, name, args) extern t##name *name + +#include "libnccl.fn" + +#undef DEF_PROC + +#endif From a642e33900c956a01746d29b644290067db43b62 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 11 Nov 2016 17:14:07 -0500 Subject: [PATCH 076/597] Remove/adjust comments. 
--- src/loaders/libcublas.c | 2 -- src/loaders/libcuda.c | 4 ++-- src/loaders/libnvrtc.c | 3 --- src/loaders/libopencl.c | 3 +-- 4 files changed, 3 insertions(+), 9 deletions(-) diff --git a/src/loaders/libcublas.c b/src/loaders/libcublas.c index afa81f440f..dedfd20927 100644 --- a/src/loaders/libcublas.c +++ b/src/loaders/libcublas.c @@ -3,8 +3,6 @@ #include "libcublas.h" #include "dyn_load.h" #include "gpuarray/error.h" -/* This code is strongly inspired from the dynamic loading code in the - * samples */ #define DEF_PROC(name, args) t##name *name #define DEF_PROC_V2(name, args) DEF_PROC(name, args) diff --git a/src/loaders/libcuda.c b/src/loaders/libcuda.c index 6cf7798c20..ab8d149c89 100644 --- a/src/loaders/libcuda.c +++ b/src/loaders/libcuda.c @@ -3,8 +3,8 @@ #include "libcuda.h" #include "dyn_load.h" #include "gpuarray/error.h" -/* This code is strongly inspired from the dynamic loading code in the - * samples */ + +/* This code is inspired from the dynamic loading code in the samples */ #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) static char libname[] = "nvcuda.dll"; #else /* Unix */ diff --git a/src/loaders/libnvrtc.c b/src/loaders/libnvrtc.c index 5fdf24f4e2..89e14cad09 100644 --- a/src/loaders/libnvrtc.c +++ b/src/loaders/libnvrtc.c @@ -5,9 +5,6 @@ #include "dyn_load.h" #include "gpuarray/error.h" -/* This code is strongly inspired from the dynamic loading code in the - * samples */ - #define DEF_PROC(name, args) t##name *name #include "libnvrtc.fn" diff --git a/src/loaders/libopencl.c b/src/loaders/libopencl.c index ce0e067de4..ec1f89614a 100644 --- a/src/loaders/libopencl.c +++ b/src/loaders/libopencl.c @@ -3,8 +3,7 @@ #include "libopencl.h" #include "dyn_load.h" #include "gpuarray/error.h" -/* This code is strongly inspired from the dynamic loading code in the - * samples */ + #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) static char libname[] = "OpenCL.dll"; #else /* Unix */ From 
ea379bf4113f3ffbb76bb56cdf910bce1e213596 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 11 Nov 2016 17:50:07 -0500 Subject: [PATCH 077/597] Add loader for clBLAS. --- src/CMakeLists.txt | 6 +---- src/gpuarray_blas_opencl_clblas.c | 4 ++- src/gpuarray_buffer_opencl.c | 9 ------- src/loaders/CMakeLists.txt | 1 + src/loaders/libclblas.c | 41 +++++++++++++++++++++++++++++++ src/loaders/libclblas.fn | 8 ++++++ src/loaders/libclblas.h | 36 +++++++++++++++++++++++++++ 7 files changed, 90 insertions(+), 15 deletions(-) create mode 100644 src/loaders/libclblas.c create mode 100644 src/loaders/libclblas.fn create mode 100644 src/loaders/libclblas.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cb802ad675..a50ee0d477 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -52,6 +52,7 @@ gpuarray_buffer_cuda.c gpuarray_blas_cuda_cublas.c gpuarray_collectives_cuda_nccl.c gpuarray_buffer_opencl.c +gpuarray_blas_opencl_clblas.c ) check_function_exists(strlcat HAVE_STRL) @@ -92,11 +93,6 @@ add_library(gpuarray-static STATIC ${GPUARRAY_SRC}) target_link_libraries(gpuarray ${CMAKE_DL_LIBS}) target_link_libraries(gpuarray-static ${CMAKE_DL_LIBS}) -if (BUILD_WITH_CLBLAS) - target_link_libraries(gpuarray ${CLBLAS_LIBRARIES}) - target_link_libraries(gpuarray-static ${CLBLAS_LIBRARIES}) -endif() - set(headers gpuarray/array.h gpuarray/blas.h diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 0c13a70fef..ecd69e1bc2 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -1,7 +1,7 @@ #include "private.h" #include "private_opencl.h" -#include +#include "loaders/libclblas.h" #include "gpuarray/buffer_blas.h" #include "gpuarray/error.h" @@ -36,6 +36,8 @@ static int setup(gpucontext *ctx) { clblasStatus err; if (refcnt == 0) { + GA_CHECK(load_libclblas()); + err = clblasSetup(); if (err != clblasSuccess) return GA_BLAS_ERROR; diff --git a/src/gpuarray_buffer_opencl.c 
b/src/gpuarray_buffer_opencl.c index f7e2d4d8ab..c1a6f16743 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -1125,13 +1125,10 @@ static int cl_transfer(gpudata *dst, size_t dstoff, return GA_UNSUPPORTED_ERROR; } -#ifdef WITH_OPENCL_CLBLAS extern gpuarray_blas_ops clblas_ops; -#else #ifdef WITH_OPENCL_CLBLAST extern gpuarray_blas_ops clblast_ops; #endif -#endif static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, void *res) { @@ -1258,17 +1255,11 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, return GA_NO_ERROR; case GA_CTX_PROP_BLAS_OPS: -#ifdef WITH_OPENCL_CLBLAS *((gpuarray_blas_ops **)res) = &clblas_ops; return GA_NO_ERROR; -#else #ifdef WITH_OPENCL_CLBLAST *((gpuarray_blas_ops **)res) = &clblast_ops; return GA_NO_ERROR; -#else - *((void **)res) = NULL; - return GA_DEVSUP_ERROR; -#endif #endif case GA_CTX_PROP_COMM_OPS: diff --git a/src/loaders/CMakeLists.txt b/src/loaders/CMakeLists.txt index 5f6aa9231b..de6c0fec76 100644 --- a/src/loaders/CMakeLists.txt +++ b/src/loaders/CMakeLists.txt @@ -5,4 +5,5 @@ libnvrtc.c libcublas.c libnccl.c libopencl.c +libclblas.c ) diff --git a/src/loaders/libclblas.c b/src/loaders/libclblas.c new file mode 100644 index 0000000000..00b0ead5a4 --- /dev/null +++ b/src/loaders/libclblas.c @@ -0,0 +1,41 @@ +#include + +#include "libclblas.h" +#include "dyn_load.h" +#include "gpuarray/error.h" + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +static const char libname[] = "clBLAS.dll"; +#else /* Unix */ +static const char libname[] = "libclBLAS.so"; +#endif + +#define DEF_PROC(ret, name, args) t##name *name + +#include "libclblas.fn" + +#undef DEF_PROC + +#define DEF_PROC(ret, name, args) \ + name = (t##name *)ga_func_ptr(lib, #name); \ + if (name == NULL) { \ + return GA_LOAD_ERROR; \ + } + +static int loaded = 0; + +int load_libclblas(void) { + void *lib; + + if (loaded) + return GA_NO_ERROR; + + lib = 
ga_load_library(libname); + if (lib == NULL) + return GA_LOAD_ERROR; + + #include "libclblas.fn" + + loaded = 1; + return GA_NO_ERROR; +} diff --git a/src/loaders/libclblas.fn b/src/loaders/libclblas.fn new file mode 100644 index 0000000000..2ab7f5b2af --- /dev/null +++ b/src/loaders/libclblas.fn @@ -0,0 +1,8 @@ +DEF_PROC(clblasStatus, clblasSetup, (void)); +DEF_PROC(void, clblasTeardown, (void)); +DEF_PROC(clblasStatus, clblasSgemm, (clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); +DEF_PROC(clblasStatus, clblasDgemm, (clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); +DEF_PROC(clblasStatus, clblasSgemv, (clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); +DEF_PROC(clblasStatus, clblasDgemv, (clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, 
cl_event *events)); +DEF_PROC(clblasStatus, clblasSger, (clblasOrder order, size_t M, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); +DEF_PROC(clblasStatus, clblasDger, (clblasOrder order, size_t M, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); \ No newline at end of file diff --git a/src/loaders/libclblas.h b/src/loaders/libclblas.h new file mode 100644 index 0000000000..44a7cb7911 --- /dev/null +++ b/src/loaders/libclblas.h @@ -0,0 +1,36 @@ +#ifndef LOADER_LIBCLBLAS_H +#define LOADER_LIBCLBLAS_H + +#include "libopencl.h" + +typedef enum clblasOrder_ { + clblasRowMajor, + clblasColumnMajor +} clblasOrder; + +typedef enum clblasTranspose_ { + clblasNoTrans, + clblasTrans, + clblasConjTrans +} clblasTranspose; + +typedef enum clblasStatus_ { + clblasSuccess = CL_SUCCESS, + /* Rest is not exposed from here */ +} clblasStatus; + +int load_libclblas(void); + +#define DEF_PROC(ret, name, args) typedef ret t##name args + +#include "libclblas.fn" + +#undef DEF_PROC + +#define DEF_PROC(ret, name, args) extern t##name *name + +#include "libclblas.fn" + +#undef DEF_PROC + +#endif From 02563bd14acaeb414514821a31af3b32c508b92d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 11 Nov 2016 18:41:58 -0500 Subject: [PATCH 078/597] Adjust the tests to match changes in the build system. 
--- tests/CMakeLists.txt | 113 +++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 58 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ef772752de..2f3eb801cf 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,3 +1,4 @@ +include(CheckSymbolExists) find_package(PkgConfig) pkg_search_module(CHECK check) @@ -72,64 +73,60 @@ add_executable(check_buffer main.c device.c check_buffer.c) target_link_libraries(check_buffer ${CHECK_LIBRARIES} gpuarray) add_test(test_buffer "${CMAKE_CURRENT_BINARY_DIR}/check_buffer") -if(BUILD_WITH_COLLECTIVES) - - find_package(MPI) - - if(MPI_C_FOUND) - - add_executable(check_buffer_collectives - main.c device.c communicator.c check_buffer_collectives.c - ) - target_link_libraries(check_buffer_collectives - ${CHECK_LIBRARIES} ${MPI_C_LIBRARIES} gpuarray - ) - target_include_directories(check_buffer_collectives - PRIVATE ${MPI_C_INCLUDE_PATH} - ) - - add_executable(check_collectives - main.c device.c communicator.c check_collectives.c - ) - target_link_libraries(check_collectives - ${CHECK_LIBRARIES} ${MPI_C_LIBRARIES} gpuarray - ) - target_include_directories(check_collectives - PRIVATE ${MPI_C_INCLUDE_PATH} - ) - - set_target_properties(check_buffer_collectives check_collectives PROPERTIES - COMPILE_DEFINITIONS TEST_COLLECTIVES - COMPILE_FLAGS "${MPI_C_COMPILE_FLAGS}" - LINK_FLAGS "${MPI_C_LINK_FLAGS}" - ) - - set(_NUM_DEVS $ENV{NUM_DEVS}) - if(NOT _NUM_DEVS) - set(_NUM_DEVS 1) - endif() - - set(_DEV_NAMES $ENV{DEV_NAMES}) - if(NOT _DEV_NAMES) - set(_DEV_NAMES "cuda") - endif() - separate_arguments(_DEV_NAMES) - - add_test(NAME test_buffer_collectives - COMMAND "${MPIEXEC}" ${MPIEXEC_NUMPROC_FLAG} ${_NUM_DEVS} ${MPIEXEC_PREFLAGS} - "${CMAKE_CURRENT_BINARY_DIR}/check_buffer_collectives" ${MPIEXEC_POSTFLAGS} ${_DEV_NAMES}) - add_test(NAME test_collectives - COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${_NUM_DEVS} ${MPIEXEC_PREFLAGS} - 
"${CMAKE_CURRENT_BINARY_DIR}/check_collectives" ${MPIEXEC_POSTFLAGS} ${_DEV_NAMES}) - - else(MPI_C_FOUND) - - message(WARNING "Cannot find MPI") - message(WARNING "Checks on collectives and buffer_collectives will not be built or performed.") - - endif(MPI_C_FOUND) - -endif(BUILD_WITH_COLLECTIVES) +find_package(MPI) + +if (MPI_C_FOUND) + + add_executable(check_buffer_collectives + main.c device.c communicator.c check_buffer_collectives.c + ) + target_link_libraries(check_buffer_collectives + ${CHECK_LIBRARIES} ${MPI_C_LIBRARIES} gpuarray + ) + target_include_directories(check_buffer_collectives + PRIVATE ${MPI_C_INCLUDE_PATH} + ) + + add_executable(check_collectives + main.c device.c communicator.c check_collectives.c + ) + target_link_libraries(check_collectives + ${CHECK_LIBRARIES} ${MPI_C_LIBRARIES} gpuarray + ) + target_include_directories(check_collectives + PRIVATE ${MPI_C_INCLUDE_PATH} + ) + + set_target_properties(check_buffer_collectives check_collectives PROPERTIES + COMPILE_DEFINITIONS TEST_COLLECTIVES + COMPILE_FLAGS "${MPI_C_COMPILE_FLAGS}" + LINK_FLAGS "${MPI_C_LINK_FLAGS}" + ) + + set(_NUM_DEVS $ENV{NUM_DEVS}) + if(NOT _NUM_DEVS) + set(_NUM_DEVS 1) + endif() + + set(_DEV_NAMES $ENV{DEV_NAMES}) + if(NOT _DEV_NAMES) + set(_DEV_NAMES "cuda") + endif() + separate_arguments(_DEV_NAMES) + + add_test(NAME test_buffer_collectives + COMMAND "${MPIEXEC}" ${MPIEXEC_NUMPROC_FLAG} ${_NUM_DEVS} ${MPIEXEC_PREFLAGS} + "${CMAKE_CURRENT_BINARY_DIR}/check_buffer_collectives" ${MPIEXEC_POSTFLAGS} ${_DEV_NAMES}) + add_test(NAME test_collectives + COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${_NUM_DEVS} ${MPIEXEC_PREFLAGS} + "${CMAKE_CURRENT_BINARY_DIR}/check_collectives" ${MPIEXEC_POSTFLAGS} ${_DEV_NAMES}) + +else() + + message(WARNING "Cannot find MPI") + message(WARNING "Checks on collectives and buffer_collectives will not be built or performed.") + +endif() ELSE(CHECK_FOUND) From 2782dd00205168fc27e8c7716113e373a8900ac5 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron 
Date: Mon, 14 Nov 2016 14:08:53 -0500 Subject: [PATCH 079/597] Add an implementation of float_to_half. --- src/private.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/src/private.h b/src/private.h index 064dc7a04e..730549e841 100644 --- a/src/private.h +++ b/src/private.h @@ -264,6 +264,53 @@ GPUARRAY_LOCAL void gpukernel_source_with_line_numbers(unsigned int count, size_t *newl, strb *src); +static uint16_t float_to_half(float value) { + static const int shift = 13; + static const int shiftSign = 16; + + static const int32_t infN = 0x7F800000; // flt32 infinity + static const int32_t maxN = 0x477FE000; // max flt16 normal as a flt32 + static const int32_t minN = 0x38800000; // min flt16 normal as a flt32 + static const int32_t signN = 0x80000000; // flt32 sign bit + + static const int32_t infC = infN >> shift; + static const int32_t nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32 + static const int32_t maxC = maxN >> shift; + static const int32_t minC = minN >> shift; + static const int32_t signC = signN >> shiftSign; // flt16 sign bit + + static const int32_t mulN = 0x52000000; // (1 << 23) / minN + static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) + + static const int32_t subC = 0x003FF; // max flt32 subnormal down shifted + static const int32_t norC = 0x00400; // min flt32 normal down shifted + + static const int32_t maxD = infC - maxC - 1; + static const int32_t minD = minC - subC - 1; + + union { + float f; + int32_t si; + uint32_t ui; + } v, s; + + uint32_t sign; + + v.f = value; + sign = v.si & signN; + v.si ^= sign; + sign >>= shiftSign; // logical shift + s.si = mulN; + s.si = s.f * v.f; // correct subnormals + v.si ^= (s.si ^ v.si) & -(minN > v.si); + v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); + v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); + v.ui >>= shift; // logical shift + v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); + v.si ^= ((v.si - 
minD) ^ v.si) & -(v.si > subC); + return v.ui | sign; +} + #define ISSET(v, fl) ((v) & (fl)) #define ISCLR(v, fl) (!((v) & (fl))) From dbacef66209619b17056fcfce01eaf3e439b6298 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 14 Nov 2016 14:23:31 -0500 Subject: [PATCH 080/597] Add loader for clblast. --- src/gpuarray_blas_opencl_clblas.c | 2 -- src/gpuarray_blas_opencl_clblast.c | 20 +++++++-------- src/gpuarray_buffer_opencl.c | 19 ++++++++------ src/loaders/CMakeLists.txt | 1 + src/loaders/libclblast.c | 41 ++++++++++++++++++++++++++++++ src/loaders/libclblast.fn | 9 +++++++ src/loaders/libclblast.h | 36 ++++++++++++++++++++++++++ 7 files changed, 108 insertions(+), 20 deletions(-) create mode 100644 src/loaders/libclblast.c create mode 100644 src/loaders/libclblast.fn create mode 100644 src/loaders/libclblast.h diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index ecd69e1bc2..837a74af9b 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -36,8 +36,6 @@ static int setup(gpucontext *ctx) { clblasStatus err; if (refcnt == 0) { - GA_CHECK(load_libclblas()); - err = clblasSetup(); if (err != clblasSuccess) return GA_BLAS_ERROR; diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c index 22eda8ed14..91ad847693 100644 --- a/src/gpuarray_blas_opencl_clblast.c +++ b/src/gpuarray_blas_opencl_clblast.c @@ -1,7 +1,7 @@ #include "private.h" #include "private_opencl.h" -#include +#include "loader/libclblast.h" #include "gpuarray/buffer_blas.h" #include "gpuarray/error.h" @@ -68,8 +68,8 @@ static int hgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, ARRAY_INIT(B[i]); ARRAY_INIT(C[i]); err = CLBlastHgemm(convO(order), convT(transA), convT(transB), M, N, K, - (half)alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, - (half)beta, C[i]->buf, offB[i], ldc, 1, &ctx->q, &ev); + float_to_half(alpha), A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, 
+ float_to_half(beta), C[i]->buf, offB[i], ldc, 1, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; ARRAY_FINI(A[i]); @@ -205,9 +205,9 @@ static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, ARRAY_INIT(X); ARRAY_INIT(Y); - err = CLBlastHgemv(convO(order), convT(transA), M, N, (half)alpha, - A->buf, offA, lda, X->buf, offX, incX, - (half)beta, Y->buf, offY, incY, 1, &ctx->q, &ev); + err = CLBlastHgemv(convO(order), convT(transA), M, N, float_to_half(alpha), + A->buf, offA, lda, X->buf, offX, incX, + float_to_half(beta), Y->buf, offY, incY, 1, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; @@ -291,8 +291,8 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, ARRAY_INIT(C); err = CLBlastHgemm(convO(order), convT(transA), convT(transB), M, N, K, - (half)alpha, A->buf, offA, lda, B->buf, offB, ldb, - (half)beta, C->buf, offC, ldc, 1, &ctx->q, &ev); + float_to_half(alpha), A->buf, offA, lda, B->buf, offB, ldb, + float_to_half(beta), C->buf, offC, ldc, 1, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; @@ -376,8 +376,8 @@ static int hger(cb_order order, size_t M, size_t N, float alpha, ARRAY_INIT(Y); ARRAY_INIT(A); - err = CLBlastHger(convO(order), M, N, (half)alpha, X->buf, offX, incX, - Y->buf, offY, incY, A->buf, offA, lda, 1, &ctx->q, &ev); + err = CLBlastHger(convO(order), M, N, float_to_half(alpha), X->buf, offX, incX, + Y->buf, offY, incY, A->buf, offA, lda, 1, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index c1a6f16743..ee646a8985 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -12,6 +12,9 @@ #include #include +#include "loader/libclblas.h" +#include "loader/libclblast.h" + #ifdef _MSC_VER #define strdup _strdup #endif @@ -1126,9 +1129,7 @@ static int cl_transfer(gpudata *dst, size_t dstoff, } extern gpuarray_blas_ops clblas_ops; -#ifdef WITH_OPENCL_CLBLAST extern 
gpuarray_blas_ops clblast_ops; -#endif static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, void *res) { @@ -1255,12 +1256,14 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, return GA_NO_ERROR; case GA_CTX_PROP_BLAS_OPS: - *((gpuarray_blas_ops **)res) = &clblas_ops; - return GA_NO_ERROR; -#ifdef WITH_OPENCL_CLBLAST - *((gpuarray_blas_ops **)res) = &clblast_ops; - return GA_NO_ERROR; -#endif + { + int e; + if ((e = load_libclblas()) == GA_NO_ERROR) + *((gpuarray_blas_ops **)res) = &clblas_ops; + if ((e = load_libclblast()) == GA_NO_ERROR) + *((gpuarray_blas_ops **)res) = &clblast_ops; + return e; + } case GA_CTX_PROP_COMM_OPS: // TODO Complete in the future whenif a multi-gpu collectives API for diff --git a/src/loaders/CMakeLists.txt b/src/loaders/CMakeLists.txt index de6c0fec76..861349dfda 100644 --- a/src/loaders/CMakeLists.txt +++ b/src/loaders/CMakeLists.txt @@ -6,4 +6,5 @@ libcublas.c libnccl.c libopencl.c libclblas.c +libclblast.c ) diff --git a/src/loaders/libclblast.c b/src/loaders/libclblast.c new file mode 100644 index 0000000000..1a9ba5715d --- /dev/null +++ b/src/loaders/libclblast.c @@ -0,0 +1,41 @@ +#include + +#include "libclblast.h" +#include "dyn_load.h" +#include "gpuarray/error.h" + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +static const char libname[] = "clBLAST.dll"; +#else /* Unix */ +static const char libname[] = "libclBLAST.so"; +#endif + +#define DEF_PROC(ret, name, args) t##name *name + +#include "libclblast.fn" + +#undef DEF_PROC + +#define DEF_PROC(ret, name, args) \ + name = (t##name *)ga_func_ptr(lib, #name); \ + if (name == NULL) { \ + return GA_LOAD_ERROR; \ + } + +static int loaded = 0; + +int load_libclblast(void) { + void *lib; + + if (loaded) + return GA_NO_ERROR; + + lib = ga_load_library(libname); + if (lib == NULL) + return GA_LOAD_ERROR; + + #include "libclblast.fn" + + loaded = 1; + return GA_NO_ERROR; +} diff --git 
a/src/loaders/libclblast.fn b/src/loaders/libclblast.fn new file mode 100644 index 0000000000..8df16ad784 --- /dev/null +++ b/src/loaders/libclblast.fn @@ -0,0 +1,9 @@ +DEF_PROC(StatusCode, clblasHgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_half alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_half beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); +DEF_PROC(StatusCode, clblasSgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); +DEF_PROC(StatusCode, CLBlastDgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); +DEF_PROC(StatusCode, CLBlastHgemv, (Layout order, Transpose transA, size_t M, size_t N, cl_half alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_half beta, cl_mem y, size_t offy, int incy, cl_command_queue *queue, cl_event *event)); +DEF_PROC(StatusCode, CLBlastSgemv, (Layout order, Transpose transA, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_command_queue *queue, cl_event *event)); +DEF_PROC(StatusCode, CLBlastDgemv, (Layout order, Transpose transA, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_command_queue *queue, cl_event *events)); +DEF_PROC(StatusCode, CLBlastHger, (Layout order, size_t M, size_t N, 
cl_half alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_command_queue *queue, cl_event *event)); +DEF_PROC(StatusCode, CLBlastSger, (Layout order, size_t M, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_command_queue *queue, cl_event *event)); +DEF_PROC(StatusCode, CLBlastDger, (Layout order, size_t M, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_command_queue *queue, cl_event *event)); diff --git a/src/loaders/libclblast.h b/src/loaders/libclblast.h new file mode 100644 index 0000000000..d9cc527cfe --- /dev/null +++ b/src/loaders/libclblast.h @@ -0,0 +1,36 @@ +#ifndef LOADER_LIBCLBLAST_H +#define LOADER_LIBCLBLAST_H + +#include "libopencl.h" + +typedef enum Layout_ { + kRowMajor = 101, + kColumnMajor = 102 +} Layout; + +typedef enum Transpose_ { + kNo = 111, + kYes = 112, + kConjugate = 113 +} Transpose; + +typedef enum StatusCode_ { + kSuccess = 0, + /* Rest is not exposed from here */ +} StatusCode; + +int load_libclblast(void); + +#define DEF_PROC(ret, name, args) typedef ret t##name args + +#include "libclblast.fn" + +#undef DEF_PROC + +#define DEF_PROC(ret, name, args) extern t##name *name + +#include "libclblast.fn" + +#undef DEF_PROC + +#endif From e6685a07632250cbd12a5fdd7739e7162bbbc8dc Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 14 Nov 2016 14:57:10 -0500 Subject: [PATCH 081/597] Cleanup errors from rebase. 
--- src/CMakeLists.txt | 1 + src/gpuarray_blas_opencl_clblast.c | 38 ++++++++++-------------------- src/gpuarray_buffer_cuda.c | 2 +- src/gpuarray_buffer_opencl.c | 4 ++-- src/loaders/libclblast.fn | 4 ++-- src/loaders/libclblast.h | 2 +- src/loaders/libcuda.fn | 5 ++++ src/loaders/libcuda.h | 11 +++++++++ src/loaders/libopencl.h | 4 ++++ src/private.h | 5 +--- 10 files changed, 41 insertions(+), 35 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a50ee0d477..624a856a91 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -53,6 +53,7 @@ gpuarray_blas_cuda_cublas.c gpuarray_collectives_cuda_nccl.c gpuarray_buffer_opencl.c gpuarray_blas_opencl_clblas.c +gpuarray_blas_opencl_clblast.c ) check_function_exists(strlcat HAVE_STRL) diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c index 91ad847693..28b164799f 100644 --- a/src/gpuarray_blas_opencl_clblast.c +++ b/src/gpuarray_blas_opencl_clblast.c @@ -1,7 +1,7 @@ #include "private.h" #include "private_opencl.h" -#include "loader/libclblast.h" +#include "loaders/libclblast.h" #include "gpuarray/buffer_blas.h" #include "gpuarray/error.h" @@ -60,7 +60,6 @@ static int hgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, cl_ctx *ctx = A[0]->ctx; cl_event ev; size_t i; - cl_uint num_ev = 0; StatusCode err; for (i = 0; i < batchCount; i++) { @@ -69,7 +68,7 @@ static int hgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, ARRAY_INIT(C[i]); err = CLBlastHgemm(convO(order), convT(transA), convT(transB), M, N, K, float_to_half(alpha), A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, - float_to_half(beta), C[i]->buf, offB[i], ldc, 1, &ctx->q, &ev); + float_to_half(beta), C[i]->buf, offB[i], ldc, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; ARRAY_FINI(A[i]); @@ -90,7 +89,6 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, cl_ctx *ctx = A[0]->ctx; cl_event ev; size_t i; - cl_uint num_ev = 0; 
StatusCode err; for (i = 0; i < batchCount; i++) { @@ -99,7 +97,7 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, ARRAY_INIT(C[i]); err = CLBlastSgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, - beta, C[i]->buf, offB[i], ldc, 1, &ctx->q, &ev); + beta, C[i]->buf, offB[i], ldc, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; ARRAY_FINI(A[i]); @@ -120,7 +118,6 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, cl_ctx *ctx = A[0]->ctx; cl_event ev; size_t i; - cl_uint num_ev = 0; StatusCode err; for (i = 0; i < batchCount; i++) { @@ -129,7 +126,7 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, ARRAY_INIT(C[i]); err = CLBlastDgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, - beta, C[i]->buf, offB[i], ldc, 1, &ctx->q, &ev); + beta, C[i]->buf, offB[i], ldc, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; ARRAY_FINI(A[i]); @@ -198,7 +195,6 @@ static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; StatusCode err; - cl_uint num_ev = 0; cl_event ev; ARRAY_INIT(A); @@ -207,7 +203,7 @@ static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, err = CLBlastHgemv(convO(order), convT(transA), M, N, float_to_half(alpha), A->buf, offA, lda, X->buf, offX, incX, - float_to_half(beta), Y->buf, offY, incY, 1, &ctx->q, &ev); + float_to_half(beta), Y->buf, offY, incY, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; @@ -226,7 +222,6 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; StatusCode err; - cl_uint num_ev = 0; cl_event ev; ARRAY_INIT(A); @@ -235,7 +230,7 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, err = 
CLBlastSgemv(convO(order), convT(transA), M, N, alpha, A->buf, offA, lda, X->buf, offX, incX, - beta, Y->buf, offY, incY, 1, &ctx->q, &ev); + beta, Y->buf, offY, incY, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; @@ -254,7 +249,6 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; StatusCode err; - cl_uint num_ev = 0; cl_event ev; ARRAY_INIT(A); @@ -263,7 +257,7 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, err = CLBlastDgemv(convO(order), convT(transA), M, N, alpha, A->buf, offA, lda, X->buf, offX, incX, - beta, Y->buf, offY, incY, 1, &ctx->q, &ev); + beta, Y->buf, offY, incY, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; @@ -283,7 +277,6 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, gpudata *C, size_t offC, size_t ldc) { cl_ctx *ctx = A->ctx; StatusCode err; - cl_uint num_ev = 0; cl_event ev; ARRAY_INIT(A); @@ -292,7 +285,7 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, err = CLBlastHgemm(convO(order), convT(transA), convT(transB), M, N, K, float_to_half(alpha), A->buf, offA, lda, B->buf, offB, ldb, - float_to_half(beta), C->buf, offC, ldc, 1, &ctx->q, &ev); + float_to_half(beta), C->buf, offC, ldc, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; @@ -312,7 +305,6 @@ static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB, gpudata *C, size_t offC, size_t ldc) { cl_ctx *ctx = A->ctx; StatusCode err; - cl_uint num_ev = 0; cl_event ev; ARRAY_INIT(A); @@ -321,7 +313,7 @@ static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB, err = CLBlastSgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A->buf, offA, lda, B->buf, offB, ldb, - beta, C->buf, offC, ldc, 1, &ctx->q, &ev); + beta, C->buf, offC, ldc, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; @@ -341,7 +333,6 @@ static int dgemm(cb_order order, 
cb_transpose transA, cb_transpose transB, gpudata *C, size_t offC, size_t ldc) { cl_ctx *ctx = A->ctx; StatusCode err; - cl_uint num_ev = 0; cl_event ev; ARRAY_INIT(A); @@ -350,7 +341,7 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, err = CLBlastDgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A->buf, offA, lda, B->buf, offB, ldb, - beta, C->buf, offC, ldc, 1, &ctx->q, &ev); + beta, C->buf, offC, ldc, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; @@ -369,7 +360,6 @@ static int hger(cb_order order, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event ev; - cl_uint num_ev = 0; StatusCode err; ARRAY_INIT(X); @@ -377,7 +367,7 @@ static int hger(cb_order order, size_t M, size_t N, float alpha, ARRAY_INIT(A); err = CLBlastHger(convO(order), M, N, float_to_half(alpha), X->buf, offX, incX, - Y->buf, offY, incY, A->buf, offA, lda, 1, &ctx->q, &ev); + Y->buf, offY, incY, A->buf, offA, lda, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; @@ -396,7 +386,6 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event ev; - cl_uint num_ev = 0; StatusCode err; ARRAY_INIT(X); @@ -404,7 +393,7 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, ARRAY_INIT(A); err = CLBlastSger(convO(order), M, N, alpha, X->buf, offX, incX, - Y->buf, offY, incY, A->buf, offA, lda, 1, &ctx->q, &ev); + Y->buf, offY, incY, A->buf, offA, lda, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; @@ -423,7 +412,6 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event ev; - cl_uint num_ev = 0; StatusCode err; ARRAY_INIT(X); @@ -431,7 +419,7 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, ARRAY_INIT(A); err = CLBlastDger(convO(order), M, N, alpha, X->buf, offX, incX, - Y->buf, offY, incY, 
A->buf, offA, lda, 1, &ctx->q, &ev); + Y->buf, offY, incY, A->buf, offA, lda, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index aaf2c66ce1..ff13af3bd1 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -75,7 +75,7 @@ static int setup_lib(void) { return GA_IMPL_ERROR; ver = getenv("GPUARRAY_CUDA_VERSION"); if (ver == NULL || strlen(ver) != 2) { - err = gcuDriverGetVersion(&tmp); + err = cuDriverGetVersion(&tmp); if (err != CUDA_SUCCESS) return GA_IMPL_ERROR; major = tmp / 1000; diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index ee646a8985..a9390eefe4 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -12,8 +12,8 @@ #include #include -#include "loader/libclblas.h" -#include "loader/libclblast.h" +#include "loaders/libclblas.h" +#include "loaders/libclblast.h" #ifdef _MSC_VER #define strdup _strdup diff --git a/src/loaders/libclblast.fn b/src/loaders/libclblast.fn index 8df16ad784..544c164e0c 100644 --- a/src/loaders/libclblast.fn +++ b/src/loaders/libclblast.fn @@ -1,5 +1,5 @@ -DEF_PROC(StatusCode, clblasHgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_half alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_half beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); -DEF_PROC(StatusCode, clblasSgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); +DEF_PROC(StatusCode, CLBlastHgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_half alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_half beta, 
cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); +DEF_PROC(StatusCode, CLBlastSgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); DEF_PROC(StatusCode, CLBlastDgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); DEF_PROC(StatusCode, CLBlastHgemv, (Layout order, Transpose transA, size_t M, size_t N, cl_half alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_half beta, cl_mem y, size_t offy, int incy, cl_command_queue *queue, cl_event *event)); DEF_PROC(StatusCode, CLBlastSgemv, (Layout order, Transpose transA, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_command_queue *queue, cl_event *event)); diff --git a/src/loaders/libclblast.h b/src/loaders/libclblast.h index d9cc527cfe..a507b37d2d 100644 --- a/src/loaders/libclblast.h +++ b/src/loaders/libclblast.h @@ -5,7 +5,7 @@ typedef enum Layout_ { kRowMajor = 101, - kColumnMajor = 102 + kColMajor = 102 } Layout; typedef enum Transpose_ { diff --git a/src/loaders/libcuda.fn b/src/loaders/libcuda.fn index 05d3e84ad0..4a22853442 100644 --- a/src/loaders/libcuda.fn +++ b/src/loaders/libcuda.fn @@ -6,6 +6,7 @@ DEF_PROC(cuDeviceGet, (CUdevice *device, int ordinal)); DEF_PROC(cuDeviceGetCount, (int *count)); DEF_PROC(cuDeviceGetName, (char *name, int len, CUdevice dev)); DEF_PROC(cuDeviceGetAttribute, (int *pi, CUdevice_attribute attrib, CUdevice dev)); +DEF_PROC(cuDeviceGetPCIBusId, (char 
*pciBusId, int len, CUdevice dev)); DEF_PROC(cuDevicePrimaryCtxGetState, (CUdevice dev, unsigned int *flags, int *active)); DEF_PROC(cuDevicePrimaryCtxSetFlags, (CUdevice dev, unsigned int flags)); @@ -46,3 +47,7 @@ DEF_PROC(cuStreamCreate, (CUstream *phStream, unsigned int Flags)); DEF_PROC(cuStreamWaitEvent, (CUstream hStream, CUevent hEvent, unsigned int Flags)); DEF_PROC(cuStreamSynchronize, (CUstream hStream)); DEF_PROC_V2(cuStreamDestroy, (CUstream hStream)); + +DEF_PROC(cuIpcGetMemHandle, (CUipcMemHandle *pHandle, CUdeviceptr dptr)); +DEF_PROC(cuIpcOpenMemHandle, (CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags)); +DEF_PROC(cuIpcCloseMemHandle, (CUdeviceptr dptr)); \ No newline at end of file diff --git a/src/loaders/libcuda.h b/src/loaders/libcuda.h index 0fdae34e53..865d86b560 100644 --- a/src/loaders/libcuda.h +++ b/src/loaders/libcuda.h @@ -28,6 +28,13 @@ typedef enum CUdevice_attribute_enum CUdevice_attribute; typedef enum CUfunction_attribute_enum CUfunction_attribute; typedef enum CUevent_flags_enum CUevent_flags; typedef enum CUctx_flags_enum CUctx_flags; +typedef enum CUipcMem_flags_enum CUipcMem_flags; + +#define CU_IPC_HANDLE_SIZE 64 + +typedef struct CUipcMemHandle_st { + char reserved[CU_IPC_HANDLE_SIZE]; +} CUipcMemHandle; int load_libcuda(void); @@ -173,4 +180,8 @@ enum CUctx_flags_enum { CU_CTX_MAP_HOST = 0x08, }; +enum CUipcMem_flags_enum { + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 +}; + #endif diff --git a/src/loaders/libopencl.h b/src/loaders/libopencl.h index 8aacb77af6..fd2a04564e 100644 --- a/src/loaders/libopencl.h +++ b/src/loaders/libopencl.h @@ -23,6 +23,8 @@ typedef signed __int32 cl_int; typedef unsigned __int32 cl_uint; typedef signed __int64 cl_long; typedef unsigned __int64 cl_ulong; + +typedef unsigned __int16 cl_half; typedef float cl_float; typedef double cl_double; #else @@ -31,6 +33,8 @@ typedef int32_t cl_int __attribute__((aligned(4))); typedef uint32_t cl_uint __attribute__((aligned(4))); typedef int64_t 
cl_long __attribute__((aligned(8))); typedef uint64_t cl_ulong __attribute__((aligned(8))); + +typedef uint16_t cl_half __attribute__((aligned(2))); typedef float cl_float __attribute__((aligned(4))); typedef double cl_double __attribute__((aligned(8))); #endif diff --git a/src/private.h b/src/private.h index 730549e841..e9f3269dcb 100644 --- a/src/private.h +++ b/src/private.h @@ -264,7 +264,7 @@ GPUARRAY_LOCAL void gpukernel_source_with_line_numbers(unsigned int count, size_t *newl, strb *src); -static uint16_t float_to_half(float value) { +static inline uint16_t float_to_half(float value) { static const int shift = 13; static const int shiftSign = 16; @@ -277,13 +277,10 @@ static uint16_t float_to_half(float value) { static const int32_t nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32 static const int32_t maxC = maxN >> shift; static const int32_t minC = minN >> shift; - static const int32_t signC = signN >> shiftSign; // flt16 sign bit static const int32_t mulN = 0x52000000; // (1 << 23) / minN - static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) static const int32_t subC = 0x003FF; // max flt32 subnormal down shifted - static const int32_t norC = 0x00400; // min flt32 normal down shifted static const int32_t maxD = infC - maxC - 1; static const int32_t minD = minC - subC - 1; From 4b8632b8cbc0019227f06763639319775771c5e4 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 14 Nov 2016 17:08:43 -0500 Subject: [PATCH 082/597] Fix error path in setup_lib() for cuda. 
--- src/gpuarray_buffer_cuda.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index ff13af3bd1..e90983929a 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -69,7 +69,7 @@ static int setup_lib(void) { if (!setup_done) { res = load_libcuda(); if (res != GA_NO_ERROR) - return err; + return res; err = cuInit(0); if (err != CUDA_SUCCESS) return GA_IMPL_ERROR; @@ -88,7 +88,7 @@ static int setup_lib(void) { return GA_VALUE_ERROR; res = load_libnvrtc(major, minor); if (res != GA_NO_ERROR) - return err; + return res; setup_done = 1; } return GA_NO_ERROR; From ee12e30d845487b6c8bd8edb9184d8525e40bf97 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 14 Nov 2016 17:09:07 -0500 Subject: [PATCH 083/597] Add debugging message when in debug mode. --- src/loaders/dyn_load.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/loaders/dyn_load.c b/src/loaders/dyn_load.c index 6742ace487..2d4121fce2 100644 --- a/src/loaders/dyn_load.c +++ b/src/loaders/dyn_load.c @@ -1,15 +1,27 @@ #include "dyn_load.h" -#ifdef __unix__ +#if defined(__unix__) || defined(__APPLE__) #include +#include +#include void *ga_load_library(const char *name) { - return dlopen(name, RTLD_LAZY|RTLD_LOCAL); + void *res = dlopen(name, RTLD_LAZY|RTLD_LOCAL); +#ifdef DEBUG + if (res == NULL) + warn("dlopen: %s", name); +#endif + return res; } void *ga_func_ptr(void *h, const char *name) { - return dlsym(h, name); + void *res = dlsym(h, name); +#ifdef DEBUG + if (res == NULL) + warn("dlsym: %s", name); +#endif + return res; } #else From 0b935e841bbdaae8956f05bd238facd7a6ebc073 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 14 Nov 2016 17:22:58 -0500 Subject: [PATCH 084/597] Adjust things so we can load cuda libraries on macOS. 
--- src/loaders/libcublas.c | 11 +++++++++++ src/loaders/libcuda.c | 4 ++++ src/loaders/libnccl.c | 16 ++++++++++------ src/loaders/libnvrtc.c | 12 ++++++++++++ 4 files changed, 37 insertions(+), 6 deletions(-) diff --git a/src/loaders/libcublas.c b/src/loaders/libcublas.c index dedfd20927..acc425479d 100644 --- a/src/loaders/libcublas.c +++ b/src/loaders/libcublas.c @@ -41,6 +41,7 @@ int load_libcublas(int major, int minor) { #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) { + static const char DIGITS[] = "0123456789"; char libname[] = "cublas64_??.dll"; libname[9] = DIGITS[major]; @@ -49,7 +50,17 @@ int load_libcublas(int major, int minor) { lib = ga_load_library(libname); } #else /* Unix */ +#ifdef __APPLE__ + { + static const char DIGITS[] = "0123456789"; + char libname[] = "/Developer/NVIDIA/CUDA-?.?/lib/libcublas.dylib"; + libname[23] = DIGITS[major]; + libname[25] = DIGITS[minor]; + lib = ga_load_library(libname); + } +#else lib = ga_load_library("libcublas.so"); +#endif #endif if (lib == NULL) return GA_LOAD_ERROR; diff --git a/src/loaders/libcuda.c b/src/loaders/libcuda.c index ab8d149c89..448791a678 100644 --- a/src/loaders/libcuda.c +++ b/src/loaders/libcuda.c @@ -8,8 +8,12 @@ #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) static char libname[] = "nvcuda.dll"; #else /* Unix */ +#ifdef __APPLE__ +static char libname[] = "CUDA.framework/CUDA"; +#else static char libname[] = "libcuda.so"; #endif +#endif #define DEF_PROC(name, args) t##name *name #define DEF_PROC_V2(name, args) DEF_PROC(name, args) diff --git a/src/loaders/libnccl.c b/src/loaders/libnccl.c index 0b68759844..341be93451 100644 --- a/src/loaders/libnccl.c +++ b/src/loaders/libnccl.c @@ -4,18 +4,21 @@ #include "dyn_load.h" #include "gpuarray/error.h" -#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) -static const char libname[] = "nccl.dll"; -#else /* Unix */ -static const char libname[] = "libnccl.so"; -#endif - 
#define DEF_PROC(ret, name, args) t##name *name #include "libnccl.fn" #undef DEF_PROC +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) || defined(__APPLE__) +/* As far as we know, nccl is not available or buildable on platforms + other than linux */ +int load_libnccl(void) { + return GA_UNSUPPORTED_ERROR; +} +#else /* Unix */ +static const char libname[] = "libnccl.so"; + #define DEF_PROC(ret, name, args) \ name = (t##name *)ga_func_ptr(lib, #name); \ if (name == NULL) { \ @@ -39,3 +42,4 @@ int load_libnccl(void) { loaded = 1; return GA_NO_ERROR; } +#endif diff --git a/src/loaders/libnvrtc.c b/src/loaders/libnvrtc.c index 89e14cad09..5d967f8a98 100644 --- a/src/loaders/libnvrtc.c +++ b/src/loaders/libnvrtc.c @@ -27,6 +27,7 @@ int load_libnvrtc(int major, int minor) { #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) { + static const char DIGITS[] = "0123456789"; char libname[] = "nvrtc64_??.dll"; libname[8] = DIGITS[major]; @@ -35,7 +36,18 @@ int load_libnvrtc(int major, int minor) { lib = ga_load_library(libname); } #else /* Unix */ +#ifdef __APPLE__ + { + static const char DIGITS[] = "0123456789"; + /* Try the usual fullpath first */ + char libname[] = "/Developer/NVIDIA/CUDA-?.?/lib/libnvrtc.dylib"; + libname[23] = DIGITS[major]; + libname[25] = DIGITS[minor]; + lib = ga_load_library(libname); + } +#else lib = ga_load_library("libnvrtc.so"); +#endif #endif if (lib == NULL) return GA_LOAD_ERROR; From 9e311de3f1645590c8d6cc5601ca84263e330f5a Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 14 Nov 2016 17:37:12 -0500 Subject: [PATCH 085/597] Fix dummy kernel for driver that require at least one argument. 
--- src/gpuarray_buffer_opencl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index a9390eefe4..2a6833649e 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -121,7 +121,7 @@ cl_ctx *cl_make_ctx(cl_context ctx, int flags) { int e = 0; size_t warp_size; int ret; - const char dummy_kern[] = "__kernel void kdummy() {}\n"; + const char dummy_kern[] = "__kernel void kdummy(float f) {}\n"; strb context_preamble = STRB_STATIC_INIT; const char *rlk[1]; gpukernel *m; From d70525a2af6af83635f50f3f81e71c910912ddcb Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 14 Nov 2016 18:18:08 -0500 Subject: [PATCH 086/597] Adjust the loaders for macOS. --- src/loaders/libclblas.c | 4 ++++ src/loaders/libclblast.c | 8 ++++++-- src/loaders/libopencl.c | 4 ++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/loaders/libclblas.c b/src/loaders/libclblas.c index 00b0ead5a4..cdb17fa39b 100644 --- a/src/loaders/libclblas.c +++ b/src/loaders/libclblas.c @@ -7,8 +7,12 @@ #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) static const char libname[] = "clBLAS.dll"; #else /* Unix */ +#ifdef __APPLE__ +static const char libname[] = "libclBLAS.dylib"; +#else static const char libname[] = "libclBLAS.so"; #endif +#endif #define DEF_PROC(ret, name, args) t##name *name diff --git a/src/loaders/libclblast.c b/src/loaders/libclblast.c index 1a9ba5715d..1bb4cc9999 100644 --- a/src/loaders/libclblast.c +++ b/src/loaders/libclblast.c @@ -5,9 +5,13 @@ #include "gpuarray/error.h" #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) -static const char libname[] = "clBLAST.dll"; +static const char libname[] = "clblast.dll"; #else /* Unix */ -static const char libname[] = "libclBLAST.so"; +#ifdef __APPLE__ +static const char libname[] = "libclblast.dylib"; +#else +static const char libname[] = "libclblast.so"; +#endif #endif #define 
DEF_PROC(ret, name, args) t##name *name diff --git a/src/loaders/libopencl.c b/src/loaders/libopencl.c index ec1f89614a..c3e11d3c0c 100644 --- a/src/loaders/libopencl.c +++ b/src/loaders/libopencl.c @@ -7,8 +7,12 @@ #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) static char libname[] = "OpenCL.dll"; #else /* Unix */ +#ifdef __APPLE__ +static char libname[] = "OpenCL.framework/OpenCL"; +#else static char libname[] = "libOpenCL.so"; #endif +#endif #define DEF_PROC(ret, name, args) t##name *name From 0cd6f1b7c7507d6b40d2f3f8dcf3dc4f49158221 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 15 Nov 2016 16:47:39 -0500 Subject: [PATCH 087/597] Remove trailing whitespace (Atom insists) --- src/util/integerfactoring.c | 126 ++++++++++++++++++------------------ src/util/integerfactoring.h | 5 +- 2 files changed, 65 insertions(+), 66 deletions(-) diff --git a/src/util/integerfactoring.c b/src/util/integerfactoring.c index fdd7c76875..c11e68309f 100644 --- a/src/util/integerfactoring.c +++ b/src/util/integerfactoring.c @@ -42,11 +42,11 @@ static int gaIClz(uint64_t n); /** * @brief Integer Modular Addition. - * + * * Computes - * + * * $$a+b \pmod m$$ - * + * * efficiently for 64-bit unsigned integers a, b, m. */ @@ -54,11 +54,11 @@ static uint64_t gaIAddMod (uint64_t a, uint64_t b, uint64_t m); /** * @brief Integer Modular Subtraction. - * + * * Computes - * + * * $$a-b \pmod m$$ - * + * * efficiently for 64-bit unsigned integers a, b, m. */ @@ -66,11 +66,11 @@ static uint64_t gaISubMod (uint64_t a, uint64_t b, uint64_t m); /** * @brief Integer Modular Average. - * + * * Computes - * + * * $$\frac{a+b}{2} \pmod m$$ - * + * * efficiently for 64-bit unsigned integers a, b, m. */ @@ -102,11 +102,11 @@ static uint64_t gaIPowMod (uint64_t x, uint64_t a, uint64_t m); /** * @brief Jacobi Symbol - * + * * Computes the Jacobi symbol, notated - * + * * $$(a/n)$$ - * + * * efficiently for 64-bit unsigned integers a, n. 
*/ @@ -114,7 +114,7 @@ static int gaIJacobiSymbol(uint64_t a, uint64_t n); /** * @brief Strong Fermat base-a probable prime test. - * + * * @param [in] n An odd integer >= 3. * @param [in] a A witness integer > 0. * @return Non-zero if n is a strong probable prime to base a and zero if n is @@ -125,9 +125,9 @@ static int gaIIsPrimeStrongFermat(uint64_t n, uint64_t a); /** * @brief Strong Lucas probable prime test. - * + * * The function uses Selfridge's Method A for selecting D,P,Q. - * + * * @param [in] n An odd integer >= 3. * @return Non-zero if n is a strong probable prime and zero if n is composite. */ @@ -224,7 +224,7 @@ static int gaIClz (uint64_t n){ static uint64_t gaIAddMod (uint64_t a, uint64_t b, uint64_t m){ a %= m; b %= m; - + if(m-a > b){ return a+b; }else{ @@ -235,7 +235,7 @@ static uint64_t gaIAddMod (uint64_t a, uint64_t b, uint64_t m){ static uint64_t gaISubMod (uint64_t a, uint64_t b, uint64_t m){ a %= m; b %= m; - + if(a >= b){ return a-b; }else{ @@ -245,7 +245,7 @@ static uint64_t gaISubMod (uint64_t a, uint64_t b, uint64_t m){ static uint64_t gaIAvgMod (uint64_t a, uint64_t b, uint64_t m){ uint64_t s = gaIAddMod(a,b,m); - + if(s&1){ return (s>>1)+(m>>1)+(s&m&1); }else{ @@ -389,20 +389,20 @@ static uint64_t gaIPowMod (uint64_t x, uint64_t a, uint64_t m){ static int gaIJacobiSymbol(uint64_t a, uint64_t n){ int s=0; uint64_t e, a1, n1; - + a %= n; - + if(a == 1 || n == 1){ return 1; } - + if(a == 0){ return 0; } - + e = gaICtz(a); a1 = a >> e; - + if(e%2 == 0){ s = 1; }else if(n%8 == 1 || n%8 == 7){ @@ -410,11 +410,11 @@ static int gaIJacobiSymbol(uint64_t a, uint64_t n){ }else if(n%8 == 3 || n%8 == 5){ s = -1; } - + if(n%4 == 3 && a1%4 == 3){ s = -s; } - + n1 = n%a1; return s*gaIJacobiSymbol(n1,a1); } @@ -426,32 +426,32 @@ static int gaIIsPrimeStrongFermat(uint64_t n, uint64_t a){ * Should it fail to prove an integer composite, it reports the number as * "probably prime". 
However, if the witnesses are chosen carefully, the * Miller-Rabin test can be made deterministic below a chosen threshold. - * + * * One can use the primes 2 to 37 in order to ensure the correctness of the * identifications for integers under 2^64. - * + * * Jim Sinclair has found that the seven witnesses * 2, 325, 9375, 28178, 450775, 9780504, 1795265022 * also deterministically classify all integers <2^64. - * - * + * + * * The Fermat strong probable prime test states that, for integers * n = d*2^s+1, d odd, s integer >= 0 * a integer (chosen witness) * n is a Fermat strong probable prime if * a^(d ) = 1 mod n or * a^(d*2^r) = -1 mod n for any integer r, 0 <= r < s. - * - * + * + * * The justification for this comes from Fermat's Little Theorem: If n is * prime and a is any integer, then the following always holds: * a^n = a mod n * If n is prime and a is coprime to n, then the following always holds: * a^(n-1) = 1 mod n - * - * + * + * * In effect, the logic goes - * + * * A: The number n is prime. (Statement) * B: The number n does not divide a. (Statement) * C: a^( n-1) = 1 mod n (Statement) @@ -466,7 +466,7 @@ static int gaIIsPrimeStrongFermat(uint64_t n, uint64_t a){ * L: a^(d*2^(r+1)) = 1 mod n for some 0 <= r < s. (Statement) * M: a^(d*2^r) != +-1 mod n AND (Statement) * a^(d*2^(r+1)) = 1 mod n for some 0 <= r < s. - * + * * A&B --> C (Proposition: Fermat's Little Theorem) * !C --> !(A&B) = !A|!B (Contrapositive: Fermat's Little Theorem) * A <-> D (Proposition) @@ -501,7 +501,7 @@ static int gaIIsPrimeStrongFermat(uint64_t n, uint64_t a){ * ***** Conclusions: ***** * H&I&M --> !A * H&I&!(J|K)&B --> !A - * + * * Broadly speaking, what the above tells us is: * - We can't prove n prime (A), but we can prove it composite (!A). * - Either H&I&M or H&I&!(J|K)&B prove compositeness. @@ -510,23 +510,23 @@ static int gaIIsPrimeStrongFermat(uint64_t n, uint64_t a){ * conclusions about the truth-value of A can be made. The test is * inconclusive. 
Thus this function returns "probably prime". */ - + uint64_t d, x; int64_t s, r; - + a %= n; if(a==0){ return GA_IS_PROBABLY_PRIME; } - + s = gaICtz(n-1); d = (n-1) >> s; x = gaIPowMod(a,d,n); - + if(x==1 || x==n-1){ return GA_IS_PROBABLY_PRIME; } - + for(r=0;r=0;i--){ Ut = gaIMulMod(U,V,n); Vt = gaIAvgMod(gaIMulMod(V,V,n), gaIMulMod(D,gaIMulMod(U,U,n),n), n); @@ -612,11 +612,11 @@ static int gaIIsPrimeStrongLucas(uint64_t n){ V = Vt; } } - + /** * 7. If U0==0, then return "probably prime". Otherwise, return "composite". */ - + return U==0 ? GA_IS_PROBABLY_PRIME : GA_IS_COMPOSITE; } @@ -761,7 +761,7 @@ int gaIFactorize (uint64_t n, uint64_t maxN, uint64_t k, ga_factor_list* fl /** * Master loop. - * + * * We arrive here with finite slack and all optimal 2-, 3- and 5-smooth * factorizers unable to produce a factorization whose product is less * than or equal to maxN. @@ -817,7 +817,7 @@ int gaIFactorize (uint64_t n, uint64_t maxN, uint64_t k, ga_factor_list* fl }else{ p = gaIFLGetProduct(fl); newX = n/p; - newX += newX*p < n; + newX += newX*p < n; if(newX < x){ x = newX; goto subfactorize; diff --git a/src/util/integerfactoring.h b/src/util/integerfactoring.h index 0687c49ee1..a143a33850 100644 --- a/src/util/integerfactoring.h +++ b/src/util/integerfactoring.h @@ -176,7 +176,7 @@ int gaIFLGetFactorPower(const ga_factor_list* fl, uint64_t f); /** * @brief Compute the product of the factors stored in the factors list. - * + * * NB: This function may return an overflowed result. To detect if it will, * please call gaIFLIsOverflowed(fl). */ @@ -203,7 +203,7 @@ uint64_t gaIFLGetSmallestFactor(const ga_factor_list* fl); /** * @brief Print out the factor list in a human-readable form, sprintf()-style. - * + * * @param [out] str A string into which to print out the factor list. If the * factor list is a result of gaIFactorize(), then the * maximum length of buffer required is 128 bytes. 
@@ -275,4 +275,3 @@ void gaISchedule (const int n, /* End Include Guards */ #endif - From 95b46306d6600dea9b115ee697536e17b477d636 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 15 Nov 2016 18:53:46 -0500 Subject: [PATCH 088/597] Fix build problems on windows. --- src/cache.h | 2 +- src/gen_types.py | 2 +- src/gpuarray_array.c | 4 +- src/gpuarray_reduction.c | 237 ++++++++++++++++++------------------ src/gpuarray_types.c | 2 +- src/loaders/dyn_load.c | 2 + src/private.h | 70 +++++++---- src/util/integerfactoring.c | 1 - src/util/integerfactoring.h | 2 +- 9 files changed, 172 insertions(+), 150 deletions(-) diff --git a/src/cache.h b/src/cache.h index 800208e04d..f2059e73cc 100644 --- a/src/cache.h +++ b/src/cache.h @@ -1,8 +1,8 @@ #ifndef CACHE_H #define CACHE_H -#include #include +#include #include "private_config.h" typedef void *cache_key_t; diff --git a/src/gen_types.py b/src/gen_types.py index c9e356e9a9..3e14c9a4f2 100644 --- a/src/gen_types.py +++ b/src/gen_types.py @@ -73,7 +73,7 @@ def add_type(name, C, sz): int16_t exp; uint16_t hi; uint32_t lo; - }; + } s; uint128_t raw; } u; } ga_quad; diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index fbd1aa777a..07e3a7fd9e 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -5,14 +5,12 @@ #include #include #include -#if _MSC_VER < 1600 -#include -#endif #include #include #include #include "private.h" +#include "gpuarray/config.h" #include "gpuarray/array.h" #include "gpuarray/error.h" #include "gpuarray/kernel.h" diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 0e6ba09749..15391bad69 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -29,12 +29,12 @@ struct maxandargmax_ctx{ const GpuArray* src; int reduxLen; const int* reduxList; - + /* General. */ int ret; int* axisList; gpucontext* gpuCtx; - + /* Source code Generator. 
*/ const char* dstMaxType; const char* dstArgmaxType; @@ -45,13 +45,13 @@ struct maxandargmax_ctx{ strb s; char* sourceCode; GpuKernel kernel; - + /* Scheduler */ int hwAxisList[3]; size_t blockSize [3]; size_t gridSize [3]; size_t chunkSize [3]; - + /* Invoker */ gpudata* srcStepsGD; gpudata* srcSizeGD; @@ -102,10 +102,15 @@ GPUARRAY_PUBLIC int GpuArray_maxandargmax (GpuArray* dstMax, const GpuArray* src, unsigned reduxLen, const unsigned* reduxList){ - maxandargmax_ctx ctxSTACK = {dstMax, dstArgmax, src, - (int)reduxLen, (const int*)reduxList}, - *ctx = &ctxSTACK; - + maxandargmax_ctx ctxSTACK = {0}; + maxandargmax_ctx *ctx = &ctxSTACK; + + ctxSTACK.dstMax = dstMax; + ctxSTACK.dstArgmax = dstArgmax; + ctxSTACK.src = src; + ctxSTACK.reduxLen = (int)reduxLen; + ctxSTACK.reduxList = (const int*)reduxList; + if(maxandargmaxCheckargs (ctx) == GA_NO_ERROR && maxandargmaxSelectHwAxes(ctx) == GA_NO_ERROR && maxandargmaxGenSource (ctx) == GA_NO_ERROR && @@ -120,7 +125,7 @@ GPUARRAY_PUBLIC int GpuArray_maxandargmax (GpuArray* dstMax, /** * @brief Check whether axis numbered v is already in the given set of axes. - * + * * @param [in] v * @param [in] set * @param [in] setLen @@ -133,22 +138,22 @@ static int axisInSet (int v, size_t setLen, size_t* where){ size_t i; - + for(i=0;iret = GA_NO_ERROR; ctx->axisList = NULL; ctx->gpuCtx = NULL; - + ctx->dstMaxType = ctx->dstArgmaxType = NULL; ctx->ndh = 0; - ctx->s = (strb)STRB_STATIC_INIT; ctx->sourceCode = NULL; - + ctx->hwAxisList[0] = ctx->hwAxisList[1] = ctx->hwAxisList[2] = 0; ctx->blockSize [0] = ctx->blockSize [1] = ctx->blockSize [2] = 1; ctx->gridSize [0] = ctx->gridSize [1] = ctx->gridSize [2] = 1; ctx->chunkSize [0] = ctx->chunkSize [1] = ctx->chunkSize [2] = 1; - + ctx->srcStepsGD = ctx->srcSizeGD = ctx->chunkSizeGD = ctx->dstMaxStepsGD = ctx->dstArgmaxStepsGD = NULL; - - + + /* Insane src or reduxLen? 
*/ if(!ctx->dstMax || !ctx->dstArgmax || !ctx->src || ctx->src->nd == 0 || ctx->reduxLen == 0 || ctx->reduxLen > (int)ctx->src->nd){ return ctx->ret=GA_INVALID_ERROR; } - + /* Insane or duplicate list entry? */ for(i=0;ireduxLen;i++){ if(ctx->reduxList[i] < 0 || @@ -229,30 +233,30 @@ static int maxandargmaxCheckargs (maxandargmax_ctx* ctx){ return ctx->ret=GA_INVALID_ERROR; } } - + /* Unknown type? */ ctx->dstMaxType = gpuarray_get_type(ctx->src->typecode)->cluda_name; ctx->dstArgmaxType = gpuarray_get_type(GA_SSIZE) ->cluda_name; if(!ctx->dstMaxType || !ctx->dstArgmaxType){ return ctx->ret=GA_INVALID_ERROR; } - + /* GPU context non-existent? */ ctx->gpuCtx = GpuArray_context(ctx->src); if(!ctx->gpuCtx){ return ctx->ret=GA_INVALID_ERROR; } - - + + /** * We initialize some more parts of the context, using the guarantees * we now have about the sanity of the arguments. */ - + ctx->nds = ctx->src->nd; ctx->ndr = ctx->reduxLen; ctx->ndd = ctx->nds - ctx->ndr; - + return ctx->ret; } @@ -264,17 +268,17 @@ static int maxandargmaxCheckargs (maxandargmax_ctx* ctx){ static int maxandargmaxSelectHwAxes (maxandargmax_ctx* ctx){ int i, j, maxI = 0; size_t maxV; - + ctx->ndh = ctx->ndd<3 ? ctx->ndd : 3; - + /** * The ctx->hwAxisLen largest axes are selected and assigned in * descending order to X, Y, Z. */ - + for(i=0;indh;i++){ maxV = 0; - + for(j=0;jnds;j++){ if(!axisInSet(j, ctx->hwAxisList, i, 0) && !axisInSet(j, ctx->reduxList, ctx->ndr, 0) && @@ -283,16 +287,16 @@ static int maxandargmaxSelectHwAxes (maxandargmax_ctx* ctx){ maxI = j; } } - + ctx->hwAxisList[i] = maxI; } - + return ctx->ret=GA_NO_ERROR; } /** * @brief Generate the kernel code for MaxAndArgmax. - * + * * @return GA_MEMORY_ERROR if not enough memory left; GA_NO_ERROR otherwise. */ @@ -303,7 +307,7 @@ static int maxandargmaxGenSource (maxandargmax_ctx* ctx){ return ctx->ret=GA_MEMORY_ERROR; } maxandargmaxComputeAxisList(ctx); - + /* Generate kernel proper. 
*/ strb_ensure(&ctx->s, 5*1024); maxandargmaxAppendKernel(ctx); @@ -313,7 +317,7 @@ static int maxandargmaxGenSource (maxandargmax_ctx* ctx){ if(!ctx->sourceCode){ return ctx->ret=GA_MEMORY_ERROR; } - + /* Return it. */ return ctx->ret=GA_NO_ERROR; } @@ -359,7 +363,7 @@ static void maxandargmaxAppendOffsets (maxandargmax_ctx* ctx){ static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx){ int i; strb_appends(&ctx->s, "\t/* GPU kernel coordinates. Always 3D. */\n"); - + strb_appends(&ctx->s, "\tX bi0 = GID_0, bi1 = GID_1, bi2 = GID_2;\n"); strb_appends(&ctx->s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n"); strb_appends(&ctx->s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n"); @@ -371,11 +375,11 @@ static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx){ i, i, (i==ctx->ndh-1) ? ";\n" : ", "); } } - + strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t/* Free indices & Reduction indices */\n"); - + if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "", ";\n");} if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Dim", ";\n");} if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Start", ";\n");} @@ -384,17 +388,17 @@ static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx){ if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "MStep", ";\n");} if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "AStep", ";\n");} if(ctx->nds > ctx->ndd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim", ";\n");} - + strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); } static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){ size_t hwDim; int i; - + /* Use internal remapping when computing the ranges for this thread. */ strb_appends(&ctx->s, "\t/* Compute ranges for this thread. 
*/\n"); - + for(i=0;inds;i++){ strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, ctx->axisList[i]); } @@ -412,7 +416,7 @@ static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){ * If this is the last index, it's the first cumulative dimension * product we generate, and thus we initialize to 1. */ - + if(i == ctx->nds-1){ strb_appendf(&ctx->s, "\ti%dPDim = 1;\n", i); }else{ @@ -424,7 +428,7 @@ static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){ * Up to 3 dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. */ - + if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){ strb_appendf(&ctx->s, "\ti%dStart = gi%d * ci%d;\n", i, hwDim, hwDim); }else{ @@ -436,14 +440,14 @@ static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){ * Up to 3 dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. */ - + if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){ strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + ci%d;\n", i, i, hwDim); }else{ strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + i%dDim;\n", i, i, i); } } - + strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); } @@ -452,60 +456,60 @@ static void maxandargmaxAppendLoops (maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "\t * FREE LOOPS.\n"); strb_appends(&ctx->s, "\t */\n"); strb_appends(&ctx->s, "\t\n"); - + maxandargmaxAppendLoopMacroDefs (ctx); maxandargmaxAppendLoopOuter (ctx); maxandargmaxAppendLoopMacroUndefs(ctx); } static void maxandargmaxAppendLoopMacroDefs (maxandargmax_ctx* ctx){ int i; - + /** * FOROVER Macro */ - + strb_appends(&ctx->s, "#define FOROVER(idx) for(i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n"); - + /** * ESCAPE Macro */ - + strb_appends(&ctx->s, "#define ESCAPE(idx) if(i##idx >= i##idx##Dim){continue;}\n"); - + /** * SRCINDEXER Macro */ - + appendIdxes (&ctx->s, "#define SRCINDEXER(", "i", 0, 
ctx->nds, "", ") (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)src + "); for(i=0;inds;i++){ strb_appendf(&ctx->s, "i%d*i%dSStep + \\\n ", i, i); } strb_appends(&ctx->s, "0))\n"); - + /** * RDXINDEXER Macro */ - + appendIdxes (&ctx->s, "#define RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ") ("); for(i=ctx->ndd;inds;i++){ strb_appendf(&ctx->s, "i%d*i%dPDim + \\\n ", i, i); } strb_appends(&ctx->s, "0)\n"); - + /** * DSTMINDEXER Macro */ - + appendIdxes (&ctx->s, "#define DSTMINDEXER(", "i", 0, ctx->ndd, "", ") (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)dstMax + "); for(i=0;indd;i++){ strb_appendf(&ctx->s, "i%d*i%dMStep + \\\n ", i, i); } strb_appends(&ctx->s, "0))\n"); - + /** * DSTAINDEXER Macro */ - + appendIdxes (&ctx->s, "#define DSTAINDEXER(", "i", 0, ctx->ndd, "", ") (*(GLOBAL_MEM X*)((GLOBAL_MEM char*)dstArgmax + "); for(i=0;indd;i++){ strb_appendf(&ctx->s, "i%d*i%dAStep + \\\n ", i, i); @@ -514,85 +518,85 @@ static void maxandargmaxAppendLoopMacroDefs (maxandargmax_ctx* ctx){ } static void maxandargmaxAppendLoopOuter (maxandargmax_ctx* ctx){ int i; - + /** * Outer Loop Header Generation */ - + for(i=0;indd;i++){ strb_appendf(&ctx->s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); } - + /** * Inner Loop Generation */ - + maxandargmaxAppendLoopInner(ctx); - + /** * Outer Loop Trailer Generation */ - + for(i=0;indd;i++){ strb_appends(&ctx->s, "\t}\n"); } } static void maxandargmaxAppendLoopInner (maxandargmax_ctx* ctx){ int i; - + /** * Inner Loop Prologue */ - + strb_appends(&ctx->s, "\t/**\n"); strb_appends(&ctx->s, "\t * Reduction initialization.\n"); strb_appends(&ctx->s, "\t */\n"); strb_appends(&ctx->s, "\t\n"); - + appendIdxes (&ctx->s, "\tT maxV = SRCINDEXER(", "i", 0, ctx->ndd, "", ""); if(ctx->ndd && ctx->ndr){strb_appends(&ctx->s, ",");} appendIdxes (&ctx->s, "", "i", ctx->ndd, ctx->nds, "Start", ");\n"); - + appendIdxes (&ctx->s, "\tX maxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "Start", ");\n"); - + strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t/**\n"); 
strb_appends(&ctx->s, "\t * REDUCTION LOOPS.\n"); strb_appends(&ctx->s, "\t */\n"); strb_appends(&ctx->s, "\t\n"); - + /** * Inner Loop Header Generation */ - + for(i=ctx->ndd;inds;i++){ strb_appendf(&ctx->s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); } - + /** * Inner Loop Body Generation */ - + appendIdxes (&ctx->s, "\tT V = SRCINDEXER(", "i", 0, ctx->nds, "", ");\n"); strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\tif(V > maxV){\n"); strb_appends(&ctx->s, "\t\tmaxV = V;\n"); appendIdxes (&ctx->s, "\t\tmaxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); strb_appends(&ctx->s, "\t}\n"); - + /** * Inner Loop Trailer Generation */ - + for(i=ctx->ndd;inds;i++){ strb_appends(&ctx->s, "\t}\n"); } strb_appends(&ctx->s, "\t\n"); - + /** * Inner Loop Epilogue Generation */ - + strb_appends(&ctx->s, "\t/**\n"); strb_appends(&ctx->s, "\t * Destination writeback.\n"); strb_appends(&ctx->s, "\t */\n"); @@ -610,7 +614,7 @@ static void maxandargmaxAppendLoopMacroUndefs (maxandargmax_ctx* ctx){ } static void maxandargmaxComputeAxisList (maxandargmax_ctx* ctx){ int i, f=0; - + for(i=0;inds;i++){ if(axisInSet(i, ctx->reduxList, ctx->ndr, 0)){ continue; @@ -622,7 +626,7 @@ static void maxandargmaxComputeAxisList (maxandargmax_ctx* ctx){ /** * @brief Compile the kernel from source code. 
- * + * * @return */ @@ -641,15 +645,15 @@ static int maxandargmaxCompile (maxandargmax_ctx* ctx){ GA_BUFFER /* dstArgmaxSteps */ }; const size_t ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES); - const char* SRCS[] = {ctx->sourceCode}; - const size_t SRC_LENS[] = {strlen(ctx->sourceCode)}; - const size_t SRCS_LEN = sizeof(SRCS)/sizeof(*SRCS); - + const char* SRCS[1]; + + SRCS[0] = ctx->sourceCode; + ctx->ret = GpuKernel_init(&ctx->kernel, ctx->gpuCtx, - SRCS_LEN, + 1, SRCS, - SRC_LENS, + NULL, "maxandargmax", ARG_TYPECODES_LEN, ARG_TYPECODES, @@ -657,7 +661,7 @@ static int maxandargmaxCompile (maxandargmax_ctx* ctx){ (char**)0); free(ctx->sourceCode); ctx->sourceCode = NULL; - + return ctx->ret; } @@ -679,12 +683,12 @@ static int maxandargmaxSchedule (maxandargmax_ctx* ctx){ ga_factor_list factBS[3]; ga_factor_list factGS[3]; ga_factor_list factCS[3]; - - + + /** * Obtain the constraints of our problem. */ - + size_t warpSize, maxL, maxL0, maxL1, maxL2, /* Maximum total and per-dimension thread/block sizes */ maxG, maxG0, maxG1, maxG2; /* Maximum total and per-dimension block /grid sizes */ @@ -697,75 +701,75 @@ static int maxandargmaxSchedule (maxandargmax_ctx* ctx){ gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE0, &maxG0); gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE1, &maxG1); gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE2, &maxG2); - + /** * Prepare inputs to the solver. - * + * * This involves, amongst others, * - Initializing the blockSize, gridSize and chunkSize factor lists for all * hardware dimensions. * - Finding on which hardware axis is it optimal to place the warpSize factor. 
*/ - + maxLg = maxL; maxLs[0] = maxL0, maxLs[1]=maxL1, maxLs[2]=maxL2; maxGg = maxG; maxGs[0] = maxG0, maxGs[1]=maxG1, maxGs[2]=maxG2; dims[0] = dims[1] = dims[2] = 1; slack[0] = slack[1] = slack[2] = 1.1; - + for(i=0;indh;i++){ dims[i] = ctx->src->dimensions[ctx->hwAxisList[i]]; gaIFLInit(&factBS[i]); gaIFLInit(&factGS[i]); gaIFLInit(&factCS[i]); - + warpMod = dims[i]%warpSize; if(bestWarpMod>0 && (warpMod==0 || warpMod>=bestWarpMod)){ bestWarpAxis = i; bestWarpMod = warpMod; } } - + if(ctx->ndh > 0){ dims[bestWarpAxis] = (dims[bestWarpAxis] + warpSize - 1)/warpSize; gaIFactorize(warpSize, 0, 0, &factBS[bestWarpAxis]); } - + /** * Factorization job. We'll steadily increase the slack in case of failure * in order to ensure we do get a factorization, which we place into * chunkSize. */ - + for(i=0;indh;i++){ - while(!gaIFactorize(dims[i], dims[i]*slack[i], maxLs[i], &factCS[i])){ + while(!gaIFactorize(dims[i], (uint64_t)(dims[i]*slack[i]), maxLs[i], &factCS[i])){ /** * Error! Failed to factorize dimension i with given slack and * k-smoothness constraints! Increase slack. Once slack reaches * 2.0 it will factorize guaranteed. */ - + slack[i] += 0.1; } } - + /** * Invoke the scheduler. - * + * * The scheduler will move some factors from chunkSize into blockSize and * gridSize, improving performance. */ - + gaIFLSchedule(ctx->ndh, maxLg, maxLs, maxGg, maxGs, factBS, factGS, factCS); - + /* Output. */ for(i=0;indh;i++){ ctx->blockSize[i] = gaIFLGetProduct(&factBS[i]); ctx->gridSize [i] = gaIFLGetProduct(&factGS[i]); ctx->chunkSize[i] = gaIFLGetProduct(&factCS[i]); } - + /* Return. */ return ctx->ret=GA_NO_ERROR; } @@ -776,11 +780,11 @@ static int maxandargmaxSchedule (maxandargmax_ctx* ctx){ static int maxandargmaxInvoke (maxandargmax_ctx* ctx){ void* args[11]; - + /** * Argument Marshalling. This the grossest gross thing in here. 
*/ - + const int flags = GA_BUFFER_READ_ONLY|GA_BUFFER_INIT; ctx->srcStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), ctx->src->strides, flags, 0); @@ -803,7 +807,7 @@ static int maxandargmaxInvoke (maxandargmax_ctx* ctx){ args[ 8] = (void*) ctx->dstArgmax->data; args[ 9] = (void*)&ctx->dstArgmax->offset; args[10] = (void*) ctx->dstArgmaxStepsGD; - + if(ctx->srcStepsGD && ctx->srcSizeGD && ctx->chunkSizeGD && @@ -818,13 +822,13 @@ static int maxandargmaxInvoke (maxandargmax_ctx* ctx){ }else{ ctx->ret = GA_MEMORY_ERROR; } - + gpudata_release(ctx->srcStepsGD); gpudata_release(ctx->srcSizeGD); gpudata_release(ctx->chunkSizeGD); gpudata_release(ctx->dstMaxStepsGD); gpudata_release(ctx->dstArgmaxStepsGD); - + return ctx->ret; } @@ -837,7 +841,6 @@ static int maxandargmaxCleanup (maxandargmax_ctx* ctx){ free(ctx->sourceCode); ctx->axisList = NULL; ctx->sourceCode = NULL; - + return ctx->ret; } - diff --git a/src/gpuarray_types.c b/src/gpuarray_types.c index 719d5b1910..01477a9336 100644 --- a/src/gpuarray_types.c +++ b/src/gpuarray_types.c @@ -40,7 +40,7 @@ typedef struct _quad { int16_t exp; uint16_t hi; uint32_t lo; - }; + } s; uint128_t raw; } u; } ga_quad; diff --git a/src/loaders/dyn_load.c b/src/loaders/dyn_load.c index 2d4121fce2..3586b72725 100644 --- a/src/loaders/dyn_load.c +++ b/src/loaders/dyn_load.c @@ -25,7 +25,9 @@ void *ga_func_ptr(void *h, const char *name) { } #else + /* Should be windows */ +#include void *ga_load_library(const char *name) { return LoadLibrary(name); diff --git a/src/private.h b/src/private.h index e9f3269dcb..c0e0d7c5d1 100644 --- a/src/private.h +++ b/src/private.h @@ -265,25 +265,25 @@ GPUARRAY_LOCAL void gpukernel_source_with_line_numbers(unsigned int count, strb *src); static inline uint16_t float_to_half(float value) { - static const int shift = 13; - static const int shiftSign = 16; +#define ga__shift 13 +#define ga__shiftSign 16 - static const int32_t infN = 0x7F800000; // flt32 infinity - static const int32_t 
maxN = 0x477FE000; // max flt16 normal as a flt32 - static const int32_t minN = 0x38800000; // min flt16 normal as a flt32 - static const int32_t signN = 0x80000000; // flt32 sign bit +#define ga__infN 0x7F800000 // flt32 infinity +#define ga__maxN 0x477FE000 // max flt16 normal as a flt32 +#define ga__minN 0x38800000 // min flt16 normal as a flt32 +#define ga__signN 0x80000000 // flt32 sign bit - static const int32_t infC = infN >> shift; - static const int32_t nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32 - static const int32_t maxC = maxN >> shift; - static const int32_t minC = minN >> shift; +#define ga__infC (ga__infN >> ga__shift) +#define ga__nanN ((ga__infC + 1) << ga__shift) // minimum flt16 nan as a flt32 +#define ga__maxC (ga__maxN >> ga__shift) +#define ga__minC (ga__minN >> ga__shift) - static const int32_t mulN = 0x52000000; // (1 << 23) / minN +#define ga__mulN 0x52000000 // (1 << 23) / minN - static const int32_t subC = 0x003FF; // max flt32 subnormal down shifted +#define ga__subC 0x003FF // max flt32 subnormal down shifted - static const int32_t maxD = infC - maxC - 1; - static const int32_t minD = minC - subC - 1; +#define ga__maxD (ga__infC - ga__maxC - 1) +#define ga__minD (ga__minC - ga__subC - 1) union { float f; @@ -294,18 +294,38 @@ static inline uint16_t float_to_half(float value) { uint32_t sign; v.f = value; - sign = v.si & signN; + sign = v.si & ga__signN; v.si ^= sign; - sign >>= shiftSign; // logical shift - s.si = mulN; - s.si = s.f * v.f; // correct subnormals - v.si ^= (s.si ^ v.si) & -(minN > v.si); - v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); - v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); - v.ui >>= shift; // logical shift - v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); - v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); - return v.ui | sign; + sign >>= ga__shiftSign; // logical shift + s.si = ga__mulN; + s.si = (int32_t)(s.f * v.f); // correct subnormals + v.si ^= (s.si ^ v.si) & 
-(ga__minN > v.si); + v.si ^= (ga__infN ^ v.si) & -((ga__infN > v.si) & (v.si > ga__maxN)); + v.si ^= (ga__nanN ^ v.si) & -((ga__nanN > v.si) & (v.si > ga__infN)); + v.ui >>= ga__shift; // logical shift + v.si ^= ((v.si - ga__maxD) ^ v.si) & -(v.si > ga__maxC); + v.si ^= ((v.si - ga__minD) ^ v.si) & -(v.si > ga__subC); + return (uint16_t)(v.ui | sign); + +#undef ga__shift +#undef ga__shiftSign + +#undef ga__infN +#undef ga__maxN +#undef ga__minN +#undef ga__signN + +#undef ga__infC +#undef ga__nanN +#undef ga__maxC +#undef ga__minC + +#undef ga__mulN + +#undef ga__subC + +#undef ga__maxD +#undef ga__minD } #define ISSET(v, fl) ((v) & (fl)) diff --git a/src/util/integerfactoring.c b/src/util/integerfactoring.c index c11e68309f..e58763761e 100644 --- a/src/util/integerfactoring.c +++ b/src/util/integerfactoring.c @@ -1,6 +1,5 @@ /* Includes */ #include -#include #include #include #include "integerfactoring.h" diff --git a/src/util/integerfactoring.h b/src/util/integerfactoring.h index a143a33850..4611ea4c87 100644 --- a/src/util/integerfactoring.h +++ b/src/util/integerfactoring.h @@ -5,7 +5,7 @@ /* Includes */ #include -#include +#include "gpuarray/config.h" #include "util/strb.h" From db9df087389976f002ac0ada8a41e1e2f315480a Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 15 Nov 2016 18:54:31 -0500 Subject: [PATCH 089/597] Fix order of api numbers for libcublas. 
--- src/gpuarray_blas_cuda_cublas.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 134c7438d3..1468edf76a 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -176,7 +176,7 @@ static int setup(gpucontext *c) { if (ctx->blas_handle != NULL) return GA_NO_ERROR; - e = load_libcublas(ctx->minor, ctx->major); + e = load_libcublas(ctx->major, ctx->minor); if (e != GA_NO_ERROR) return e; From 490db2861ec92a5c2dedf56b464d917613b4017a Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 16 Nov 2016 15:03:40 -0500 Subject: [PATCH 090/597] Don't allow atomics in cuBLAS. --- src/gpuarray_blas_cuda_cublas.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index f756c20651..88c12f8299 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -197,7 +197,6 @@ static int setup(gpucontext *c) { } cublasSetPointerMode(handle->h, CUBLAS_POINTER_MODE_HOST); - cublasSetAtomicsMode(handle->h, CUBLAS_ATOMICS_ALLOWED); types[0] = GA_BUFFER; types[1] = GA_SIZE; From 2c2da3aa000c47ea77c912fcdebc5aba9564e43e Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 17 Nov 2016 10:21:31 -0500 Subject: [PATCH 091/597] Update the documentation to reflect the new compilation requirements. --- doc/installation.rst | 56 ++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/doc/installation.rst b/doc/installation.rst index 3d948aa57d..55ddda41d5 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -5,41 +5,40 @@ The library is routinely tested on OS X and linux and, less frequently, on Windows. The OS most frequently tested are: - Debian 6 - - Ubuntu 14.04 - - Mac OS X 10.11 + - Ubuntu 16.04 + - Mac OS X 10.12 - Windows 7 It should also work on any decently recent OS not listed here. 
If you get an error during the build on your favorite OS, please report it and we will attempt to fix it. -Requirements ------------- +Build Requirements +------------------ - cmake >= 3.0 (cmake_). - a c99-compliant compiler (or MSVC if on windows). - - (optional) CUDA >= 6.5 (cuda_). - - (optional) NVIDIA NCCL (nccl_). - - (optional) OpenCL runtime. - - (optional) clBLAS (clblas_). - (optional) libcheck (check_) to run the C tests. - (optional) python (python_) for the python bindings. - (optional) mako (mako_) for development or running the python bindings. - (optional) Cython >= 0.21 (cython_) for the python bindings. - (optional) nosetests (nosetests_) to run the python tests. -.. note:: - If you have neither an OpenCL runtime or a CUDA runtime, the - library might still build, but will be rather useless. +Run Requirements +---------------- -.. note:: - We support CUDA GPUs with `compute capability 2.0 (Fermi) - `_ and up. +No matter what was available at build time, this library comes with +dynamic loaders for the following library. You don't need to have any +of this available, but you won't be able to use associated +functionality. -.. note:: - In the case you want to build with collective operation support for CUDA, - you will need CUDA GPUs with `compute capability 3.0 (Kepler) - `_ and up plus CUDA >= 7. + * For CUDA: + - CUDA (cuda_) version 7.0 or more, with the appropriate driver + - (optional) NCCL (nccl_) for the collectives interface + + * For OpenCL: + - OpenCL version 1.1 or more + - (optional) clBLAS (_clblas) or CLBlast (_clblast) for blas functionality Download -------- @@ -125,18 +124,8 @@ can also reboot the machine to do that. Mac-specific instructions ------------------------- -To get the compiler you need to install Xcode which is available for -free from the App Store. Don't forget to install the command-line -tools afterwards. 
- -On Xcode 4.x these are installed by going to the download tab of the -preferences window and selecting the "Command-line Tools" download. - -If you have Xcode 5, ensure you update to 5.0.2 or later. Prior -versions will not look in /usr/local for includes or libraries and -this will cause a lot of errors. You can update by using the -"Software Update..." function of the Apple menu or by running -'xcode-select --install' on the command line. +The only supported compile is the clang version that comes with Xcode. +Select the appropriate version of Xcode for you version of macOS. It might be possible to use a version of gcc built using Homebrew or MacPorts, but this is untested and unsupported. @@ -177,9 +166,8 @@ Running Tests everything is ok even if you intend on just using the C library. To run the C tests, enter the build directory (the one where you ran -cmake) and run 'make test'. It will run using the first OpenCL and -the first CUDA device it finds skipping these if the corresponding -backend wasn't built. +cmake), select a target device by exporting DEVICE (or +GPUARRAY_TEST_DEVICE) and run 'make test'. If you get an error message similar to this one: @@ -215,6 +203,8 @@ you can confirm which device it is running on. .. _clblas: https://github.com/clMathLibraries/clBLAS +.. _clblast: https://github.com/CNugteren/CLBlast + .. _cuda: https://developer.nvidia.com/category/zone/cuda-zone .. _nccl: https://github.com/NVIDIA/nccl From e56091d802763e433523095afcc08f6e03e48432 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 17 Nov 2016 11:07:23 -0500 Subject: [PATCH 092/597] Fix typo. --- doc/installation.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/installation.rst b/doc/installation.rst index 55ddda41d5..add2fbe675 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -6,7 +6,7 @@ frequently, on Windows. 
The OS most frequently tested are: - Debian 6 - Ubuntu 16.04 - - Mac OS X 10.12 + - macOS 10.12 - Windows 7 It should also work on any decently recent OS not listed here. If you @@ -124,8 +124,9 @@ can also reboot the machine to do that. Mac-specific instructions ------------------------- -The only supported compile is the clang version that comes with Xcode. -Select the appropriate version of Xcode for you version of macOS. +The only supported compiler is the clang version that comes with +Xcode. Select the appropriate version of Xcode for you version of +macOS. It might be possible to use a version of gcc built using Homebrew or MacPorts, but this is untested and unsupported. From 0783e8ea364ced96f37e765a71c5c5b4179cb675 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 17 Nov 2016 12:46:24 -0500 Subject: [PATCH 093/597] Remove extra spaces that were left. --- src/gpuarray_buffer_opencl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 2a6833649e..4b3f5fbfe2 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -132,18 +132,18 @@ cl_ctx *cl_make_ctx(cl_context ctx, int flags) { id = get_dev(ctx, NULL); if (id == NULL) return NULL; err = clGetDeviceInfo(id, CL_DEVICE_QUEUE_PROPERTIES, sizeof(qprop), - &qprop, NULL); + &qprop, NULL); if (err != CL_SUCCESS) return NULL; err = clGetDeviceInfo(id, CL_DEVICE_VENDOR, sizeof(vendor), vendor, NULL); if (err != CL_SUCCESS) return NULL; err = clGetDeviceInfo(id, CL_DEVICE_VENDOR_ID, sizeof(vendor_id), &vendor_id, - NULL); + NULL); if (err != CL_SUCCESS) return NULL; err = clGetDeviceInfo(id, CL_DRIVER_VERSION, sizeof(driver_version), - driver_version, NULL); + driver_version, NULL); if (err != CL_SUCCESS) return NULL; @@ -242,7 +242,7 @@ gpudata *cl_make_buf(gpucontext *c, cl_mem buf) { ASSERT_CTX(ctx); ctx->err = clGetMemObjectInfo(buf, CL_MEM_CONTEXT, sizeof(buf_ctx), - &buf_ctx, NULL); + &buf_ctx, 
NULL); if (ctx->err != CL_SUCCESS) return NULL; if (buf_ctx != ctx->ctx) return NULL; From 44806bf9d8b2174a4c311ddb66b0fcccdac0c906 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Thu, 17 Nov 2016 17:08:53 -0500 Subject: [PATCH 094/597] Raise exception to prevent issue #205. I suggest this PR to prevent the bug #205 where ielemwise2 does run when broadcast=True and output array has dimensions smaller than input array. I added a checking that raises an exception to avoid any case when the output array could be broadcasted, as suggested by davidweichiang on issue 205. I also added some tests related to this change: `DEVICE=cuda0 nosetests pygpu.tests.test_elemwise:test_ielemwise2_output_broadcast` @nouiz @abergeron --- pygpu/elemwise.py | 19 ++++++++++++++- pygpu/tests/test_elemwise.py | 47 ++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/pygpu/elemwise.py b/pygpu/elemwise.py index 348ee3fdd4..8b64ba9868 100644 --- a/pygpu/elemwise.py +++ b/pygpu/elemwise.py @@ -4,7 +4,7 @@ from . import gpuarray from ._elemwise import GpuElemwise, arg -__all__ = ['GpuElemwise', 'elemwise1', 'elemwise2', 'ielemwise2', 'compare'] +__all__ = ['GpuElemwise', 'elemwise1', 'elemwise2', 'ielemwise2', 'compare', 'BroadcastError'] def _dtype(o): @@ -76,12 +76,29 @@ def elemwise2(a, op, b, ary, odtype=None, oper=None, k(res, a, b, broadcast=broadcast) return res +class BroadcastError(ValueError): + pass def ielemwise2(a, op, b, oper=None, op_tmpl="a = a %(op)s b", broadcast=False, convert_f16=True): if not isinstance(b, gpuarray.GpuArray): b = numpy.asarray(b) + # We don't want to broadcast the output (a). + # So we raise an exception in any case when + # a could potentially be broadcasted to b. 
+ if broadcast: + if a.ndim < b.ndim: + raise BroadcastError("output has less dimensions " + "than input (%d vs %d)" % + (a.ndim, b.ndim)) + elif a.ndim == b.ndim: + for i in range(a.ndim): + if a.shape[i] < b.shape[i]: + raise BroadcastError("The dimension %d/%d in output is " + "smaller than the corresponding one " + "in input" % (i+1, a.ndim)) + a_arg = as_argument(a, 'a', read=True, write=True) b_arg = as_argument(b, 'b', read=True) diff --git a/pygpu/tests/test_elemwise.py b/pygpu/tests/test_elemwise.py index d6ab88d9b3..f5cb8e2b0c 100644 --- a/pygpu/tests/test_elemwise.py +++ b/pygpu/tests/test_elemwise.py @@ -2,6 +2,7 @@ import numpy from pygpu import gpuarray, ndgpuarray as elemary +from pygpu.elemwise import BroadcastError from six import PY2 @@ -59,6 +60,39 @@ def test_ielemwise2_ops_array(): yield ielemwise2_ops_array, op, dtype1, dtype2, (50,) +def test_ielemwise2_output_broadcast(): + for shapea, shapeb in [((2, 5), (3, 2, 5)), + ((1, 4), (6, 4)), + ((2, 1, 8, 7), (2, 2, 8, 7))]: + yield ielemwise2_output_broadcast_should_fail, shapea, shapeb + for shapea, shapeb in [((2, 5),(2, 5)), + ((6, 4),(1, 4)), + ((2, 2, 8, 7), (2, 1, 8, 7))]: + yield ielemwise2_output_broadcast_should_pass, shapea, shapeb + + +def ielemwise2_output_broadcast_should_fail(shapea, shapeb): + try: + ielemwise2_output_broadcast(shapea, shapeb) + except BroadcastError: + pass + except Exception as e: + # We must have a BroadcastError first, nothing else. + raise Exception("ielemwise2 should raise a BroadcastError " + "with shapes %s and %s." % (shapea, shapeb)) + else: + # We must have a BroadcastError, otherwise something's wrong. + raise Exception("ielemwise2 should raise a BroadcastError " + "with shapes %s and %s." 
% (shapea, shapeb)) + +def ielemwise2_output_broadcast_should_pass(shapea, shapeb): + try: + ielemwise2_output_broadcast(shapea, shapeb) + except Exception: + print ("Exception raised with shapes:", shapea, shapeb) + raise + + @guard_devsup def elemwise2_ops_array(op, dtype1, dtype2, shape): ac, ag = gen_gpuarray(shape, dtype1, ctx=context, cls=elemary) @@ -95,6 +129,19 @@ def ielemwise2_ops_array(op, dtype1, dtype2, shape): assert numpy.allclose(out_c, numpy.asarray(out_g), atol=1e-6) +@guard_devsup +def ielemwise2_output_broadcast(shapea, shapeb): + na, ga = gen_gpuarray(shapea, ctx=context, cls=elemary) + nb, gb = gen_gpuarray(shapeb, ctx=context, cls=elemary) + out_g = operator.iadd(ga, gb) + try: + out_n = operator.iadd(na, nb) + except TypeError: + return + assert out_g is ga + assert numpy.allclose(out_n, numpy.asarray(out_g), atol=1e-6) + + def test_elemwise_f16(): yield elemwise1_ops_array, operator.neg, 'float16' yield elemwise2_ops_array, operator.add, 'float16', 'float16', (50,) From 27ce191bd2a40dde5972b9ac0eee9ef6a6ae3fa0 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Mon, 21 Nov 2016 11:43:48 -0500 Subject: [PATCH 095/597] Move checking to C code in function check_basic(). Simplify tests. --- pygpu/elemwise.py | 19 +----------- pygpu/tests/test_elemwise.py | 59 +++++++++--------------------------- src/gpuarray_elemwise.c | 33 +++++++++++++++++++- 3 files changed, 48 insertions(+), 63 deletions(-) diff --git a/pygpu/elemwise.py b/pygpu/elemwise.py index 8b64ba9868..348ee3fdd4 100644 --- a/pygpu/elemwise.py +++ b/pygpu/elemwise.py @@ -4,7 +4,7 @@ from . 
import gpuarray from ._elemwise import GpuElemwise, arg -__all__ = ['GpuElemwise', 'elemwise1', 'elemwise2', 'ielemwise2', 'compare', 'BroadcastError'] +__all__ = ['GpuElemwise', 'elemwise1', 'elemwise2', 'ielemwise2', 'compare'] def _dtype(o): @@ -76,29 +76,12 @@ def elemwise2(a, op, b, ary, odtype=None, oper=None, k(res, a, b, broadcast=broadcast) return res -class BroadcastError(ValueError): - pass def ielemwise2(a, op, b, oper=None, op_tmpl="a = a %(op)s b", broadcast=False, convert_f16=True): if not isinstance(b, gpuarray.GpuArray): b = numpy.asarray(b) - # We don't want to broadcast the output (a). - # So we raise an exception in any case when - # a could potentially be broadcasted to b. - if broadcast: - if a.ndim < b.ndim: - raise BroadcastError("output has less dimensions " - "than input (%d vs %d)" % - (a.ndim, b.ndim)) - elif a.ndim == b.ndim: - for i in range(a.ndim): - if a.shape[i] < b.shape[i]: - raise BroadcastError("The dimension %d/%d in output is " - "smaller than the corresponding one " - "in input" % (i+1, a.ndim)) - a_arg = as_argument(a, 'a', read=True, write=True) b_arg = as_argument(b, 'b', read=True) diff --git a/pygpu/tests/test_elemwise.py b/pygpu/tests/test_elemwise.py index f5cb8e2b0c..a1c752fa7f 100644 --- a/pygpu/tests/test_elemwise.py +++ b/pygpu/tests/test_elemwise.py @@ -1,8 +1,9 @@ import operator import numpy +from unittest import TestCase from pygpu import gpuarray, ndgpuarray as elemary -from pygpu.elemwise import BroadcastError +from pygpu.elemwise import ielemwise2 from six import PY2 @@ -60,37 +61,20 @@ def test_ielemwise2_ops_array(): yield ielemwise2_ops_array, op, dtype1, dtype2, (50,) -def test_ielemwise2_output_broadcast(): - for shapea, shapeb in [((2, 5), (3, 2, 5)), - ((1, 4), (6, 4)), - ((2, 1, 8, 7), (2, 2, 8, 7))]: - yield ielemwise2_output_broadcast_should_fail, shapea, shapeb - for shapea, shapeb in [((2, 5),(2, 5)), - ((6, 4),(1, 4)), - ((2, 2, 8, 7), (2, 1, 8, 7))]: - yield 
ielemwise2_output_broadcast_should_pass, shapea, shapeb +class test_elemwise_rw_args_not_broadcasted(TestCase): + def test(self): + for shapea, shapeb in [((1, 4), (6, 4)), ((2, 1, 8, 7), (2, 2, 8, 7))]: + self.assertRaises(ValueError, self.run_ielemwise2, shapea, shapeb) + for shapea, shapeb in [((6, 4), (1, 4)), ((2, 2, 8, 7), (2, 1, 8, 7))]: + self.run_ielemwise2(shapea, shapeb) - -def ielemwise2_output_broadcast_should_fail(shapea, shapeb): - try: - ielemwise2_output_broadcast(shapea, shapeb) - except BroadcastError: - pass - except Exception as e: - # We must have a BroadcastError first, nothing else. - raise Exception("ielemwise2 should raise a BroadcastError " - "with shapes %s and %s." % (shapea, shapeb)) - else: - # We must have a BroadcastError, otherwise something's wrong. - raise Exception("ielemwise2 should raise a BroadcastError " - "with shapes %s and %s." % (shapea, shapeb)) - -def ielemwise2_output_broadcast_should_pass(shapea, shapeb): - try: - ielemwise2_output_broadcast(shapea, shapeb) - except Exception: - print ("Exception raised with shapes:", shapea, shapeb) - raise + @guard_devsup + def run_ielemwise2(self, shapea, shapeb): + na, ga = gen_gpuarray(shapea, ctx=context, cls=elemary) + nb, gb = gen_gpuarray(shapeb, ctx=context, cls=elemary) + ielemwise2(ga, '+', gb, broadcast=True) + na += nb + assert numpy.allclose(na, numpy.asarray(ga), atol=1e-6) @guard_devsup @@ -129,19 +113,6 @@ def ielemwise2_ops_array(op, dtype1, dtype2, shape): assert numpy.allclose(out_c, numpy.asarray(out_g), atol=1e-6) -@guard_devsup -def ielemwise2_output_broadcast(shapea, shapeb): - na, ga = gen_gpuarray(shapea, ctx=context, cls=elemary) - nb, gb = gen_gpuarray(shapeb, ctx=context, cls=elemary) - out_g = operator.iadd(ga, gb) - try: - out_n = operator.iadd(na, nb) - except TypeError: - return - assert out_g is ga - assert numpy.allclose(out_n, numpy.asarray(out_g), atol=1e-6) - - def test_elemwise_f16(): yield elemwise1_ops_array, operator.neg, 'float16' yield 
elemwise2_ops_array, operator.add, 'float16', 'float16', (50,) diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index 14a1db5556..e48c7c7d9e 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -31,6 +31,7 @@ struct _GpuElemwise { STATIC_ASSERT(GEN_CONVERT_F16 == GE_CONVERT_F16, same_flags_value_elem1); #define is_array(a) (ISCLR((a).flags, GE_SCALAR)) +#define is_rw_array(a) (ISSET((a).flags, GE_READ) && ISSET((a).flags, GE_WRITE)) static inline int k_initialized(GpuKernel *k) { return k->k != NULL; @@ -274,6 +275,7 @@ static int check_basic(GpuElemwise *ge, void **args, int flags, GpuArray *a = NULL, *v; unsigned int i, j, p, num_arrays = 0, nd = 0, nnd; int call32 = 1; + size_t read_write_arrays_found; /* Go through the list and grab some info */ for (i = 0; i < ge->n; i++) { @@ -321,20 +323,49 @@ static int check_basic(GpuElemwise *ge, void **args, int flags, n = 1; for (j = 0; j < nd; j++) { p = 0; + read_write_arrays_found = 0; for (i = 0; i < ge->n; i++) { if (is_array(ge->args[i])) { v = (GpuArray *)args[i]; - if (ge->dims[j] != v->dimensions[j]) { + if (ge->dims[j] == v->dimensions[j]) { + /* We count the number of read-write arrays found until now + * that are not broadcasted. */ + if (is_rw_array(ge->args[i])) + ++read_write_arrays_found; + } else { if (ISCLR(flags, GE_BROADCAST)) { return GA_VALUE_ERROR; } /* GE_BROADCAST is set */ if (ge->dims[j] == 1) { + if (read_write_arrays_found) { + /* There are read-write arrays before the current array, + * and their (j+1)th dimension equals 1, so they would be + * broadcasted. We don't want that. */ + #ifdef DEBUG + fprintf(stderr, "\r\n(check_basic(): read-write arrays should not be broadcasted) "); + #endif + return GA_VALUE_ERROR; + } + /* There are no read-write arrays before the current array. + * So broadcasting can be done safely. 
*/ ge->dims[j] = v->dimensions[j]; + /* We still count the current array if it's a read-write array + * (useless in the current implementation, but coherent). */ + if (is_rw_array(ge->args[i])) + ++read_write_arrays_found; } else { if (v->dimensions[j] != 1) { return GA_VALUE_ERROR; } + /* If the current array is a read-write array, + * we don't want it to be broadcasted. */ + if (is_rw_array(ge->args[i])) { + #ifdef DEBUG + fprintf(stderr, "\r\n(check_basic(): a read-write array should not be broadcasted) "); + #endif + return GA_VALUE_ERROR; + } } } /* If the dimension is 1 set the strides to 0 regardless since From 3e8c348f4787619f3bf2773b2408efa2d85ea163 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Mon, 21 Nov 2016 16:56:37 -0500 Subject: [PATCH 096/597] Now check write arguments (and not read-write only args as before). check_elemwise also extended with a few lines to add testing. --- src/gpuarray_elemwise.c | 43 ++++++++++++++++------------------------- tests/check_elemwise.c | 10 ++++++++-- 2 files changed, 25 insertions(+), 28 deletions(-) diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index e48c7c7d9e..3d040ced64 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -31,7 +31,7 @@ struct _GpuElemwise { STATIC_ASSERT(GEN_CONVERT_F16 == GE_CONVERT_F16, same_flags_value_elem1); #define is_array(a) (ISCLR((a).flags, GE_SCALAR)) -#define is_rw_array(a) (ISSET((a).flags, GE_READ) && ISSET((a).flags, GE_WRITE)) +#define is_output(a) (ISSET((a).flags, GE_WRITE)) static inline int k_initialized(GpuKernel *k) { return k->k != NULL; @@ -275,7 +275,7 @@ static int check_basic(GpuElemwise *ge, void **args, int flags, GpuArray *a = NULL, *v; unsigned int i, j, p, num_arrays = 0, nd = 0, nnd; int call32 = 1; - size_t read_write_arrays_found; + int output_ever_found; /* Go through the list and grab some info */ for (i = 0; i < ge->n; i++) { @@ -323,49 +323,40 @@ static int check_basic(GpuElemwise *ge, void **args, int flags, n = 1; for 
(j = 0; j < nd; j++) { p = 0; - read_write_arrays_found = 0; + output_ever_found = 0; for (i = 0; i < ge->n; i++) { if (is_array(ge->args[i])) { v = (GpuArray *)args[i]; if (ge->dims[j] == v->dimensions[j]) { - /* We count the number of read-write arrays found until now - * that are not broadcasted. */ - if (is_rw_array(ge->args[i])) - ++read_write_arrays_found; + /* We check if this array is an output. */ + output_ever_found = output_ever_found || is_output(ge->args[i]); } else { if (ISCLR(flags, GE_BROADCAST)) { return GA_VALUE_ERROR; } /* GE_BROADCAST is set */ if (ge->dims[j] == 1) { - if (read_write_arrays_found) { - /* There are read-write arrays before the current array, + if (output_ever_found) { + /* There are outputs before the current array, * and their (j+1)th dimension equals 1, so they would be * broadcasted. We don't want that. */ #ifdef DEBUG - fprintf(stderr, "\r\n(check_basic(): read-write arrays should not be broadcasted) "); + fprintf(stderr, " (check_basic(): outputs should not be broadcasted) "); #endif return GA_VALUE_ERROR; } - /* There are no read-write arrays before the current array. + /* There are no outputs before the current array. * So broadcasting can be done safely. */ ge->dims[j] = v->dimensions[j]; - /* We still count the current array if it's a read-write array + /* We still check if the current array is an output * (useless in the current implementation, but coherent). */ - if (is_rw_array(ge->args[i])) - ++read_write_arrays_found; - } else { - if (v->dimensions[j] != 1) { - return GA_VALUE_ERROR; - } - /* If the current array is a read-write array, - * we don't want it to be broadcasted. 
*/ - if (is_rw_array(ge->args[i])) { - #ifdef DEBUG - fprintf(stderr, "\r\n(check_basic(): a read-write array should not be broadcasted) "); - #endif - return GA_VALUE_ERROR; - } + output_ever_found = output_ever_found || is_output(ge->args[i]); + } else if (v->dimensions[j] != 1 || is_output(ge->args[i])) { + #ifdef DEBUG + if (is_output(ge->args[i])) + fprintf(stderr, " (check_basic(): an output should not be broadcasted) "); + #endif + return GA_VALUE_ERROR; } } /* If the dimension is 1 set the strides to 0 regardless since diff --git a/tests/check_elemwise.c b/tests/check_elemwise.c index d8893b8496..70cbd4b5a8 100644 --- a/tests/check_elemwise.c +++ b/tests/check_elemwise.c @@ -452,8 +452,8 @@ START_TEST(test_basic_broadcast) { ga_assert_ok(GpuArray_empty(&b, ctx, GA_UINT, 2, dims, GA_F_ORDER)); ga_assert_ok(GpuArray_write(&b, data2, sizeof(data2))); - dims[0] = 2; - dims[1] = 3; + dims[0] = 1; + dims[1] = 6; ga_assert_ok(GpuArray_empty(&c, ctx, GA_UINT, 2, dims, GA_C_ORDER)); @@ -479,6 +479,12 @@ START_TEST(test_basic_broadcast) { ck_assert_int_eq(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE), GA_VALUE_ERROR); + ck_assert_int_eq(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE | GE_BROADCAST), GA_VALUE_ERROR); + + dims[0] = 2; + dims[1] = 3; + + ga_assert_ok(GpuArray_reshape_inplace(&c, 2, dims, GA_ANY_ORDER)); ga_assert_ok(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE | GE_BROADCAST)); ga_assert_ok(GpuArray_read(data3, sizeof(data3), &c)); From a4e1b1bbed79d56ec1e0f1a43e4a54b56e7c6b69 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 22 Nov 2016 17:27:47 -0500 Subject: [PATCH 097/597] Remove the hackish API version number and add a proper SOVERSION. 
--- pygpu/gpuarray.pxd | 2 -- pygpu/gpuarray.pyx | 10 ---------- src/CMakeLists.txt | 1 + src/gpuarray/util.h | 3 --- src/gpuarray_util.c | 3 --- 5 files changed, 1 insertion(+), 18 deletions(-) diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd index c22504d4f3..0ebbb5d6f3 100644 --- a/pygpu/gpuarray.pxd +++ b/pygpu/gpuarray.pxd @@ -49,8 +49,6 @@ cdef extern from "gpuarray/types.h": GA_NBASE cdef extern from "gpuarray/util.h": - const int gpuarray_api_major - const int gpuarray_api_minor int gpuarray_register_type(gpuarray_type *t, int *ret) size_t gpuarray_get_elsize(int typecode) gpuarray_type *gpuarray_get_type(int typecode) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 8e45f1f227..a25c11c791 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -9,10 +9,6 @@ import numpy as np from cpython cimport Py_INCREF, PyNumber_Index from cpython.object cimport Py_EQ, Py_NE -def api_version(): - # major, minor, py - return (gpuarray_api_major, gpuarray_api_minor, 0) - np.import_array() # to export the numeric value @@ -564,12 +560,6 @@ def init(dev, sched='default', disable_alloc_cache=False, single_stream=False): are no gaps in the valid numbers. """ cdef int flags = 0 - expected_version = -9997 - if gpuarray_api_major != expected_version or gpuarray_api_minor < 0: - raise RuntimeError( - "Pygpu was expecting libgpuarray version %d, but %d is available. 
" - "Recompile it to avoid problems.", - expected_version, gpuarray_api_major) if sched == 'single': flags |= GA_CTX_SINGLE_THREAD elif sched == 'multi': diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 624a856a91..a3f0dd27ef 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -87,6 +87,7 @@ set_target_properties(gpuarray PROPERTIES COMPILE_FLAGS "-DGPUARRAY_BUILDING_DLL -DGPUARRAY_SHARED" INSTALL_NAME_DIR ${CMAKE_INSTALL_PREFIX}/lib MACOSX_RPATH OFF + VERSION 0.0 ) add_library(gpuarray-static STATIC ${GPUARRAY_SRC}) diff --git a/src/gpuarray/util.h b/src/gpuarray/util.h index e92919b538..04761ab37c 100644 --- a/src/gpuarray/util.h +++ b/src/gpuarray/util.h @@ -15,9 +15,6 @@ extern "C" { #include #include -extern GPUARRAY_PUBLIC const int gpuarray_api_major; -extern GPUARRAY_PUBLIC const int gpuarray_api_minor; - /** * Registers a type with the kernel machinery. * diff --git a/src/gpuarray_util.c b/src/gpuarray_util.c index d0e134a592..5b2ccc2797 100644 --- a/src/gpuarray_util.c +++ b/src/gpuarray_util.c @@ -13,9 +13,6 @@ * phase. Once we go stable, this will move to 0 and go up from * there. */ -const int gpuarray_api_major = -9997; -const int gpuarray_api_minor = 1; - static gpuarray_type **custom_types = NULL; static int n_types = 0; static gpuarray_type no_type = {NULL, 0, 0, -1}; From c9552a68efd77b1190b9754563bf79aa403ea719 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 22 Nov 2016 17:48:19 -0500 Subject: [PATCH 098/597] Add a backward compat api_version() method. --- pygpu/gpuarray.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index a25c11c791..c39ac449bf 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -9,6 +9,10 @@ import numpy as np from cpython cimport Py_INCREF, PyNumber_Index from cpython.object cimport Py_EQ, Py_NE +def api_version(): + # Those where the last defined numbers. 
+ return (-9997, 1, 0) + np.import_array() # to export the numeric value From 6e184574c58d7ffe084d2e24248ce39f0b8f0a08 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 23 Nov 2016 14:36:46 -0500 Subject: [PATCH 099/597] Allow writing to strides in python. --- pygpu/gpuarray.pyx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index c39ac449bf..a2c35b815d 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -1937,6 +1937,13 @@ cdef class GpuArray: res[i] = self.ga.strides[i] return tuple(res) + def __set__(self, newstrides): + cdef unsigned int i + if len(newstrides) != self.ga.nd: + raise ValueError("new strides are the wrong length") + for i in range(self.ga.nd): + self.ga.strides[i] = newstrides[i] + property ndim: "The number of dimensions in this object" def __get__(self): From 3c55c60fb9c385bb95ce74b1546b6e70a7870ae3 Mon Sep 17 00:00:00 2001 From: khaotik Date: Wed, 23 Nov 2016 14:42:06 -0500 Subject: [PATCH 100/597] added error message for CUDA JIT linking --- src/gpuarray_buffer_cuda.c | 56 ++++++++++++++++++++++++++++++++++---- src/loaders/libcuda.fn | 3 +- src/loaders/libcuda.h | 22 +++++++++++++++ 3 files changed, 75 insertions(+), 6 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index e90983929a..e393ab5512 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -984,6 +984,22 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, CUdevice dev; unsigned int i; int major, minor; + strb debug_msg = STRB_STATIC_INIT; + + // options for cuModuleLoadDataEx + const size_t cujit_log_size = 4096; + char *cujit_info_log = NULL; + unsigned int num_cujit_opts = 4; + CUjit_option cujit_opts[] = { + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + CU_JIT_INFO_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + CU_JIT_ERROR_LOG_BUFFER + }; + void *cujit_opt_vals[] = { + (void*)(size_t)cujit_log_size, NULL, + (void*)(size_t)cujit_log_size, NULL, + 
}; if (count == 0) FAIL(NULL, GA_VALUE_ERROR); @@ -1069,10 +1085,9 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, &log, &log_len, ret); if (bin == NULL) { if (err_str != NULL) { - strb debug_msg = STRB_STATIC_INIT; // We're substituting debug_msg for a string with this first line: - strb_appends(&debug_msg, "CUDA kernel build failure ::\n"); + strb_appends(&debug_msg, "CUDA kernel compile failure ::\n"); /* Delete the final NUL */ sb.l--; @@ -1089,7 +1104,7 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, } strb_clear(&sb); cuda_exit(ctx); - return NULL; + FAIL(NULL, GA_IMPL_ERROR); } } @@ -1122,15 +1137,46 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, FAIL(NULL, GA_MEMORY_ERROR); } - ctx->err = cuModuleLoadData(&res->m, bin); + // for both info/err log + cujit_info_log = (char*)malloc(2*cujit_log_size*sizeof(char)); + if(cujit_info_log == NULL) { + _cuda_freekernel(res); + strb_clear(&sb); + cuda_exit(ctx); + FAIL(NULL, GA_MEMORY_ERROR); + } + cujit_info_log[0] = 0; + cujit_info_log[cujit_log_size] = 0; + cujit_opt_vals[1] = (void*)cujit_info_log; + cujit_opt_vals[3] = (void*)(cujit_info_log+cujit_log_size); + + ctx->err = cuModuleLoadDataEx( + &res->m, bin, + num_cujit_opts, cujit_opts, (void**)cujit_opt_vals); if (ctx->err != CUDA_SUCCESS) { + if (err_str != NULL) { + strb_appends(&debug_msg, "CUDA kernel link failure::\n"); + if (cujit_info_log[0]) { + strb_appends(&debug_msg, "\nLinker msg:\n"); + strb_appends(&debug_msg, cujit_info_log); + } + if (cujit_info_log[cujit_log_size]) { + strb_appends(&debug_msg, "\nLinker error log:\n"); + strb_appends(&debug_msg, cujit_info_log+cujit_log_size); + } + strb_append0(&debug_msg); + *err_str = strb_cstr(&debug_msg); + } + free(cujit_info_log); _cuda_freekernel(res); strb_clear(&sb); cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } + free(cujit_info_log); + ctx->err = cuModuleGetFunction(&res->k, res->m, fname); if (ctx->err != CUDA_SUCCESS) { 
_cuda_freekernel(res); @@ -1352,7 +1398,7 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, *((char **)res) = s; cuda_exit(ctx); return GA_NO_ERROR; - + case GA_CTX_PROP_PCIBUSID: cuda_enter(ctx); ctx->err = cuCtxGetDevice(&id); diff --git a/src/loaders/libcuda.fn b/src/loaders/libcuda.fn index 4a22853442..487706f4f3 100644 --- a/src/loaders/libcuda.fn +++ b/src/loaders/libcuda.fn @@ -18,6 +18,7 @@ DEF_PROC_V2(cuCtxPushCurrent, (CUcontext ctx)); DEF_PROC_V2(cuCtxPopCurrent, (CUcontext *pctx)); DEF_PROC(cuModuleLoadData, (CUmodule *module, const void *image)); +DEF_PROC(cuModuleLoadDataEx, (CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues)); DEF_PROC(cuModuleUnload, (CUmodule hmod)); DEF_PROC(cuModuleGetFunction, (CUfunction *hfunc, CUmodule hmod, const char *name)); @@ -50,4 +51,4 @@ DEF_PROC_V2(cuStreamDestroy, (CUstream hStream)); DEF_PROC(cuIpcGetMemHandle, (CUipcMemHandle *pHandle, CUdeviceptr dptr)); DEF_PROC(cuIpcOpenMemHandle, (CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags)); -DEF_PROC(cuIpcCloseMemHandle, (CUdeviceptr dptr)); \ No newline at end of file +DEF_PROC(cuIpcCloseMemHandle, (CUdeviceptr dptr)); diff --git a/src/loaders/libcuda.h b/src/loaders/libcuda.h index 865d86b560..e62f8b85df 100644 --- a/src/loaders/libcuda.h +++ b/src/loaders/libcuda.h @@ -29,6 +29,7 @@ typedef enum CUfunction_attribute_enum CUfunction_attribute; typedef enum CUevent_flags_enum CUevent_flags; typedef enum CUctx_flags_enum CUctx_flags; typedef enum CUipcMem_flags_enum CUipcMem_flags; +typedef enum CUjit_option_enum CUjit_option; #define CU_IPC_HANDLE_SIZE 64 @@ -184,4 +185,25 @@ enum CUipcMem_flags_enum { CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 }; +enum CUjit_option_enum { + CU_JIT_MAX_REGISTERS = 0, + CU_JIT_THREADS_PER_BLOCK, + CU_JIT_WALL_TIME, + CU_JIT_INFO_LOG_BUFFER, + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + CU_JIT_ERROR_LOG_BUFFER, + 
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + CU_JIT_OPTIMIZATION_LEVEL, + CU_JIT_TARGET_FROM_CUCONTEXT, + CU_JIT_TARGET, + CU_JIT_FALLBACK_STRATEGY, + CU_JIT_GENERATE_DEBUG_INFO, + CU_JIT_LOG_VERBOSE, + CU_JIT_GENERATE_LINE_INFO, + CU_JIT_CACHE_MODE, + CU_JIT_NEW_SM3X_OPT, + CU_JIT_FAST_COMPILE, + CU_JIT_NUM_OPTIONS +}; + #endif From 690827dd62f19b61f7bba76fe6559b7edbb702d4 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 23 Nov 2016 14:51:49 -0500 Subject: [PATCH 101/597] Add a comment to clarify that VERSION is the shared library version. --- src/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a3f0dd27ef..622146508c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -87,6 +87,7 @@ set_target_properties(gpuarray PROPERTIES COMPILE_FLAGS "-DGPUARRAY_BUILDING_DLL -DGPUARRAY_SHARED" INSTALL_NAME_DIR ${CMAKE_INSTALL_PREFIX}/lib MACOSX_RPATH OFF + # This is the shared library version VERSION 0.0 ) From 2a781778cf03f5efac8f57eb3bc88a8168344409 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 23 Nov 2016 16:07:13 -0500 Subject: [PATCH 102/597] Add function to fix the flags of a GpuArray. --- src/gpuarray/array.h | 7 ++++++ src/gpuarray_array.c | 56 ++++++++++++++------------------------------ 2 files changed, 24 insertions(+), 39 deletions(-) diff --git a/src/gpuarray/array.h b/src/gpuarray/array.h index aabf14ee02..966290be1d 100644 --- a/src/gpuarray/array.h +++ b/src/gpuarray/array.h @@ -196,6 +196,13 @@ static inline int GpuArray_CHKFLAGS(const GpuArray *a, int flags) { */ #define GpuArray_ITEMSIZE(a) gpuarray_get_elsize((a)->typecode) +/** + * Fix the flags of an array using the current strides and shape. + * + * \param a GpuArray to fix flags for + */ +GPUARRAY_PUBLIC void GpuArray_fix_flags(GpuArray *a); + /** * Initialize and allocate a new empty (uninitialized data) array. 
* diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index 07e3a7fd9e..3e33f58719 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -85,9 +85,17 @@ static int ga_extcopy(GpuArray *dst, const GpuArray *src) { /* Value below which a size_t multiplication will never overflow. */ #define MUL_NO_OVERFLOW (1UL << (sizeof(size_t) * 4)) -int GpuArray_empty(GpuArray *a, gpucontext *ctx, - int typecode, unsigned int nd, const size_t *dims, - ga_order ord) { +void GpuArray_fix_flags(GpuArray *a) { + /* Only keep the writable flag */ + a->flags &= GA_WRITEABLE; + /* Set the other flags if applicable */ + if (GpuArray_is_c_contiguous(a)) a->flags |= GA_C_CONTIGUOUS; + if (GpuArray_is_f_contiguous(a)) a->flags |= GA_F_CONTIGUOUS; + if (GpuArray_is_aligned(a)) a->flags |= GA_ALIGNED; +} + +int GpuArray_empty(GpuArray *a, gpucontext *ctx, int typecode, + unsigned int nd, const size_t *dims, ga_order ord) { size_t size = gpuarray_get_elsize(typecode); unsigned int i; int res = GA_NO_ERROR; @@ -185,9 +193,7 @@ int GpuArray_fromdata(GpuArray *a, gpudata *data, size_t offset, int typecode, memcpy(a->dimensions, dims, nd*sizeof(size_t)); memcpy(a->strides, strides, nd*sizeof(ssize_t)); - if (GpuArray_is_c_contiguous(a)) a->flags |= GA_C_CONTIGUOUS; - if (GpuArray_is_f_contiguous(a)) a->flags |= GA_F_CONTIGUOUS; - if (GpuArray_is_aligned(a)) a->flags |= GA_ALIGNED; + GpuArray_fix_flags(a); return GA_NO_ERROR; } @@ -304,18 +310,7 @@ int GpuArray_index_inplace(GpuArray *a, const ssize_t *starts, a->dimensions = newdims; free(a->strides); a->strides = newstrs; - if (GpuArray_is_c_contiguous(a)) - a->flags |= GA_C_CONTIGUOUS; - else - a->flags &= ~GA_C_CONTIGUOUS; - if (GpuArray_is_f_contiguous(a)) - a->flags |= GA_F_CONTIGUOUS; - else - a->flags &= ~GA_F_CONTIGUOUS; - if (GpuArray_is_aligned(a)) - a->flags |= GA_ALIGNED; - else - a->flags &= ~GA_ALIGNED; + GpuArray_fix_flags(a); return GA_NO_ERROR; } @@ -582,9 +577,8 @@ int GpuArray_setarray(GpuArray *a, const GpuArray 
*v) { tv.nd = a->nd; tv.dimensions = a->dimensions; tv.strides = strs; - /* This could be optiomized by setting the right flags */ if (tv.nd != 0) - tv.flags &= ~(GA_C_CONTIGUOUS|GA_F_CONTIGUOUS); + GpuArray_fix_flags(&tv); err = ga_extcopy(a, &tv); free(strs); return err; @@ -745,18 +739,7 @@ int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd, a->strides = newstrides; fix_flags: - if (GpuArray_is_c_contiguous(a)) - a->flags |= GA_C_CONTIGUOUS; - else - a->flags &= ~GA_C_CONTIGUOUS; - if (GpuArray_is_f_contiguous(a)) - a->flags |= GA_F_CONTIGUOUS; - else - a->flags &= ~GA_F_CONTIGUOUS; - if (GpuArray_is_aligned(a)) - a->flags |= GA_ALIGNED; - else - a->flags &= ~GA_ALIGNED; + GpuArray_fix_flags(a); return GA_NO_ERROR; } @@ -808,11 +791,7 @@ int GpuArray_transpose_inplace(GpuArray *a, const unsigned int *new_axes) { a->dimensions = newdims; a->strides = newstrs; - a->flags &= ~(GA_C_CONTIGUOUS|GA_F_CONTIGUOUS); - if (GpuArray_is_c_contiguous(a)) - a->flags |= GA_C_CONTIGUOUS; - if (GpuArray_is_f_contiguous(a)) - a->flags |= GA_F_CONTIGUOUS; + GpuArray_fix_flags(a); return GA_NO_ERROR; } @@ -1016,10 +995,9 @@ int GpuArray_concatenate(GpuArray *r, const GpuArray **as, size_t n, res_off = r->offset; res_dims = r->dimensions; res_flags = r->flags; - /* This could be optimized by setting the right flags */ - r->flags &= ~(GA_C_CONTIGUOUS|GA_F_CONTIGUOUS); for (i = 0; i < n; i++) { r->dimensions = as[i]->dimensions; + GpuArray_fix_flags(r); err = ga_extcopy(r, as[i]); if (err != GA_NO_ERROR) { r->dimensions = res_dims; From 9b0d328e01095f3e040b4bde54ffd27a6acce9d4 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 23 Nov 2016 16:11:13 -0500 Subject: [PATCH 103/597] Wrap GpuArray_fix_flags and use it. 
--- pygpu/gpuarray.pxd | 1 + pygpu/gpuarray.pyx | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd index 0ebbb5d6f3..34b9f11836 100644 --- a/pygpu/gpuarray.pxd +++ b/pygpu/gpuarray.pxd @@ -149,6 +149,7 @@ cdef extern from "gpuarray/array.h": ctypedef enum ga_order: GA_ANY_ORDER, GA_C_ORDER, GA_F_ORDER + void GpuArray_fix_flags(_GpuArray *a) int GpuArray_empty(_GpuArray *a, gpucontext *ctx, int typecode, int nd, const size_t *dims, ga_order ord) int GpuArray_fromdata(_GpuArray *a, diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index a2c35b815d..79760d0dea 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -234,6 +234,9 @@ cdef bint py_CHKFLAGS(GpuArray a, int flags): cdef bint py_ISONESEGMENT(GpuArray a): return GpuArray_ISONESEGMENT(&a.ga) +cdef void array_fix_flags(GpuArray a): + GpuArray_fix_flags(&a.ga) + cdef int array_empty(GpuArray a, gpucontext *ctx, int typecode, unsigned int nd, const size_t *dims, ga_order ord) except -1: @@ -1943,6 +1946,7 @@ cdef class GpuArray: raise ValueError("new strides are the wrong length") for i in range(self.ga.nd): self.ga.strides[i] = newstrides[i] + array_fix_flags(self) property ndim: "The number of dimensions in this object" From e9b39d8050d0c6cdded4663157e433638051d3b2 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 23 Nov 2016 16:35:01 -0500 Subject: [PATCH 104/597] Fix cuda_init returning NULL with no error set. 
--- src/gpuarray_buffer_cuda.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index e90983929a..394e365dd0 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -399,8 +399,9 @@ static gpucontext *cuda_init(int ord, int flags, int *ret) { int r; r = setup_lib(); - if (r != GA_NO_ERROR) - return NULL; + if (r != GA_NO_ERROR) { + FAIL(NULL, r); + } if (ord == -1) { int i, c; From 8121c2e668dbd0ab7056c2db37ad2315f2c6eb2b Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 24 Nov 2016 15:19:21 -0500 Subject: [PATCH 105/597] Add a check that new strides don't go outside allocated memory and add some tests. --- pygpu/gpuarray.pxd | 4 ++++ pygpu/gpuarray.pyx | 28 ++++++++++++++++++++++++++++ pygpu/tests/test_gpu_ndarray.py | 27 +++++++++++++++++++++++++++ 3 files changed, 59 insertions(+) diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd index 34b9f11836..c205d8f484 100644 --- a/pygpu/gpuarray.pxd +++ b/pygpu/gpuarray.pxd @@ -73,6 +73,7 @@ cdef extern from "gpuarray/buffer.h": gpucontext *gpucontext_init(const char *name, int devno, int flags, int *ret) void gpucontext_deref(gpucontext *ctx) char *gpucontext_error(gpucontext *ctx, int err) + int gpudata_property(gpudata *ctx, int prop_id, void *res) int gpucontext_property(gpucontext *ctx, int prop_id, void *res) int gpukernel_property(gpukernel *k, int prop_id, void *res) gpucontext *gpudata_context(gpudata *) @@ -99,6 +100,9 @@ cdef extern from "gpuarray/buffer.h": int GA_CTX_PROP_MAXGSIZE0 int GA_CTX_PROP_MAXGSIZE1 int GA_CTX_PROP_MAXGSIZE2 + + int GA_BUFFER_PROP_SIZE + int GA_KERNEL_PROP_MAXLSIZE int GA_KERNEL_PROP_PREFLSIZE int GA_KERNEL_PROP_NUMARGS diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 79760d0dea..060636c8a7 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -212,6 +212,32 @@ cdef ga_order to_ga_order(ord) except -2: else: raise ValueError, "Valid orders are: 'A' (any), 'C' 
(C), 'F' (Fortran)" +cdef int strides_ok(GpuArray a, strides): + cdef ssize_t max_axis_offset + cdef size_t lower = a.ga.offset + cdef size_t upper = a.ga.offset + cdef size_t itemsize = gpuarray_get_elsize(a.ga.typecode) + cdef size_t size + cdef unsigned int i + + gpudata_property(a.ga.data, GA_BUFFER_PROP_SIZE, &size) + + for i in range(a.ga.nd): + if a.ga.dimensions[i] == 0: + return 1 + + max_axis_offset = strides[i] * (a.ga.dimensions[i] - 1) + if max_axis_offset > 0: + if upper + max_axis_offset > size: + return 0 + upper += max_axis_offset + else: + if lower < -max_axis_offset: + return 0 + lower += max_axis_offset + return (upper + itemsize) <= size + + class GpuArrayException(Exception): """ Exception used for most errors related to libgpuarray. @@ -1944,6 +1970,8 @@ cdef class GpuArray: cdef unsigned int i if len(newstrides) != self.ga.nd: raise ValueError("new strides are the wrong length") + if not strides_ok(self, newstrides): + raise ValueError("new strides go outside of allocated memory") for i in range(self.ga.nd): self.ga.strides[i] = newstrides[i] array_fix_flags(self) diff --git a/pygpu/tests/test_gpu_ndarray.py b/pygpu/tests/test_gpu_ndarray.py index 6685fc8274..34222b4e37 100644 --- a/pygpu/tests/test_gpu_ndarray.py +++ b/pygpu/tests/test_gpu_ndarray.py @@ -8,6 +8,7 @@ import numpy +from nose.tools import assert_raises import pygpu from pygpu.gpuarray import GpuArray, GpuContext, GpuKernel @@ -448,6 +449,32 @@ def reshape(shps, offseted, order1, order2): assert numpy.allclose(outc, numpy.asarray(outg)) +def test_strides(): + yield strides_, (4, 4), 'c', 1, (4, 4) + yield strides_, (4, 4), 'c', 1, (4, 16) + yield strides_, (4, 4), 'c', 1, (16, 4) + yield strides_, (4, 4), 'c', 1, (16, 8) + yield strides_, (4, 4), 'c', 1, (16, 0) + yield strides_, (4, 4), 'c', -1, (-20, 4) + yield strides_, (4, 4), 'c', -1, (-12, 4) + + +def set_strides(a, newstr): + a.strides = newstr + + +def strides_(shp, order, sliced, newstr): + ac, ag = gen_gpuarray(shp, 
'float32', sliced=sliced, order=order, ctx=ctx) + try: + ac.strides = newstr + except ValueError: + assert_raises(ValueError, set_strides, ag, newstr) + return + ag.strides = newstr + check_flags(ag, ac) + assert numpy.allclose(ac, numpy.asarray(ag)) + + def test_transpose(): for shp in [(2, 3), (4, 8, 9), (1, 2, 3, 4)]: for offseted in [True, False]: From cda0d46070ea50ceba78a3125ed3765280a091ce Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 24 Nov 2016 16:37:56 -0500 Subject: [PATCH 106/597] Add documentation for strides_ok(). --- pygpu/gpuarray.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 060636c8a7..8deb2d279a 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -213,6 +213,9 @@ cdef ga_order to_ga_order(ord) except -2: raise ValueError, "Valid orders are: 'A' (any), 'C' (C), 'F' (Fortran)" cdef int strides_ok(GpuArray a, strides): + # Check that the passed in strides will not go outside of the + # memory of the array. It is assumed that the strides are of the + # proper length. cdef ssize_t max_axis_offset cdef size_t lower = a.ga.offset cdef size_t upper = a.ga.offset From c95c9ca65653cf8f187786b6527559fcaeaf6cb6 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 24 Nov 2016 17:36:24 -0500 Subject: [PATCH 107/597] Simplify code to check for broadcastable outputs. 
--- src/gpuarray_elemwise.c | 46 +++++++++-------------------------------- 1 file changed, 10 insertions(+), 36 deletions(-) diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index 3d040ced64..6037978f6f 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -275,13 +275,12 @@ static int check_basic(GpuElemwise *ge, void **args, int flags, GpuArray *a = NULL, *v; unsigned int i, j, p, num_arrays = 0, nd = 0, nnd; int call32 = 1; - int output_ever_found; /* Go through the list and grab some info */ for (i = 0; i < ge->n; i++) { if (is_array(ge->args[i])) { num_arrays++; - if (a == NULL) { + if (a == NULL || !is_output(a)) { a = (GpuArray *)args[i]; nd = a->nd; } @@ -290,7 +289,8 @@ static int check_basic(GpuElemwise *ge, void **args, int flags, } } - if (a == NULL) + /* No output arrays, this is an error */ + if (a == NULL || !is_output(a)) return GA_VALUE_ERROR; /* Check if we need to grow the internal buffers */ @@ -302,7 +302,7 @@ static int check_basic(GpuElemwise *ge, void **args, int flags, } /* Now we know that all array arguments have the same number of - dimensions */ + dimensions and that the expected output size is the size of a */ /* And copy their initial values in */ memcpy(ge->dims, a->dimensions, nd*sizeof(size_t)); @@ -315,47 +315,21 @@ static int check_basic(GpuElemwise *ge, void **args, int flags, } /* Check that all arrays are the same size (or broadcast-compatible - if GE_BROADCAST). Also compute the total size and adjust strides - of broadcastable dimensions. + if GE_BROADCAST), adjust strides of broadcastable dimensions and + check if we can use the 32 bit address version. Basically for each dimension go over all the arguments and make sure that the dimension size matches. */ n = 1; for (j = 0; j < nd; j++) { p = 0; - output_ever_found = 0; for (i = 0; i < ge->n; i++) { if (is_array(ge->args[i])) { v = (GpuArray *)args[i]; - if (ge->dims[j] == v->dimensions[j]) { - /* We check if this array is an output. 
*/ - output_ever_found = output_ever_found || is_output(ge->args[i]); - } else { - if (ISCLR(flags, GE_BROADCAST)) { - return GA_VALUE_ERROR; - } - /* GE_BROADCAST is set */ - if (ge->dims[j] == 1) { - if (output_ever_found) { - /* There are outputs before the current array, - * and their (j+1)th dimension equals 1, so they would be - * broadcasted. We don't want that. */ - #ifdef DEBUG - fprintf(stderr, " (check_basic(): outputs should not be broadcasted) "); - #endif - return GA_VALUE_ERROR; - } - /* There are no outputs before the current array. - * So broadcasting can be done safely. */ - ge->dims[j] = v->dimensions[j]; - /* We still check if the current array is an output - * (useless in the current implementation, but coherent). */ - output_ever_found = output_ever_found || is_output(ge->args[i]); - } else if (v->dimensions[j] != 1 || is_output(ge->args[i])) { - #ifdef DEBUG - if (is_output(ge->args[i])) - fprintf(stderr, " (check_basic(): an output should not be broadcasted) "); - #endif + if (ge->dims[j] != v->dimensions[j]) { + /* We can't broadcast outputs */ + if (ISCLR(flags, GE_BROADCAST) || is_output(v) || + v->dimensions[j] != 1) { return GA_VALUE_ERROR; } } From b615a76c0f8a2b07035f675b595e85da587d7403 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 25 Nov 2016 11:33:35 -0500 Subject: [PATCH 108/597] Fix calling elemwise (basic) with scalars in the middle. 
--- src/gpuarray_elemwise.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index 14a1db5556..c6833cd306 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -371,7 +371,7 @@ static int call_basic(GpuElemwise *ge, void **args, size_t n, unsigned int nd, size_t *dims, ssize_t **strs, int call32) { GpuKernel *k; size_t ls = 0, gs = 0; - unsigned int p = 0, i, j; + unsigned int p = 0, i, j, l; int err; if (nd == 0) return GA_VALUE_ERROR; @@ -398,6 +398,8 @@ static int call_basic(GpuElemwise *ge, void **args, size_t n, unsigned int nd, if (err != GA_NO_ERROR) goto error; } + /* l is the number of arrays to date */ + l = 0; for (j = 0; j < ge->n; j++) { if (is_array(ge->args[j])) { GpuArray *v = (GpuArray *)args[j]; @@ -406,9 +408,10 @@ static int call_basic(GpuElemwise *ge, void **args, size_t n, unsigned int nd, err = GpuKernel_setarg(k, p++, &v->offset); if (err != GA_NO_ERROR) goto error; for (i = 0; i < nd; i++) { - err = GpuKernel_setarg(k, p++, &strs[j][i]); + err = GpuKernel_setarg(k, p++, &strs[l][i]); if (err != GA_NO_ERROR) goto error; } + l++; } else { err = GpuKernel_setarg(k, p++, args[j]); if (err != GA_NO_ERROR) goto error; From 5c4d168ee4f5bbbe5e78462186be57c6ca9f7c53 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 25 Nov 2016 11:45:53 -0500 Subject: [PATCH 109/597] Add a testcase for elemwise basic where there is a scalar in the middle of the arguments. 
--- tests/check_elemwise.c | 73 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/tests/check_elemwise.c b/tests/check_elemwise.c index d8893b8496..af19b57c15 100644 --- a/tests/check_elemwise.c +++ b/tests/check_elemwise.c @@ -362,6 +362,78 @@ START_TEST(test_basic_offset) { } END_TEST +START_TEST(test_basic_scalar) { + GpuArray a; + GpuArray b; + GpuArray c; + uint32_t x = 2; + + GpuElemwise *ge; + + static const uint32_t data1[3] = {1, 2, 3}; + static const uint32_t data2[3] = {4, 5}; + uint32_t data3[6] = {0}; + + size_t dims[2]; + + gpuelemwise_arg args[4] = {{0}}; + void *rargs[4]; + + dims[0] = 1; + dims[1] = 3; + + ga_assert_ok(GpuArray_empty(&a, ctx, GA_UINT, 2, dims, GA_C_ORDER)); + ga_assert_ok(GpuArray_write(&a, data1, sizeof(data1))); + + dims[0] = 2; + dims[1] = 1; + + ga_assert_ok(GpuArray_empty(&b, ctx, GA_UINT, 2, dims, GA_F_ORDER)); + ga_assert_ok(GpuArray_write(&b, data2, sizeof(data2))); + + dims[1] = 3; + + ga_assert_ok(GpuArray_empty(&c, ctx, GA_UINT, 2, dims, GA_C_ORDER)); + + args[0].name = "a"; + args[0].typecode = GA_UINT; + args[0].flags = GE_READ; + + args[1].name = "x"; + args[1].typecode = GA_UINT; + args[1].flags = GE_SCALAR; + + args[2].name = "b"; + args[2].typecode = GA_UINT; + args[2].flags = GE_READ; + + args[3].name = "c"; + args[3].typecode = GA_UINT; + args[3].flags = GE_WRITE; + + ge = GpuElemwise_new(ctx, "", "c = a + x * b", 4, args, 2, 0); + + ck_assert_ptr_ne(ge, NULL); + + rargs[0] = &a; + rargs[1] = &x; + rargs[2] = &b; + rargs[3] = &c; + + ga_assert_ok(GpuElemwise_call(ge, rargs, GE_BROADCAST)); + + ga_assert_ok(GpuArray_read(data3, sizeof(data3), &c)); + + ck_assert_int_eq(data3[0], 9); + ck_assert_int_eq(data3[1], 10); + ck_assert_int_eq(data3[2], 11); + + ck_assert_int_eq(data3[3], 11); + ck_assert_int_eq(data3[4], 12); + ck_assert_int_eq(data3[5], 13); +} +END_TEST + START_TEST(test_basic_remove1) { GpuArray a; GpuArray b; @@ -680,6 +752,7 @@ Suite *get_suite(void) { 
tcase_add_checked_fixture(tc, setup, teardown); tcase_add_test(tc, test_basic_simple); tcase_add_test(tc, test_basic_f16); + tcase_add_test(tc, test_basic_scalar); tcase_add_test(tc, test_basic_offset); tcase_add_test(tc, test_basic_remove1); tcase_add_test(tc, test_basic_broadcast); From 55e96ec7d7d3ca70a1140b97973ebf64d007e94a Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 25 Nov 2016 15:32:01 -0500 Subject: [PATCH 110/597] Add offset handling for indices and result in take1() --- src/gpuarray_array.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index 3e33f58719..eef077e6d5 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -336,7 +336,7 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str, int flags = GA_USE_CLUDA; int res; - nargs = 7 + 2 * v->nd; + nargs = 9 + 2 * v->nd; atypes = calloc(nargs, sizeof(int)); if (atypes == NULL) @@ -351,11 +351,12 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str, } apos = 0; - strb_appendf(&sb, "KERNEL void take1(GLOBAL_MEM %s *r, " - "GLOBAL_MEM const %s *v, ga_size off,", + strb_appendf(&sb, "KERNEL void take1(GLOBAL_MEM %s *r, ga_size r_off, " + "GLOBAL_MEM const %s *v, ga_size v_off,", gpuarray_get_type(a->typecode)->cluda_name, gpuarray_get_type(v->typecode)->cluda_name); atypes[apos++] = GA_BUFFER; + atypes[apos++] = GA_SIZE; atypes[apos++] = GA_BUFFER; atypes[apos++] = GA_SIZE; for (i = 0; i < v->nd; i++) { @@ -363,12 +364,13 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str, atypes[apos++] = GA_SSIZE; atypes[apos++] = GA_SIZE; } - strb_appendf(&sb, " GLOBAL_MEM const %s *ind, ga_size n0, ga_size n1," - " GLOBAL_MEM int* err) {\n", + strb_appendf(&sb, " GLOBAL_MEM const %s *ind, ga_size i_off, " + "ga_size n0, ga_size n1, GLOBAL_MEM int* err) {\n", gpuarray_get_type(ind->typecode)->cluda_name); atypes[apos++] = GA_BUFFER; 
atypes[apos++] = GA_SIZE; atypes[apos++] = GA_SIZE; + atypes[apos++] = GA_SIZE; atypes[apos++] = GA_BUFFER; assert(apos == nargs); strb_appendf(&sb, " const %s idx0 = LDIM_0 * GID_0 + LID_0;\n" @@ -376,9 +378,14 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str, " const %s idx1 = LDIM_1 * GID_1 + LID_1;\n" " const %s numThreads1 = LDIM_1 * GDIM_1;\n" " %s i0, i1;\n", sz, sz, sz, sz, sz); + strb_appends(&sb, " if (idx0 >= n0 || idx1 >= n1) return;\n"); + strb_appendf(&sb, " r = (GLOBAL_MEM %s *)(((char *)r) + r_off);\n" + " ind = (GLOBAL_MEM %s *)(((char *)ind) + i_off);\n", + gpuarray_get_type(a->typecode)->cluda_name, + gpuarray_get_type(ind->typecode)->cluda_name); strb_appendf(&sb, " for (i0 = idx0; i0 < n0; i0 += numThreads0) {\n" " %s ii0 = ind[i0];\n" - " %s pos0 = off;\n" + " %s pos0 = v_off;\n" " if (ii0 < 0) ii0 += d0;\n" " if ((ii0 < 0) || (ii0 >= d0)) {\n" " *err = -1;\n" @@ -500,13 +507,16 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i, argp = 0; GpuKernel_setarg(&k, argp++, a->data); + GpuKernel_setarg(&k, argp++, (void *)&a->offset); GpuKernel_setarg(&k, argp++, v->data); + /* The cast is to avoid a warning about const */ GpuKernel_setarg(&k, argp++, (void *)&v->offset); for (j = 0; j < v->nd; j++) { GpuKernel_setarg(&k, argp++, &v->strides[j]); GpuKernel_setarg(&k, argp++, &v->dimensions[j]); } GpuKernel_setarg(&k, argp++, i->data); + GpuKernel_setarg(&k, argp++, (void *)&i->offset); GpuKernel_setarg(&k, argp++, &n[0]); GpuKernel_setarg(&k, argp++, &n[1]); GpuKernel_setarg(&k, argp++, errbuf); @@ -1083,11 +1093,15 @@ int GpuArray_fdump(FILE *fd, const GpuArray *a) { case GA_UINT: fprintf(fd, "%u", *(unsigned int *)p); break; + case GA_LONG: + fprintf(fd, "%lld", (long long)*(int64_t *)p); + break; case GA_SSIZE: fprintf(fd, "%" SPREFIX "d", *(ssize_t *)p); break; default: free(buf); + fprintf(fd, "\n", a->typecode); return GA_UNSUPPORTED_ERROR; } s -= gpuarray_get_elsize(a->typecode); From 
28e1ed560e04f7e105e649bd5989775adc42f39f Mon Sep 17 00:00:00 2001 From: khaotik Date: Thu, 24 Nov 2016 09:12:52 -0500 Subject: [PATCH 111/597] API for BLAS dot --- src/gpuarray/blas.h | 6 ++++++ src/gpuarray/buffer_blas.h | 15 +++++++++++++++ src/gpuarray_buffer_blas.c | 15 +++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/src/gpuarray/blas.h b/src/gpuarray/blas.h index d43d07b348..aa7861c4af 100644 --- a/src/gpuarray/blas.h +++ b/src/gpuarray/blas.h @@ -8,6 +8,12 @@ extern "C" { #endif +// only for vector-vector dot +GPUARRAY_PUBLIC int GpuArray_dot( GpuArray *A, GpuArray *B, + GpuArray *C, int nocopy); +#define GpuArray_hdot GpuArray_rdot +#define GpuArray_sdot GpuArray_rdot +#define GpuArray_ddot GpuArray_rdot GPUARRAY_PUBLIC int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, GpuArray *X, double beta, GpuArray *Y, int nocopy); diff --git a/src/gpuarray/buffer_blas.h b/src/gpuarray/buffer_blas.h index 6e36c33f37..f29788a1d8 100644 --- a/src/gpuarray/buffer_blas.h +++ b/src/gpuarray/buffer_blas.h @@ -38,6 +38,21 @@ GPUARRAY_PUBLIC void gpublas_teardown(gpucontext *ctx); GPUARRAY_PUBLIC const char *gpublas_error(gpucontext *ctx); +GPUARRAY_PUBLIC int gpublas_hdot( + size_t N, + gpudata *X, size_t offA, size_t incX, + gpudata *Y, size_t offB, size_t incY); + +GPUARRAY_PUBLIC int gpublas_sdot( + size_t N, + gpudata *X, size_t offA, size_t incX, + gpudata *Y, size_t offB, size_t incY); + +GPUARRAY_PUBLIC int gpublas_ddot( + size_t N, + gpudata *X, size_t offA, size_t incX, + gpudata *Y, size_t offB, size_t incY); + GPUARRAY_PUBLIC int gpublas_hgemv( cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c index 417027e850..20371093bb 100644 --- a/src/gpuarray_buffer_blas.c +++ b/src/gpuarray_buffer_blas.c @@ -19,6 +19,21 @@ const char *gpublas_error(gpucontext *ctx) { return "No blas ops 
available, API error."; } +int gpublas_hdot( + size_t N, + gpudata *X, size_t offA, size_t incX, + gpudata *Y, size_t offB, size_t incY); + +int gpublas_sdot( + size_t N, + gpudata *X, size_t offA, size_t incX, + gpudata *Y, size_t offB, size_t incY); + +int gpublas_ddot( + size_t N, + gpudata *X, size_t offA, size_t incX, + gpudata *Y, size_t offB, size_t incY); + int gpublas_hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, From 136825b561bf94e5b08417bf801f9199c2bbd254 Mon Sep 17 00:00:00 2001 From: khaotik Date: Fri, 25 Nov 2016 04:02:47 -0500 Subject: [PATCH 112/597] Finish BLAS dot for implementation for CUDA Plus some minor changes: - did `chmod +x setup.py` - added interface for clblas --- setup.py | 0 src/gpuarray/blas.h | 4 +- src/gpuarray/buffer_blas.h | 15 +++-- src/gpuarray_array_blas.c | 91 +++++++++++++++++++++++++++++- src/gpuarray_blas_cuda_cublas.c | 94 +++++++++++++++++++++++++++++++ src/gpuarray_blas_opencl_clblas.c | 27 +++++++++ src/gpuarray_buffer_blas.c | 24 ++++++-- src/loaders/libclblas.fn | 6 +- src/loaders/libcublas.fn | 4 ++ src/private.h | 13 +++++ 10 files changed, 261 insertions(+), 17 deletions(-) mode change 100644 => 100755 setup.py diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 diff --git a/src/gpuarray/blas.h b/src/gpuarray/blas.h index aa7861c4af..a8dd8096bc 100644 --- a/src/gpuarray/blas.h +++ b/src/gpuarray/blas.h @@ -9,8 +9,8 @@ extern "C" { #endif // only for vector-vector dot -GPUARRAY_PUBLIC int GpuArray_dot( GpuArray *A, GpuArray *B, - GpuArray *C, int nocopy); +GPUARRAY_PUBLIC int GpuArray_rdot( GpuArray *X, GpuArray *Y, + GpuArray *Z, int nocopy); #define GpuArray_hdot GpuArray_rdot #define GpuArray_sdot GpuArray_rdot #define GpuArray_ddot GpuArray_rdot diff --git a/src/gpuarray/buffer_blas.h b/src/gpuarray/buffer_blas.h index f29788a1d8..56d1d4d2da 100644 --- a/src/gpuarray/buffer_blas.h +++ b/src/gpuarray/buffer_blas.h @@ -40,18 +40,21 @@ 
GPUARRAY_PUBLIC const char *gpublas_error(gpucontext *ctx); GPUARRAY_PUBLIC int gpublas_hdot( size_t N, - gpudata *X, size_t offA, size_t incX, - gpudata *Y, size_t offB, size_t incY); + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z); GPUARRAY_PUBLIC int gpublas_sdot( size_t N, - gpudata *X, size_t offA, size_t incX, - gpudata *Y, size_t offB, size_t incY); + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z); GPUARRAY_PUBLIC int gpublas_ddot( size_t N, - gpudata *X, size_t offA, size_t incX, - gpudata *Y, size_t offB, size_t incY); + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z); GPUARRAY_PUBLIC int gpublas_hgemv( cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index 9fb6216054..74cfa858af 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -5,6 +5,94 @@ #include "gpuarray/util.h" #include "gpuarray/error.h" +int GpuArray_rdot( GpuArray *X, GpuArray *Y, + GpuArray *Z, int nocopy) { + GpuArray *Xp = X; + GpuArray copyX; + GpuArray *Yp = Y; + GpuArray copyY; + GpuArray *Zp = Z; + void *ctx; + size_t elsize; + size_t n; + int err; + + if (X->typecode != GA_HALF && + X->typecode != GA_FLOAT && + X->typecode != GA_DOUBLE) + return GA_INVALID_ERROR; + + if (X->nd != 1 || X->nd != 1 || Y->nd != 0 || + X->typecode != Y->typecode || X->typecode != Z->typecode) + return GA_VALUE_ERROR; + if (!(X->flags & GA_ALIGNED) || !(Y->flags & GA_ALIGNED) || + !(Z->flags & GA_ALIGNED)) + return GA_UNALIGNED_ERROR; + if (X->dimensions[0] != n || Y->dimensions[0] != n) + return GA_VALUE_ERROR; + + elsize = gpuarray_get_elsize(X->typecode); + if (X->strides[0] < 0) { + if (nocopy) + return GA_COPY_ERROR; + else { + err = GpuArray_copy(©X, X, GA_ANY_ORDER); + if (err != GA_NO_ERROR) + goto cleanup; + Xp = ©X; + } + } + if (Y->strides[0] < 0) { + if 
(nocopy) + return GA_COPY_ERROR; + else { + err = GpuArray_copy(©Y, Y, GA_ANY_ORDER); + if (err != GA_NO_ERROR) + goto cleanup; + Yp = ©Y; + } + } + if (Z->strides[0] < 0) { + err = GA_VALUE_ERROR; + goto cleanup; + } + + ctx = gpudata_context(Xp->data); + err = gpublas_setup(ctx); + if (err != GA_NO_ERROR) + goto cleanup; + + switch (Xp->typecode) { + case GA_HALF: + err = gpublas_hdot( + n, + Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, + Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, + Zp->data); + break; + case GA_FLOAT: + err = gpublas_sdot( + n, + Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, + Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, + Zp->data); + break; + case GA_DOUBLE: + err = gpublas_sdot( + n, + Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, + Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, + Zp->data); + break; + } + cleanup: + if (Xp == ©X) + GpuArray_clear(©X); + if (Yp == ©Y) + GpuArray_clear(©Y); + return err; +} + int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, GpuArray *X, double beta, GpuArray *Y, int nocopy) { GpuArray *Ap = A; @@ -24,8 +112,7 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, return GA_INVALID_ERROR; if (A->nd != 2 || X->nd != 1 || Y->nd != 1 || - A->typecode != A->typecode || X->typecode != A->typecode || - Y->typecode != A->typecode) + X->typecode != A->typecode || Y->typecode != A->typecode) return GA_VALUE_ERROR; if (!(A->flags & GA_ALIGNED) || !(X->flags & GA_ALIGNED) || diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 03490a8c58..5f4b913b63 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -786,6 +786,97 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, return GA_NO_ERROR; } +static int hdot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z + ) { + return 
GA_DEVSUP_ERROR; +} + +static int sdot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z) { + cuda_context *ctx = X->ctx; + blas_handle *h = (blas_handle *)ctx->blas_handle; + cublasPointerMode_t pmode; + + ASSERT_BUF(X); + ASSERT_BUF(Y); + ASSERT_BUF(Z); + + if (LARGE_VAL(N)) return GA_XLARGE_ERROR; + + cuda_enter(ctx); + + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Z, CUDA_WAIT_ALL)); + + // we should store dot result on device + cublasGetPointerMode(h->h, &pmode); + cublasSetPointerMode(h->h, CUBLAS_POINTER_MODE_HOST); + h->err = cublasSdot( + h->h, N, + ((float*)X->ptr) + offX, incX, + ((float*)Y->ptr) + offY, incY, + ((float*)Z->ptr) + ); + cublasSetPointerMode(h->h, pmode); + + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Z, CUDA_WAIT_ALL)); + + cuda_exit(ctx); + + return GA_NO_ERROR; +} + +static int ddot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z) { + cuda_context *ctx = X->ctx; + blas_handle *h = (blas_handle *)ctx->blas_handle; + cublasPointerMode_t pmode; + + ASSERT_BUF(X); + ASSERT_BUF(Y); + ASSERT_BUF(Z); + + if (LARGE_VAL(N)) return GA_XLARGE_ERROR; + + cuda_enter(ctx); + + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Z, CUDA_WAIT_ALL)); + + // we should store dot result on device + cublasGetPointerMode(h->h, &pmode); + cublasSetPointerMode(h->h, CUBLAS_POINTER_MODE_HOST); + h->err = cublasDdot( + h->h, N, + ((double*)X->ptr) + offX, incX, + ((double*)Y->ptr) + offY, incY, + ((double*)Z->ptr) + ); + cublasSetPointerMode(h->h, pmode); + + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); 
+ GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Z, CUDA_WAIT_ALL)); + + cuda_exit(ctx); + + return GA_NO_ERROR; +} + static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, @@ -1558,6 +1649,9 @@ GPUARRAY_LOCAL gpuarray_blas_ops cublas_ops = { setup, teardown, error, + hdot, /* TODO */ + sdot, + ddot, hgemv, /* TODO */ sgemv, dgemv, diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 837a74af9b..5ecb982af6 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -194,6 +194,30 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, return GA_DEVSUP_ERROR; } +static int hdot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z) { + return GA_DEVSUP_ERROR; +} + +static int sdot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z) { + return GA_DEVSUP_ERROR; +} + +static int ddot( + size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z) { + return GA_DEVSUP_ERROR; +} + static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, @@ -400,6 +424,9 @@ GPUARRAY_LOCAL gpuarray_blas_ops clblas_ops = { setup, teardown, error, + hdot, /* TODO */ + sdot, + ddot, hgemv, /* TODO */ sgemv, dgemv, diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c index 20371093bb..08ecadf984 100644 --- a/src/gpuarray_buffer_blas.c +++ b/src/gpuarray_buffer_blas.c @@ -21,18 +21,30 @@ const char *gpublas_error(gpucontext *ctx) { int gpublas_hdot( size_t N, - gpudata *X, size_t offA, size_t incX, - gpudata *Y, size_t offB, size_t incY); + gpudata *X, size_t offX, size_t incX, + 
gpudata *Y, size_t offY, size_t incY, + gpudata *Z) { + return gpudata_context(X)->blas_ops->hdot( + N, X, offX, incX, Y, offY, incY, Z); +} int gpublas_sdot( size_t N, - gpudata *X, size_t offA, size_t incX, - gpudata *Y, size_t offB, size_t incY); + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z) { + return gpudata_context(X)->blas_ops->sdot( + N, X, offX, incX, Y, offY, incY, Z); +} int gpublas_ddot( size_t N, - gpudata *X, size_t offA, size_t incX, - gpudata *Y, size_t offB, size_t incY); + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z) { + return gpudata_context(X)->blas_ops->ddot( + N, X, offX, incX, Y, offY, incY, Z); +} int gpublas_hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, diff --git a/src/loaders/libclblas.fn b/src/loaders/libclblas.fn index 2ab7f5b2af..6a4c0ed2ba 100644 --- a/src/loaders/libclblas.fn +++ b/src/loaders/libclblas.fn @@ -1,8 +1,12 @@ DEF_PROC(clblasStatus, clblasSetup, (void)); DEF_PROC(void, clblasTeardown, (void)); + + DEF_PROC(clblasStatus, clblasSgemm, (clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasDgemm, (clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasSgemv, (clblasOrder order, clblasTranspose 
transA, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasDgemv, (clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasSger, (clblasOrder order, size_t M, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); -DEF_PROC(clblasStatus, clblasDger, (clblasOrder order, size_t M, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); \ No newline at end of file +DEF_PROC(clblasStatus, clblasDger, (clblasOrder order, size_t M, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); +DEF_PROC(clblasStatus, clblasSdot, (size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, 
cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); +DEF_PROC(clblasStatus, clblasDdot, (size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); diff --git a/src/loaders/libcublas.fn b/src/loaders/libcublas.fn index 04b0290800..6af6589cc9 100644 --- a/src/loaders/libcublas.fn +++ b/src/loaders/libcublas.fn @@ -3,9 +3,13 @@ DEF_PROC_V2(cublasDestroy, (cublasHandle_t handle)); DEF_PROC_V2(cublasSetStream, (cublasHandle_t handle, cudaStream_t streamId)); DEF_PROC_V2(cublasSetPointerMode, (cublasHandle_t handle, cublasPointerMode_t mode)); +DEF_PROC_V2(cublasGetPointerMode, (cublasHandle_t handle, cublasPointerMode_t* mode)); DEF_PROC(cublasSetAtomicsMode, (cublasHandle_t handle, cublasAtomicsMode_t mode)); +DEF_PROC_V2(cublasSdot, (cublasHandle_t handle, int n, const float *x, int incx, const float *y, int incy, float *result)); +DEF_PROC_V2(cublasDdot, (cublasHandle_t handle, int n, const double *x, int incx, const double *y, int incy, double *result)); + DEF_PROC_V2(cublasSgemm, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, const float *B, int ldb, const float *beta, float *C, int ldc)); DEF_PROC_V2(cublasDgemm, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double *alpha, const double *A, int lda, const double *B, int ldb, const double *beta, double *C, int ldc)); diff --git a/src/private.h b/src/private.h index c0e0d7c5d1..3ae8a270ce 100644 --- a/src/private.h +++ b/src/private.h @@ -113,6 +113,19 @@ struct _gpuarray_blas_ops { int (*setup)(gpucontext *ctx); void (*teardown)(gpucontext *ctx); const char *(*error)(gpucontext *ctx); + + int (*hdot)( size_t N, + gpudata 
*X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z); + int (*sdot)( size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z); + int (*ddot)( size_t N, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z); int (*hgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, From 94b600c5db96dfc3d21c842464b194623e9af208 Mon Sep 17 00:00:00 2001 From: khaotik Date: Thu, 24 Nov 2016 09:12:52 -0500 Subject: [PATCH 113/597] API for BLAS dot --- src/gpuarray/buffer_blas.h | 6 +++--- src/gpuarray_buffer_blas.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/gpuarray/buffer_blas.h b/src/gpuarray/buffer_blas.h index 56d1d4d2da..859ede62b7 100644 --- a/src/gpuarray/buffer_blas.h +++ b/src/gpuarray/buffer_blas.h @@ -42,19 +42,19 @@ GPUARRAY_PUBLIC int gpublas_hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z); + gpudata *Z, size_t offZ); GPUARRAY_PUBLIC int gpublas_sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z); + gpudata *Z, size_t offZ); GPUARRAY_PUBLIC int gpublas_ddot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z); + gpudata *Z, size_t offZ); GPUARRAY_PUBLIC int gpublas_hgemv( cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c index 08ecadf984..f3447dd2cd 100644 --- a/src/gpuarray_buffer_blas.c +++ b/src/gpuarray_buffer_blas.c @@ -23,18 +23,18 @@ int gpublas_hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z) { + gpudata *Z, size_t offZ) { return gpudata_context(X)->blas_ops->hdot( - N, X, offX, incX, Y, offY, incY, Z); + 
N, X, offX, incX, Y, offY, incY, Z, offZ); } int gpublas_sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z) { + gpudata *Z, size_t offZ) { return gpudata_context(X)->blas_ops->sdot( - N, X, offX, incX, Y, offY, incY, Z); + N, X, offX, incX, Y, offY, incY, Z, offZ); } int gpublas_ddot( @@ -43,7 +43,7 @@ int gpublas_ddot( gpudata *Y, size_t offY, size_t incY, gpudata *Z) { return gpudata_context(X)->blas_ops->ddot( - N, X, offX, incX, Y, offY, incY, Z); + N, X, offX, incX, Y, offY, incY, Z, offZ); } int gpublas_hgemv(cb_order order, cb_transpose transA, From c0e666371c94719834e12a7c52ddc3d0b5166b11 Mon Sep 17 00:00:00 2001 From: khaotik Date: Fri, 25 Nov 2016 04:02:47 -0500 Subject: [PATCH 114/597] Finish BLAS dot for implementation for CUDA Plus some minor changes: - did `chmod +x setup.py` - added interface for clblas --- src/gpuarray/buffer_blas.h | 12 ++++++++++++ src/gpuarray_buffer_blas.c | 16 ++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/src/gpuarray/buffer_blas.h b/src/gpuarray/buffer_blas.h index 859ede62b7..9774ff4d17 100644 --- a/src/gpuarray/buffer_blas.h +++ b/src/gpuarray/buffer_blas.h @@ -42,19 +42,31 @@ GPUARRAY_PUBLIC int gpublas_hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, +<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ); +======= + gpudata *Z); +>>>>>>> Finish BLAS dot for implementation for CUDA GPUARRAY_PUBLIC int gpublas_sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, +<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ); +======= + gpudata *Z); +>>>>>>> Finish BLAS dot for implementation for CUDA GPUARRAY_PUBLIC int gpublas_ddot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, +<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ); +======= + gpudata *Z); +>>>>>>> Finish BLAS dot 
for implementation for CUDA GPUARRAY_PUBLIC int gpublas_hgemv( cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c index f3447dd2cd..383a4a365d 100644 --- a/src/gpuarray_buffer_blas.c +++ b/src/gpuarray_buffer_blas.c @@ -23,18 +23,30 @@ int gpublas_hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, +<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ) { return gpudata_context(X)->blas_ops->hdot( N, X, offX, incX, Y, offY, incY, Z, offZ); +======= + gpudata *Z) { + return gpudata_context(X)->blas_ops->hdot( + N, X, offX, incX, Y, offY, incY, Z); +>>>>>>> Finish BLAS dot for implementation for CUDA } int gpublas_sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, +<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ) { return gpudata_context(X)->blas_ops->sdot( N, X, offX, incX, Y, offY, incY, Z, offZ); +======= + gpudata *Z) { + return gpudata_context(X)->blas_ops->sdot( + N, X, offX, incX, Y, offY, incY, Z); +>>>>>>> Finish BLAS dot for implementation for CUDA } int gpublas_ddot( @@ -43,7 +55,11 @@ int gpublas_ddot( gpudata *Y, size_t offY, size_t incY, gpudata *Z) { return gpudata_context(X)->blas_ops->ddot( +<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 N, X, offX, incX, Y, offY, incY, Z, offZ); +======= + N, X, offX, incX, Y, offY, incY, Z); +>>>>>>> Finish BLAS dot for implementation for CUDA } int gpublas_hgemv(cb_order order, cb_transpose transA, From 81e030c3ebd8b508404953b28e538027a0c41094 Mon Sep 17 00:00:00 2001 From: khaotik Date: Fri, 25 Nov 2016 07:48:28 -0500 Subject: [PATCH 115/597] fix/cleanup --- pygpu/tests/test_blas.py | 4 ++++ src/gpuarray/buffer_blas.h | 12 ------------ src/gpuarray_array_blas.c | 19 ++++++++----------- src/gpuarray_blas_cuda_cublas.c | 18 ++++++++---------- src/gpuarray_buffer_blas.c | 18 +++++++++++++++++- 5 files 
changed, 37 insertions(+), 34 deletions(-) diff --git a/pygpu/tests/test_blas.py b/pygpu/tests/test_blas.py index 119ef8e959..532e4c8fc3 100644 --- a/pygpu/tests/test_blas.py +++ b/pygpu/tests/test_blas.py @@ -14,6 +14,10 @@ import pygpu.blas as gblas +def test_dot(): + # TODO [WIP] + raise NotImplementedError() + def test_gemv(): for shape in [(100, 128), (128, 50)]: for order in ['f', 'c']: diff --git a/src/gpuarray/buffer_blas.h b/src/gpuarray/buffer_blas.h index 9774ff4d17..859ede62b7 100644 --- a/src/gpuarray/buffer_blas.h +++ b/src/gpuarray/buffer_blas.h @@ -42,31 +42,19 @@ GPUARRAY_PUBLIC int gpublas_hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, -<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ); -======= - gpudata *Z); ->>>>>>> Finish BLAS dot for implementation for CUDA GPUARRAY_PUBLIC int gpublas_sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, -<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ); -======= - gpudata *Z); ->>>>>>> Finish BLAS dot for implementation for CUDA GPUARRAY_PUBLIC int gpublas_ddot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, -<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ); -======= - gpudata *Z); ->>>>>>> Finish BLAS dot for implementation for CUDA GPUARRAY_PUBLIC int gpublas_hgemv( cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index 74cfa858af..8f9fb5919b 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -12,9 +12,9 @@ int GpuArray_rdot( GpuArray *X, GpuArray *Y, GpuArray *Yp = Y; GpuArray copyY; GpuArray *Zp = Z; + size_t n; void *ctx; size_t elsize; - size_t n; int err; if (X->typecode != GA_HALF && @@ -22,13 +22,14 @@ int GpuArray_rdot( GpuArray *X, GpuArray *Y, X->typecode != GA_DOUBLE) return GA_INVALID_ERROR; - 
if (X->nd != 1 || X->nd != 1 || Y->nd != 0 || + if (X->nd != 1 || Y->nd != 1 || Z->nd != 0 || X->typecode != Y->typecode || X->typecode != Z->typecode) return GA_VALUE_ERROR; + n = X->dimensions[0]; if (!(X->flags & GA_ALIGNED) || !(Y->flags & GA_ALIGNED) || !(Z->flags & GA_ALIGNED)) return GA_UNALIGNED_ERROR; - if (X->dimensions[0] != n || Y->dimensions[0] != n) + if (X->dimensions[0] != Y->dimensions[0]) return GA_VALUE_ERROR; elsize = gpuarray_get_elsize(X->typecode); @@ -52,10 +53,6 @@ int GpuArray_rdot( GpuArray *X, GpuArray *Y, Yp = ©Y; } } - if (Z->strides[0] < 0) { - err = GA_VALUE_ERROR; - goto cleanup; - } ctx = gpudata_context(Xp->data); err = gpublas_setup(ctx); @@ -68,21 +65,21 @@ int GpuArray_rdot( GpuArray *X, GpuArray *Y, n, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, - Zp->data); + Zp->data, Zp->offset / elsize); break; case GA_FLOAT: err = gpublas_sdot( n, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, - Zp->data); + Zp->data, Zp->offset / elsize); break; case GA_DOUBLE: - err = gpublas_sdot( + err = gpublas_ddot( n, Xp->data, Xp->offset / elsize, Xp->strides[0] / elsize, Yp->data, Yp->offset / elsize, Yp->strides[0] / elsize, - Zp->data); + Zp->data, Zp->offset / elsize); break; } cleanup: diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 5f4b913b63..1897a492dc 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -814,22 +814,21 @@ static int sdot( GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ)); - GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Z, CUDA_WAIT_ALL)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Z, CUDA_WAIT_WRITE)); // we should store dot result on device cublasGetPointerMode(h->h, &pmode); - cublasSetPointerMode(h->h, CUBLAS_POINTER_MODE_HOST); + cublasSetPointerMode(h->h, 
CUBLAS_POINTER_MODE_DEVICE); h->err = cublasSdot( h->h, N, ((float*)X->ptr) + offX, incX, ((float*)Y->ptr) + offY, incY, - ((float*)Z->ptr) - ); + ((float*)Z->ptr) + offZ); cublasSetPointerMode(h->h, pmode); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ)); - GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Z, CUDA_WAIT_ALL)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Z, CUDA_WAIT_WRITE)); cuda_exit(ctx); @@ -855,22 +854,21 @@ static int ddot( GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ)); - GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Z, CUDA_WAIT_ALL)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Z, CUDA_WAIT_WRITE)); // we should store dot result on device cublasGetPointerMode(h->h, &pmode); - cublasSetPointerMode(h->h, CUBLAS_POINTER_MODE_HOST); + cublasSetPointerMode(h->h, CUBLAS_POINTER_MODE_DEVICE); h->err = cublasDdot( h->h, N, ((double*)X->ptr) + offX, incX, ((double*)Y->ptr) + offY, incY, - ((double*)Z->ptr) - ); + ((double*)Z->ptr) + offZ); cublasSetPointerMode(h->h, pmode); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ)); - GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Z, CUDA_WAIT_ALL)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Z, CUDA_WAIT_WRITE)); cuda_exit(ctx); diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c index 383a4a365d..08f6f5fd1e 100644 --- a/src/gpuarray_buffer_blas.c +++ b/src/gpuarray_buffer_blas.c @@ -23,6 +23,7 @@ int gpublas_hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, +<<<<<<< c0e666371c94719834e12a7c52ddc3d0b5166b11 <<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ) { return gpudata_context(X)->blas_ops->hdot( @@ -32,12 +33,18 @@ int gpublas_hdot( return gpudata_context(X)->blas_ops->hdot( N, X, offX, incX, Y, offY, incY, Z); >>>>>>> Finish BLAS dot for implementation 
for CUDA +======= + gpudata *Z, size_t offZ) { + return gpudata_context(X)->blas_ops->hdot( + N, X, offX, incX, Y, offY, incY, Z, offZ); +>>>>>>> fix/cleanup } int gpublas_sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, +<<<<<<< c0e666371c94719834e12a7c52ddc3d0b5166b11 <<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ) { return gpudata_context(X)->blas_ops->sdot( @@ -47,19 +54,28 @@ int gpublas_sdot( return gpudata_context(X)->blas_ops->sdot( N, X, offX, incX, Y, offY, incY, Z); >>>>>>> Finish BLAS dot for implementation for CUDA +======= + gpudata *Z, size_t offZ) { + return gpudata_context(X)->blas_ops->sdot( + N, X, offX, incX, Y, offY, incY, Z, offZ); +>>>>>>> fix/cleanup } int gpublas_ddot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z) { + gpudata *Z, size_t offZ) { return gpudata_context(X)->blas_ops->ddot( +<<<<<<< c0e666371c94719834e12a7c52ddc3d0b5166b11 <<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 N, X, offX, incX, Y, offY, incY, Z, offZ); ======= N, X, offX, incX, Y, offY, incY, Z); >>>>>>> Finish BLAS dot for implementation for CUDA +======= + N, X, offX, incX, Y, offY, incY, Z, offZ); +>>>>>>> fix/cleanup } int gpublas_hgemv(cb_order order, cb_transpose transA, From 556ced0999c660407e4173ef4530cffe8dc06f2d Mon Sep 17 00:00:00 2001 From: khaotik Date: Fri, 25 Nov 2016 11:34:43 -0500 Subject: [PATCH 116/597] fixed/more pygpu interface --- pygpu/blas.pyx | 8 ++++++++ src/gpuarray_blas_cuda_cublas.c | 7 +++---- src/gpuarray_blas_opencl_clblas.c | 6 +++--- src/loaders/libclblas.fn | 8 ++++---- src/private.h | 18 +++++++++--------- 5 files changed, 27 insertions(+), 20 deletions(-) diff --git a/pygpu/blas.pyx b/pygpu/blas.pyx index f83322d0a0..7b27d63350 100644 --- a/pygpu/blas.pyx +++ b/pygpu/blas.pyx @@ -10,6 +10,7 @@ cdef extern from "gpuarray/buffer_blas.h": cb_conj_trans cdef extern from "gpuarray/blas.h": + int 
GpuArray_rdot(_GpuArray *X, _GpuArray *Y, _GpuArray *Z, int nocopy) int GpuArray_rgemv(cb_transpose transA, double alpha, _GpuArray *A, _GpuArray *X, double beta, _GpuArray *Y, int nocopy) int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, @@ -18,6 +19,13 @@ cdef extern from "gpuarray/blas.h": int GpuArray_rger(double alpha, _GpuArray *X, _GpuArray *Y, _GpuArray *A, int nocopy) +cdef api int pygpu_blas_rdot(GpuArray X, GpuArray Y, GpuArray Z, bint nocopy) except -1: + cdef int err + err = GpuArray_rdot(&X.ga, &Y.ga, &Z.ga, nocopy) + if err != GA_NO_ERROR: + raise GpuArrayException(Gpurray_error(&X.ga, err), err) + return 0 + cdef api int pygpu_blas_rgemv(cb_transpose transA, double alpha, GpuArray A, GpuArray X, double beta, GpuArray Y, bint nocopy) except -1: diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 1897a492dc..39cba704f5 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -790,8 +790,7 @@ static int hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z - ) { + gpudata *Z, size_t offZ) { return GA_DEVSUP_ERROR; } @@ -799,7 +798,7 @@ static int sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z) { + gpudata *Z, size_t offZ) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; cublasPointerMode_t pmode; @@ -839,7 +838,7 @@ static int ddot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z) { + gpudata *Z, size_t offZ) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; cublasPointerMode_t pmode; diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 5ecb982af6..d2f5e3302f 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -198,7 +198,7 @@ static int hdot( size_t N, gpudata *X, size_t offX, size_t incX, 
gpudata *Y, size_t offY, size_t incY, - gpudata *Z) { + gpudata *Z, size_t offZ) { return GA_DEVSUP_ERROR; } @@ -206,7 +206,7 @@ static int sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z) { + gpudata *Z, size_t offZ) { return GA_DEVSUP_ERROR; } @@ -214,7 +214,7 @@ static int ddot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, - gpudata *Z) { + gpudata *Z, size_t offZ) { return GA_DEVSUP_ERROR; } diff --git a/src/loaders/libclblas.fn b/src/loaders/libclblas.fn index 6a4c0ed2ba..f56a2a1393 100644 --- a/src/loaders/libclblas.fn +++ b/src/loaders/libclblas.fn @@ -2,11 +2,11 @@ DEF_PROC(clblasStatus, clblasSetup, (void)); DEF_PROC(void, clblasTeardown, (void)); -DEF_PROC(clblasStatus, clblasSgemm, (clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); -DEF_PROC(clblasStatus, clblasDgemm, (clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); +DEF_PROC(clblasStatus, clblasSdot, (size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); +DEF_PROC(clblasStatus, clblasDdot, (size_t N, cl_mem dotProduct, size_t 
offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasSgemv, (clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasDgemv, (clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); +DEF_PROC(clblasStatus, clblasSgemm, (clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); +DEF_PROC(clblasStatus, clblasDgemm, (clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasSger, (clblasOrder order, size_t M, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const 
cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); DEF_PROC(clblasStatus, clblasDger, (clblasOrder order, size_t M, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); -DEF_PROC(clblasStatus, clblasSdot, (size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); -DEF_PROC(clblasStatus, clblasDdot, (size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)); diff --git a/src/private.h b/src/private.h index 3ae8a270ce..57d919be88 100644 --- a/src/private.h +++ b/src/private.h @@ -115,17 +115,17 @@ struct _gpuarray_blas_ops { const char *(*error)(gpucontext *ctx); int (*hdot)( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, - gpudata *Z); + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ); int (*sdot)( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, - gpudata *Z); + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ); int (*ddot)( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, - gpudata *Z); + gpudata *X, size_t offX, size_t 
incX, + gpudata *Y, size_t offY, size_t incY, + gpudata *Z, size_t offZ); int (*hgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, From 6b8518d5eac559364ab91fea5362212b7c8274c6 Mon Sep 17 00:00:00 2001 From: khaotik Date: Sat, 26 Nov 2016 05:28:40 -0500 Subject: [PATCH 117/597] get rid of conflict --- src/gpuarray_buffer_blas.c | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c index 08f6f5fd1e..c73f3c2f19 100644 --- a/src/gpuarray_buffer_blas.c +++ b/src/gpuarray_buffer_blas.c @@ -23,42 +23,18 @@ int gpublas_hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, -<<<<<<< c0e666371c94719834e12a7c52ddc3d0b5166b11 -<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ) { return gpudata_context(X)->blas_ops->hdot( N, X, offX, incX, Y, offY, incY, Z, offZ); -======= - gpudata *Z) { - return gpudata_context(X)->blas_ops->hdot( - N, X, offX, incX, Y, offY, incY, Z); ->>>>>>> Finish BLAS dot for implementation for CUDA -======= - gpudata *Z, size_t offZ) { - return gpudata_context(X)->blas_ops->hdot( - N, X, offX, incX, Y, offY, incY, Z, offZ); ->>>>>>> fix/cleanup } int gpublas_sdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, -<<<<<<< c0e666371c94719834e12a7c52ddc3d0b5166b11 -<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 gpudata *Z, size_t offZ) { return gpudata_context(X)->blas_ops->sdot( N, X, offX, incX, Y, offY, incY, Z, offZ); -======= - gpudata *Z) { - return gpudata_context(X)->blas_ops->sdot( - N, X, offX, incX, Y, offY, incY, Z); ->>>>>>> Finish BLAS dot for implementation for CUDA -======= - gpudata *Z, size_t offZ) { - return gpudata_context(X)->blas_ops->sdot( - N, X, offX, incX, Y, offY, incY, Z, offZ); ->>>>>>> fix/cleanup } int gpublas_ddot( @@ -67,15 +43,7 @@ 
int gpublas_ddot( gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { return gpudata_context(X)->blas_ops->ddot( -<<<<<<< c0e666371c94719834e12a7c52ddc3d0b5166b11 -<<<<<<< 94b600c5db96dfc3d21c842464b194623e9af208 - N, X, offX, incX, Y, offY, incY, Z, offZ); -======= - N, X, offX, incX, Y, offY, incY, Z); ->>>>>>> Finish BLAS dot for implementation for CUDA -======= N, X, offX, incX, Y, offY, incY, Z, offZ); ->>>>>>> fix/cleanup } int gpublas_hgemv(cb_order order, cb_transpose transA, From b5b20d74cd2336263629a73f02f941d8c4ce75b3 Mon Sep 17 00:00:00 2001 From: khaotik Date: Sat, 26 Nov 2016 05:43:07 -0500 Subject: [PATCH 118/597] make all inc* arguments as type int --- src/gpuarray_blas_cuda_cublas.c | 60 +++++++++++++++--------------- src/gpuarray_blas_opencl_clblas.c | 36 +++++++++--------- src/gpuarray_blas_opencl_clblast.c | 54 +++++++++++++++++++++------ src/private.h | 36 +++++++++--------- 4 files changed, 108 insertions(+), 78 deletions(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 39cba704f5..41e6e372c0 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -39,8 +39,8 @@ typedef struct _blas_handle { static const char *code_sgemvBH_N_a1_b1_small = \ "extern \"C\"__global__ void sgemv(const float *A[], size_t lda, " \ - " const float *x[], size_t incx, " \ - " float *y[], size_t incy, " \ + " const float *x[], int incx, " \ + " float *y[], int incy, " \ " size_t b, size_t m, size_t n) {" \ " for (size_t p = blockIdx.y * blockDim.y + threadIdx.y; p < b;" \ " p += gridDim.y * blockDim.y) {" \ @@ -62,8 +62,8 @@ static const char *code_sgemvBH_N_a1_b1_small = \ static const char *code_sgemvBH_T_a1_b1_small = \ "extern \"C\" __global__ void sgemv(const float *A[], size_t lda, " \ - " const float *x[], size_t incx, " \ - " float *y[], size_t incy, " \ + " const float *x[], int incx, " \ + " float *y[], int incy, " \ " size_t b, size_t m, size_t n) {" \ " size_t i = blockIdx.x * 
blockDim.x + threadIdx.x;" \ " size_t p = blockIdx.y * blockDim.y + threadIdx.y;" \ @@ -95,8 +95,8 @@ static const char *atomicadd_double = \ static const char *code_dgemvBH_N_a1_b1_small = \ "extern \"C\" __global__ void dgemv(const double *A[], size_t lda, " \ - " const double *x[], size_t incx, " \ - " double *y[], size_t incy, " \ + " const double *x[], int incx, " \ + " double *y[], int incy, " \ " size_t b, size_t m, size_t n) {" \ " for (size_t p = blockIdx.y * blockDim.y + threadIdx.y; p < b;" \ " p += gridDim.y * blockDim.y) {" \ @@ -118,8 +118,8 @@ static const char *code_dgemvBH_N_a1_b1_small = \ static const char *code_dgemvBH_T_a1_b1_small = \ "extern \"C\" __global__ void dgemv(const double *A[], size_t lda, " \ - " const double *x[], size_t incx, " \ - " double *y[], size_t incy, " \ + " const double *x[], int incx, " \ + " double *y[], int incy, " \ " size_t b, size_t m, size_t n) {" \ " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ " size_t p = blockIdx.y * blockDim.y + threadIdx.y;" \ @@ -137,8 +137,8 @@ static const char *code_dgemvBH_T_a1_b1_small = \ static const char *code_sgerBH_gen_small = \ "extern \"C\" __global__ void _sgerBH_gen_small(" \ - " const float *x[], size_t incx," \ - " const float *y[], size_t incy," \ + " const float *x[], int incx," \ + " const float *y[], int incy," \ " float alpha, float *A[], size_t lda," \ " size_t b, size_t m, size_t n) {" \ " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ @@ -152,8 +152,8 @@ static const char *code_sgerBH_gen_small = \ static const char *code_dgerBH_gen_small = \ "extern \"C\" __global__ void _dgerBH_gen_small(" \ - " const double *x[], size_t incx, " \ - " const double *y[], size_t incy," \ + " const double *x[], int incx, " \ + " const double *y[], int incy," \ " double alpha, double *A[], size_t lda," \ " size_t b, size_t m, size_t n) {" \ " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ @@ -788,16 +788,16 @@ static int dgemmBatch(cb_order order, cb_transpose 
transA, cb_transpose transB, static int hdot( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ) { return GA_DEVSUP_ERROR; } static int sdot( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; @@ -836,8 +836,8 @@ static int sdot( static int ddot( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; @@ -994,8 +994,8 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, static int hgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - float beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + float beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } @@ -1003,8 +1003,8 @@ static int hgemvBatch(cb_order order, cb_transpose transA, static int sgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - float beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + float beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags) { /* Flags is there for possible future implementations where we might not use atomics or have some alternate implemntation. 
*/ @@ -1129,8 +1129,8 @@ static int sgemvBatch(cb_order order, cb_transpose transA, static int dgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - double beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + double beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags) { cuda_context *ctx; size_t t, i; @@ -1371,16 +1371,16 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, } static int hgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { cuda_context *ctx; @@ -1511,8 +1511,8 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, } static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { cuda_context *ctx; diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index d2f5e3302f..91f31d0728 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -146,8 +146,8 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, static int hgemvBatch(cb_order order, cb_transpose transA, 
size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - float beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + float beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } @@ -155,8 +155,8 @@ static int hgemvBatch(cb_order order, cb_transpose transA, static int sgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - float beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + float beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } @@ -164,31 +164,31 @@ static int sgemvBatch(cb_order order, cb_transpose transA, static int dgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - double beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + double beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int hgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, - gpudata **x, size_t *offX, size_t incX, 
- gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; @@ -196,24 +196,24 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, static int hdot( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ) { return GA_DEVSUP_ERROR; } static int sdot( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ) { return GA_DEVSUP_ERROR; } static int ddot( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ) { return GA_DEVSUP_ERROR; } diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c index 28b164799f..19f7fc55b7 100644 --- a/src/gpuarray_blas_opencl_clblast.c +++ b/src/gpuarray_blas_opencl_clblast.c @@ -141,8 +141,8 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, static int hgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - float beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + float beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } @@ -150,8 +150,8 @@ static int hgemvBatch(cb_order order, cb_transpose transA, static int sgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - float beta, gpudata **y, size_t *offY, size_t incY, + 
gpudata **x, size_t *offX, int incX, + float beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } @@ -159,36 +159,63 @@ static int sgemvBatch(cb_order order, cb_transpose transA, static int dgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - double beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + double beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int hgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } +static int hdot( + size_t N, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, + gpudata *Z, size_t offZ + ) { + return GA_DEVSUP_ERROR; +} + +static int sdot( + size_t N, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, + gpudata *Z, size_t offZ + ) { + return GA_DEVSUP_ERROR; +} + +static int ddot( + size_t N, + gpudata *X, size_t offX, int incX, + 
gpudata *Y, size_t offY, int incY, + gpudata *Z, size_t offZ + ) { + return GA_DEVSUP_ERROR; +} + static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, @@ -436,6 +463,9 @@ GPUARRAY_LOCAL gpuarray_blas_ops clblast_ops = { setup, teardown, error, + hdot, /* TODO */ + sdot, /* TODO */ + ddot, /* TODO */ hgemv, sgemv, dgemv, diff --git a/src/private.h b/src/private.h index 57d919be88..ed8ce63293 100644 --- a/src/private.h +++ b/src/private.h @@ -115,16 +115,16 @@ struct _gpuarray_blas_ops { const char *(*error)(gpucontext *ctx); int (*hdot)( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ); int (*sdot)( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ); int (*ddot)( size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ); int (*hgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, @@ -186,34 +186,34 @@ struct _gpuarray_blas_ops { int (*hgemvBatch)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - float beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + float beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags); int (*sgemvBatch)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - float beta, gpudata **y, size_t *offY, size_t incY, + 
gpudata **x, size_t *offX, int incX, + float beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags); int (*dgemvBatch)(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - double beta, gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + double beta, gpudata **y, size_t *offY, int incY, size_t batchCount, int flags); int (*hgerBatch)(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags); int (*sgerBatch)(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags); int (*dgerBatch)(cb_order order, size_t M, size_t N, double alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, + gpudata **x, size_t *offX, int incX, + gpudata **y, size_t *offY, int incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags); }; From bb845624bdee486f05bb61e3da8a5b8e4e34ff7d Mon Sep 17 00:00:00 2001 From: khaotik Date: Sat, 26 Nov 2016 17:10:58 -0500 Subject: [PATCH 119/597] tests for blas dot - Added tests for BLAS dot - Implementation for CLBlast - modified blas tests from using nested for loops to itertools.product for parametrized tests. 
--- pygpu/blas.pyx | 12 +++- pygpu/tests/test_blas.py | 111 ++++++++++++++++------------- src/gpuarray_blas_opencl_clblast.c | 84 +++++++++++++++++++--- src/loaders/libclblast.fn | 3 + 4 files changed, 149 insertions(+), 61 deletions(-) diff --git a/pygpu/blas.pyx b/pygpu/blas.pyx index 7b27d63350..14d90c0f76 100644 --- a/pygpu/blas.pyx +++ b/pygpu/blas.pyx @@ -23,7 +23,7 @@ cdef api int pygpu_blas_rdot(GpuArray X, GpuArray Y, GpuArray Z, bint nocopy) ex cdef int err err = GpuArray_rdot(&X.ga, &Y.ga, &Z.ga, nocopy) if err != GA_NO_ERROR: - raise GpuArrayException(Gpurray_error(&X.ga, err), err) + raise GpuArrayException(GpuArray_error(&X.ga, err), err) return 0 cdef api int pygpu_blas_rgemv(cb_transpose transA, double alpha, GpuArray A, @@ -53,6 +53,16 @@ cdef api int pygpu_blas_rger(double alpha, GpuArray X, GpuArray Y, GpuArray A, return 0 +def dot(GpuArray X, GpuArray Y, GpuArray Z=None, overwrite_z=False): + if Z is None: + Z = pygpu_empty(0, NULL, X.typecode, GA_ANY_ORDER, X.context, None) + overwrite_z = True + + if not overwrite_z: + Z = pygpu_copy(Z, GA_ANY_ORDER) + pygpu_blas_rdot(X, Y, Z, 0) + return Z + def gemv(double alpha, GpuArray A, GpuArray X, double beta=0.0, GpuArray Y=None, trans_a=False, overwrite_y=False): cdef cb_transpose transA diff --git a/pygpu/tests/test_blas.py b/pygpu/tests/test_blas.py index 532e4c8fc3..6b24ceea5d 100644 --- a/pygpu/tests/test_blas.py +++ b/pygpu/tests/test_blas.py @@ -1,4 +1,5 @@ -import numpy +from itertools import product +import numpy from nose.plugins.skip import SkipTest from .support import (guard_devsup, gen_gpuarray, context) @@ -15,27 +16,47 @@ import pygpu.blas as gblas def test_dot(): - # TODO [WIP] - raise NotImplementedError() + bools = [True, False] + for N, dtype, offseted_i, sliced in product( + [1, 256, 1337], ['float32', 'float64'], bools, bools): + yield dot, N, dtype, offseted_i, sliced, True, False + for overwrite, init_z in product(bools, bools): + yield dot, 666, 'float32', False, False, 
overwrite, init_z + +@guard_devsup +def dot(N, dtype, offseted_i, sliced, overwrite, init_z): + cX, gX = gen_gpuarray((N,), dtype, offseted_inner=offseted_i, + sliced=sliced, ctx=context) + cY, gY = gen_gpuarray((N,), dtype, offseted_inner=offseted_i, + sliced=sliced, ctx=context) + if init_z: + _, gZ = gen_gpuarray((), dtype, offseted_inner=offseted_i, + sliced=sliced, ctx=context) + else: + _, gZ = None, None + + if dtype == 'float32': + cr = fblas.sdot(cX, cY) + else: + cr = fblas.ddot(cX, cY) + gr = gblas.dot(gX, gY, gZ, overwrite_z=overwrite) + numpy.testing.assert_allclose(cr, numpy.asarray(gr), rtol=1e-6) + def test_gemv(): - for shape in [(100, 128), (128, 50)]: - for order in ['f', 'c']: - for trans in [False, True]: - for offseted_i in [True, False]: - for sliced in [1, 2, -1, -2]: - yield gemv, shape, 'float32', order, trans, \ - offseted_i, sliced, True, False - for overwrite in [True, False]: - for init_y in [True, False]: - yield gemv, (4, 3), 'float32', 'f', False, False, 1, \ - overwrite, init_y + bools = [False, True] + for shape, order, trans, offseted_i, sliced in product( + [(100, 128), (128, 50)], 'fc', bools, bools, [1, 2, -1, -2]): + yield gemv, shape, 'float32', order, trans, \ + offseted_i, sliced, True, False + for overwrite, init_y in product(bools, bools): + yield gemv, (4, 3), 'float32', 'f', False, False, 1, \ + overwrite, init_y yield gemv, (32, 32), 'float64', 'f', False, False, 1, True, False - for alpha in [0, 1, -1, 0.6]: - for beta in [0, 1, -1, 0.6]: - for overwite in [True, False]: - yield gemv, (32, 32), 'float32', 'f', False, False, 1, \ - overwrite, True, alpha, beta + for alpha, beta, overwrite in product( + [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools): + yield gemv, (32, 32), 'float32', 'f', False, False, 1, \ + overwrite, True, alpha, beta @guard_devsup @@ -69,28 +90,22 @@ def gemv(shp, dtype, order, trans, offseted_i, sliced, def test_gemm(): - for m, n, k in [(48, 15, 32), (15, 32, 48)]: - for order in [('f', 'f', 'f'), 
('c', 'c', 'c'), - ('f', 'f', 'c'), ('f', 'c', 'f'), - ('f', 'c', 'c'), ('c', 'f', 'f'), - ('c', 'f', 'c'), ('c', 'c', 'f')]: - for trans in [(False, False), (True, True), - (False, True), (True, False)]: - for offseted_o in [False, True]: - yield gemm, m, n, k, 'float32', order, trans, \ - offseted_o, 1, False, False - for sliced in [1, 2, -1, -2]: - for overwrite in [True, False]: - for init_res in [True, False]: - yield gemm, 4, 3, 2, 'float32', ('f', 'f', 'f'), \ - (False, False), False, sliced, overwrite, init_res + bools = [False, True] + for (m, n, k), order, trans, offseted_o in product( + [(48, 15, 32), (15, 32, 48)], list(product(*['fc']*3)), + list(product(bools, bools)), bools): + yield gemm, m, n, k, 'float32', order, trans, \ + offseted_o, 1, False, False + for sliced, overwrite, init_res in product( + [1, 2, -1, -2], bools, bools): + yield gemm, 4, 3, 2, 'float32', ('f', 'f', 'f'), \ + (False, False), False, sliced, overwrite, init_res yield gemm, 32, 32, 32, 'float64', ('f', 'f', 'f'), (False, False), \ False, 1, False, False - for alpha in [0, 1, -1, 0.6]: - for beta in [0, 1, -1, 0.6]: - for overwrite in [True, False]: - yield gemm, 32, 23, 32, 'float32', ('f', 'f', 'f'), \ - (False, False), False, 1, overwrite, True, alpha, beta + for alpha, beta, overwrite in product( + [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools): + yield gemm, 32, 23, 32, 'float32', ('f', 'f', 'f'), \ + (False, False), False, 1, overwrite, True, alpha, beta @guard_devsup def gemm(m, n, k, dtype, order, trans, offseted_o, sliced, overwrite, @@ -128,19 +143,13 @@ def gemm(m, n, k, dtype, order, trans, offseted_o, sliced, overwrite, def test_ger(): - for m, n in [(4, 5)]: - for order in ['f', 'c']: - for sliced_x in [1, 2, -2, -1]: - for sliced_y in [1, 2, -2, -1]: - yield ger, m, n, 'float32', order, sliced_x, sliced_y, \ - False - + bools = [False, True] + for (m,n), order, sliced_x, sliced_y in product( + [(4,5)], 'fc', [1, 2, -2, -1], [1, 2, -2, -1]): + yield ger, m, n, 
'float32', order, sliced_x, sliced_y, False yield ger, 4, 5, 'float64', 'f', 1, 1, False - - for init_res in [True, False]: - for overwrite in [True, False]: - yield ger, 4, 5, 'float32', 'f', 1, 1, init_res, overwrite - + for init_res, overwrite in product(bools, bools): + yield ger, 4, 5, 'float32', 'f', 1, 1, init_res, overwrite def ger(m, n, dtype, order, sliced_x, sliced_y, init_res, overwrite=False): cX, gX = gen_gpuarray((m,), dtype, order, sliced=sliced_x, ctx=context) diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c index 19f7fc55b7..0827f936e7 100644 --- a/src/gpuarray_blas_opencl_clblast.c +++ b/src/gpuarray_blas_opencl_clblast.c @@ -193,27 +193,93 @@ static int hdot( size_t N, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, - gpudata *Z, size_t offZ - ) { - return GA_DEVSUP_ERROR; + gpudata *Z, size_t offZ) { + cl_ctx *ctx = X->ctx; + StatusCode err; + cl_event ev; + + ARRAY_INIT(X); + ARRAY_INIT(Y); + ARRAY_INIT(Z); + + err = CLBlastHdot( + N, + Z->buf, offZ, + X->buf, offX, incX, + Y->buf, offY, incY, + &ctx->q, &ev); + if (err != kSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(X); + ARRAY_FINI(Y); + ARRAY_FINI(Z); + + clReleaseEvent(ev); + + return GA_NO_ERROR; } static int sdot( size_t N, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, - gpudata *Z, size_t offZ - ) { - return GA_DEVSUP_ERROR; + gpudata *Z, size_t offZ) { + cl_ctx *ctx = X->ctx; + StatusCode err; + cl_event ev; + + ARRAY_INIT(X); + ARRAY_INIT(Y); + ARRAY_INIT(Z); + + err = CLBlastSdot( + N, + Z->buf, offZ, + X->buf, offX, incX, + Y->buf, offY, incY, + &ctx->q, &ev); + if (err != kSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(X); + ARRAY_FINI(Y); + ARRAY_FINI(Z); + + clReleaseEvent(ev); + + return GA_NO_ERROR; } static int ddot( size_t N, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, - gpudata *Z, size_t offZ - ) { - return GA_DEVSUP_ERROR; + gpudata *Z, size_t offZ) { + cl_ctx 
*ctx = X->ctx; + StatusCode err; + cl_event ev; + + ARRAY_INIT(X); + ARRAY_INIT(Y); + ARRAY_INIT(Z); + + err = CLBlastDdot( + N, + Z->buf, offZ, + X->buf, offX, incX, + Y->buf, offY, incY, + &ctx->q, &ev); + if (err != kSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(X); + ARRAY_FINI(Y); + ARRAY_FINI(Z); + + clReleaseEvent(ev); + + return GA_NO_ERROR; } static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, diff --git a/src/loaders/libclblast.fn b/src/loaders/libclblast.fn index 544c164e0c..28f36ba20b 100644 --- a/src/loaders/libclblast.fn +++ b/src/loaders/libclblast.fn @@ -1,3 +1,6 @@ +DEF_PROC(StatusCode, CLBlastHdot, (const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)); +DEF_PROC(StatusCode, CLBlastSdot, (const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)); +DEF_PROC(StatusCode, CLBlastDdot, (const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)); DEF_PROC(StatusCode, CLBlastHgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_half alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_half beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); DEF_PROC(StatusCode, CLBlastSgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, 
cl_command_queue *queue, cl_event *event)); DEF_PROC(StatusCode, CLBlastDgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); From bec7a725a21999f138aacfcb01b039d8ccbc07ca Mon Sep 17 00:00:00 2001 From: khaotik Date: Sun, 27 Nov 2016 04:25:16 -0500 Subject: [PATCH 120/597] finish dot for clBLAS --- src/gpuarray_blas_opencl_clblas.c | 72 ++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 91f31d0728..1028ce5e70 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -1,6 +1,7 @@ #include "private.h" #include "private_opencl.h" +#include "loaders/libopencl.h" #include "loaders/libclblas.h" #include "gpuarray/buffer_blas.h" @@ -207,7 +208,41 @@ static int sdot( gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ) { - return GA_DEVSUP_ERROR; + cl_ctx *ctx = X->ctx; + clblasStatus err; + cl_int cl_err; + cl_uint num_ev = 0; + cl_event evl[3]; + cl_event ev; + cl_mem scratch_mem; + + scratch_mem = clCreateBuffer( + ctx->ctx, CL_MEM_READ_WRITE, N*sizeof(float), NULL, &cl_err); + if (cl_err != CL_SUCCESS) + return GA_MEMORY_ERROR; + + ARRAY_INIT(X); + ARRAY_INIT(Y); + ARRAY_INIT(Z); + + // TODO: a thread-safe static buffer or allocator? + err = clblasSdot( + N, Z->buf, offZ, + X->buf, offX, incX, + Y->buf, offY, incY, + scratch_mem, 1, &ctx->q, + num_ev, num_ev ? 
evl : NULL, &ev); + if (err != clblasSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(X); + ARRAY_FINI(Y); + ARRAY_FINI(Z); + + clReleaseMemObject(scratch_mem); + clReleaseEvent(ev); + + return GA_NO_ERROR; } static int ddot( @@ -215,7 +250,40 @@ static int ddot( gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *Z, size_t offZ) { - return GA_DEVSUP_ERROR; + cl_ctx *ctx = X->ctx; + clblasStatus err; + cl_int cl_err; + cl_uint num_ev = 0; + cl_event evl[3]; + cl_event ev; + cl_mem scratch_mem; + + scratch_mem = clCreateBuffer( + ctx->ctx, CL_MEM_READ_WRITE, N*sizeof(float), NULL, &cl_err); + if (cl_err != CL_SUCCESS) + return GA_MEMORY_ERROR; + + ARRAY_INIT(X); + ARRAY_INIT(Y); + ARRAY_INIT(Z); + + err = clblasDdot( + N, Z->buf, offZ, + X->buf, offX, incX, + Y->buf, offY, incY, + scratch_mem, 1, &ctx->q, + num_ev, num_ev ? evl : NULL, &ev); + if (err != clblasSuccess) + return GA_BLAS_ERROR; + + ARRAY_FINI(X); + ARRAY_FINI(Y); + ARRAY_FINI(Z); + + clReleaseMemObject(scratch_mem); + clReleaseEvent(ev); + + return GA_NO_ERROR; } static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, From 1956a7ba08bcee1b829afdb1d9f611c40d4d3f75 Mon Sep 17 00:00:00 2001 From: khaotik Date: Sun, 27 Nov 2016 04:46:38 -0500 Subject: [PATCH 121/597] minifixes --- src/gpuarray_blas_opencl_clblas.c | 2 +- src/loaders/libcublas.fn | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 1028ce5e70..5cb926c61b 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -259,7 +259,7 @@ static int ddot( cl_mem scratch_mem; scratch_mem = clCreateBuffer( - ctx->ctx, CL_MEM_READ_WRITE, N*sizeof(float), NULL, &cl_err); + ctx->ctx, CL_MEM_READ_WRITE, N*sizeof(double), NULL, &cl_err); if (cl_err != CL_SUCCESS) return GA_MEMORY_ERROR; diff --git a/src/loaders/libcublas.fn b/src/loaders/libcublas.fn index 6af6589cc9..c0dbddf41e 100644 --- 
a/src/loaders/libcublas.fn +++ b/src/loaders/libcublas.fn @@ -3,7 +3,7 @@ DEF_PROC_V2(cublasDestroy, (cublasHandle_t handle)); DEF_PROC_V2(cublasSetStream, (cublasHandle_t handle, cudaStream_t streamId)); DEF_PROC_V2(cublasSetPointerMode, (cublasHandle_t handle, cublasPointerMode_t mode)); -DEF_PROC_V2(cublasGetPointerMode, (cublasHandle_t handle, cublasPointerMode_t* mode)); +DEF_PROC_V2(cublasGetPointerMode, (cublasHandle_t handle, cublasPointerMode_t *mode)); DEF_PROC(cublasSetAtomicsMode, (cublasHandle_t handle, cublasAtomicsMode_t mode)); From 618d0a17de905c70ac3e4f4c3b0ff579207e4f46 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Mon, 28 Nov 2016 10:20:54 -0500 Subject: [PATCH 122/597] I rewrote a little the code in gpuarray_elemwise (not really better nor faster, but I hope, clearer!). I also removed my very insufficient test from C code and I rewrote the Python test with a new version of elemwise2 that does not adapt the output shape (then we can check in Python if errors are raised when outputs have not the right shape). 
--- pygpu/tests/test_elemwise.py | 50 +++++++++++++++++++++++++++++------- src/gpuarray_elemwise.c | 16 ++++++------ tests/check_elemwise.c | 10 ++------ 3 files changed, 51 insertions(+), 25 deletions(-) diff --git a/pygpu/tests/test_elemwise.py b/pygpu/tests/test_elemwise.py index a1c752fa7f..89b59b14f6 100644 --- a/pygpu/tests/test_elemwise.py +++ b/pygpu/tests/test_elemwise.py @@ -3,7 +3,9 @@ from unittest import TestCase from pygpu import gpuarray, ndgpuarray as elemary -from pygpu.elemwise import ielemwise2 +from pygpu.dtypes import dtype_to_ctype, get_common_dtype +from pygpu.elemwise import as_argument, ielemwise2 +from pygpu._elemwise import GpuElemwise, arg from six import PY2 @@ -61,21 +63,51 @@ def test_ielemwise2_ops_array(): yield ielemwise2_ops_array, op, dtype1, dtype2, (50,) -class test_elemwise_rw_args_not_broadcasted(TestCase): - def test(self): - for shapea, shapeb in [((1, 4), (6, 4)), ((2, 1, 8, 7), (2, 2, 8, 7))]: - self.assertRaises(ValueError, self.run_ielemwise2, shapea, shapeb) - for shapea, shapeb in [((6, 4), (1, 4)), ((2, 2, 8, 7), (2, 1, 8, 7))]: - self.run_ielemwise2(shapea, shapeb) +class test_elemwise_output_not_broadcasted(TestCase): + def test_all(self): + test_values = [((1, 4), (6, 4)), ((2, 1, 8, 7), (2, 2, 8, 7))] + for shapea, shapeb in test_values: + # Sould fail: dimensions are not all equal. + self.assertRaises(ValueError, self.run_ielemwise2, shapea, shapeb, False) + # Should fail: broascast should not be done on output. + self.assertRaises(ValueError, self.run_ielemwise2, shapea, shapeb, True) + # Should fail: dimensions are not all equal. + self.assertRaises(ValueError, self.check_elemwise2, shapeb, shapeb, shapea, False) + # Should fail: broadcast should not be done on output. + self.assertRaises(ValueError, self.check_elemwise2, shapeb, shapeb, shapea, True) + # Should pass: output would be done on read-only input. 
+ self.run_ielemwise2(shapeb, shapea, broadcast=True) + # Should pass: output would be done on read-only inputs. + self.check_elemwise2(shapea, shapea, shapeb, broadcast=True) + self.check_elemwise2(shapea, shapeb, shapeb, broadcast=True) + self.check_elemwise2(shapeb, shapea, shapeb, broadcast=True) @guard_devsup - def run_ielemwise2(self, shapea, shapeb): + def run_ielemwise2(self, shapea, shapeb, broadcast=True): na, ga = gen_gpuarray(shapea, ctx=context, cls=elemary) nb, gb = gen_gpuarray(shapeb, ctx=context, cls=elemary) - ielemwise2(ga, '+', gb, broadcast=True) + ielemwise2(ga, '+', gb, broadcast=broadcast) na += nb assert numpy.allclose(na, numpy.asarray(ga), atol=1e-6) + @guard_devsup + def check_elemwise2(self, shapea, shapeb, output_shape, broadcast=True): + # We rewrite this version of elemwise2 to skip the scaling of output + # that is done in the official elemwise2 function. + na, ga = gen_gpuarray(shapea, ctx=context, cls=elemary) + nb, gb = gen_gpuarray(shapeb, ctx=context, cls=elemary) + odtype = get_common_dtype(ga, gb, True) + res = gpuarray.empty(output_shape, dtype=odtype, context=ga.context, cls=ga.__class__) + a_arg = as_argument(ga, 'a', read=True) + b_arg = as_argument(gb, 'b', read=True) + res_arg = as_argument(res, 'res', write=True) + args = [res_arg, a_arg, b_arg] + oper = "res = (%(out_t)s)a %(op)s (%(out_t)s)b" % {'op': '+', 'out_t': dtype_to_ctype(odtype)} + k = GpuElemwise(ga.context, oper, args, convert_f16=True) + k(res, ga, gb, broadcast=broadcast) + nres = na + nb + assert numpy.allclose(nres, numpy.asarray(res), atol=1e-6) + @guard_devsup def elemwise2_ops_array(op, dtype1, dtype2, shape): diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index 6037978f6f..227674e070 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -279,18 +279,18 @@ static int check_basic(GpuElemwise *ge, void **args, int flags, /* Go through the list and grab some info */ for (i = 0; i < ge->n; i++) { if 
(is_array(ge->args[i])) { - num_arrays++; - if (a == NULL || !is_output(a)) { - a = (GpuArray *)args[i]; - nd = a->nd; - } - if (((GpuArray *)args[i])->nd != nd) + if (num_arrays == 0) + nd = ((GpuArray *)args[i])->nd; + else if (((GpuArray *)args[i])->nd != nd) return GA_VALUE_ERROR; + ++num_arrays; + if (a == NULL && is_output(ge->args[i])) + a = (GpuArray *)args[i]; } } /* No output arrays, this is an error */ - if (a == NULL || !is_output(a)) + if (a == NULL) return GA_VALUE_ERROR; /* Check if we need to grow the internal buffers */ @@ -328,7 +328,7 @@ static int check_basic(GpuElemwise *ge, void **args, int flags, v = (GpuArray *)args[i]; if (ge->dims[j] != v->dimensions[j]) { /* We can't broadcast outputs */ - if (ISCLR(flags, GE_BROADCAST) || is_output(v) || + if (ISCLR(flags, GE_BROADCAST) || is_output(ge->args[i]) || v->dimensions[j] != 1) { return GA_VALUE_ERROR; } diff --git a/tests/check_elemwise.c b/tests/check_elemwise.c index 70cbd4b5a8..d8893b8496 100644 --- a/tests/check_elemwise.c +++ b/tests/check_elemwise.c @@ -452,8 +452,8 @@ START_TEST(test_basic_broadcast) { ga_assert_ok(GpuArray_empty(&b, ctx, GA_UINT, 2, dims, GA_F_ORDER)); ga_assert_ok(GpuArray_write(&b, data2, sizeof(data2))); - dims[0] = 1; - dims[1] = 6; + dims[0] = 2; + dims[1] = 3; ga_assert_ok(GpuArray_empty(&c, ctx, GA_UINT, 2, dims, GA_C_ORDER)); @@ -479,12 +479,6 @@ START_TEST(test_basic_broadcast) { ck_assert_int_eq(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE), GA_VALUE_ERROR); - ck_assert_int_eq(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE | GE_BROADCAST), GA_VALUE_ERROR); - - dims[0] = 2; - dims[1] = 3; - - ga_assert_ok(GpuArray_reshape_inplace(&c, 2, dims, GA_ANY_ORDER)); ga_assert_ok(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE | GE_BROADCAST)); ga_assert_ok(GpuArray_read(data3, sizeof(data3), &c)); From 3848b988c2d9b1eb06b8347a907868b6e695b998 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 21 Jul 2016 12:21:26 -0400 Subject: [PATCH 123/597] Disable our double atomicAdd 
for arch 600 and up. --- src/gpuarray_blas_cuda_cublas.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 03490a8c58..8d6ac5ed53 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -80,6 +80,7 @@ static const char *code_sgemvBH_T_a1_b1_small = \ "}\n"; static const char *atomicadd_double = \ + "#if __CUDA_ARCH__ < 600\n" \ "__device__ double atomicAdd(double* address, double val) {" \ " unsigned long long int* address_as_ull =" \ " (unsigned long long int*)address;" \ @@ -91,7 +92,8 @@ static const char *atomicadd_double = \ " __longlong_as_double(assumed)));" \ " } while (assumed != old);" \ " return __longlong_as_double(old);" \ - "}\n"; + "}\n" \ + "#endif\n"; static const char *code_dgemvBH_N_a1_b1_small = \ "extern \"C\" __global__ void dgemv(const double *A[], size_t lda, " \ From 7679e9227babca7be8589687b9397cac4646b95f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 28 Nov 2016 13:27:17 -0500 Subject: [PATCH 124/597] Add a test that will execise the blas code. It's not complete but it's a start. 
--- tests/CMakeLists.txt | 4 ++++ tests/check_blas.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 tests/check_blas.c diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2f3eb801cf..aee8d7200c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -61,6 +61,10 @@ add_executable(check_array main.c device.c check_array.c) target_link_libraries(check_array ${CHECK_LIBRARIES} gpuarray) add_test(test_array "${CMAKE_CURRENT_BINARY_DIR}/check_array") +add_executable(check_blas main.c device.c check_blas.c) +target_link_libraries(check_blas ${CHECK_LIBRARIES} gpuarray) +add_test(test_blas "${CMAKE_CURRENT_BINARY_DIR}/check_blas") + add_executable(check_elemwise main.c device.c check_elemwise.c) target_link_libraries(check_elemwise ${CHECK_LIBRARIES} gpuarray) add_test(test_elemwise "${CMAKE_CURRENT_BINARY_DIR}/check_elemwise") diff --git a/tests/check_blas.c b/tests/check_blas.c new file mode 100644 index 0000000000..99098fc8d0 --- /dev/null +++ b/tests/check_blas.c @@ -0,0 +1,40 @@ +#include + +#include + +#include "gpuarray/array.h" +#include "gpuarray/blas.h" +#include "gpuarray/error.h" +#include "gpuarray/types.h" + +extern void *ctx; + +void setup(void); +void teardown(void); + +#define ga_assert_ok(e) ck_assert_int_eq(e, GA_NO_ERROR) + +START_TEST(test_gemmBatch_3d) { + GpuArray A; + GpuArray B; + GpuArray C; + + size_t dims[3] = {32, 32, 32}; + + ga_assert_ok(GpuArray_empty(&A, ctx, GA_FLOAT, 3, dims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&B, ctx, GA_FLOAT, 3, dims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&C, ctx, GA_FLOAT, 3, dims, GA_C_ORDER)); + + ga_assert_ok(GpuArray_rgemmBatch_3d(cb_no_trans, cb_no_trans, 1, &A, &B, 0, &C, 1)); +} +END_TEST + +Suite *get_suite(void) { + Suite *s = suite_create("blas"); + TCase *tc = tcase_create("all"); + tcase_add_checked_fixture(tc, setup, teardown); + tcase_set_timeout(tc, 16.0); + tcase_add_test(tc, test_gemmBatch_3d); + suite_add_tcase(s, 
tc); + return s; +} From f83ea885301e58e3d98276c2dd88ab5785f56432 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 28 Nov 2016 15:22:29 -0500 Subject: [PATCH 125/597] Add a version tag for pygpu that you can introspect. --- .gitignore | 4 +--- pygpu/__init__.py | 2 ++ setup.py | 17 ++++++++++++++++- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 1bc8dee8c7..fe902dc679 100644 --- a/.gitignore +++ b/.gitignore @@ -16,11 +16,9 @@ distribute*egg distribute*tar.gz *.so *.o -*.aux -*.bbl -*.blg *.log pygpu/*.c pygpu/*.h +pygpu/version.py src/private_config.h Makefile.conf diff --git a/pygpu/__init__.py b/pygpu/__init__.py index 3ef62bc746..74479d3c68 100644 --- a/pygpu/__init__.py +++ b/pygpu/__init__.py @@ -12,6 +12,8 @@ def get_include(): concatenate, hstack, vstack, dstack) from ._array import ndgpuarray +from .version import fullversion as __version__ + from .tests import main if hasattr(main, "NoseTester"): test = main.NoseTester().test diff --git a/setup.py b/setup.py index 2e1f85878c..105461deb7 100644 --- a/setup.py +++ b/setup.py @@ -3,6 +3,12 @@ have_cython = False +MAJOR = 0 +MINOR = 6 +PATCH = 0 +SUFFIX = 'rc1' +FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX) + try: import Cython if Cython.__version__ < '0.21': @@ -81,6 +87,15 @@ def __init__(self, *args, **kwargs): raise RuntimeError('default binary dir {} does not exist, you may need to build the C library in release mode') library_dirs += [default_bin_dir] +with open('pygpu/version.py', 'w') as f: + f.write(""" +# File generated by setup.py +major = %d +minor = %d +patch = %d +suffix = "%s" +fullversion = "%s" +""" % (MAJOR, MINOR, PATCH, SUFFIX, FULLVERSION)) exts = [Extension('pygpu.gpuarray', sources=['pygpu/gpuarray.pyx'], @@ -112,7 +127,7 @@ def __init__(self, *args, **kwargs): )] setup(name='pygpu', - version='0.2.1', + version=FULLVERSION, description='numpy-like wrapper on libgpuarray for GPU computations', packages=['pygpu', 
'pygpu/tests'], data_files=[('pygpu', ['pygpu/gpuarray.h', 'pygpu/gpuarray_api.h', From 7dd5de48719b0e59251e40a07429df8d41614435 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 28 Nov 2016 15:50:59 -0500 Subject: [PATCH 126/597] Remove unnecessary statics. --- tests/check_array.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/check_array.c b/tests/check_array.c index 721c156df9..d60181b943 100644 --- a/tests/check_array.c +++ b/tests/check_array.c @@ -21,12 +21,12 @@ START_TEST(test_take1_ok) { GpuArray v; GpuArray vidx; GpuArray vres; - static const uint32_t data[24] = { 0, 1, 2, 3, 4, 5, - 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15, 16, 17, - 18, 19, 20, 21, 22, 23}; + const uint32_t data[24] = { 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23}; uint32_t buf[12 * 24]; - static const size_t data_dims[1] = {24}; + const size_t data_dims[1] = {24}; ssize_t indexes[12]; size_t dims[3]; From 45fd9090fbfa607724c1501b7d2a590c669d91a1 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 28 Nov 2016 15:51:20 -0500 Subject: [PATCH 127/597] Add a test for the offset case. 
--- tests/check_array.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/check_array.c b/tests/check_array.c index d60181b943..6d3aaf3cd6 100644 --- a/tests/check_array.c +++ b/tests/check_array.c @@ -243,12 +243,40 @@ START_TEST(test_take1_ok) { } END_TEST +START_TEST(test_take1_offset) { + const uint32_t data[24] = {0, 1, 2, 3}; + const size_t data_dims[1] = {4}; + const size_t out_dims[1] = {2}; + const uint32_t idx[4] = {20, 3, 3, 2}; + GpuArray v; + GpuArray i; + GpuArray r; + + ga_assert_ok(GpuArray_empty(&v, ctx, GA_UINT, 1, data_dims, GA_C_ORDER)); + ga_assert_ok(GpuArray_write(&v, data, sizeof(data))); + + ga_assert_ok(GpuArray_empty(&i, ctx, GA_UINT, 1, data_dims, GA_C_ORDER)); + ga_assert_ok(GpuArray_write(&i, idx, sizeof(idx))); + + ga_assert_ok(GpuArray_empty(&r, ctx, GA_UINT, 1, out_dims, GA_C_ORDER)); + + /* Fake subtensor for offset */ + i.offset = 8; + i.dimensions[0] = 2; + + ga_assert_ok(GpuArray_take1(&r, &v, &i, 1)); + /* The actual results are not important, this is just to check that + we don't trigger the out of bounds check */ +} +END_TEST + Suite *get_suite(void) { Suite *s = suite_create("array"); TCase *tc = tcase_create("take1"); tcase_add_checked_fixture(tc, setup, teardown); tcase_set_timeout(tc, 8.0); tcase_add_test(tc, test_take1_ok); + tcase_add_test(tc, test_take1_offset); suite_add_tcase(s, tc); return s; } From 0b8cf5b7fcfd593290f975176a5ee6ea0816fd13 Mon Sep 17 00:00:00 2001 From: khaotik Date: Tue, 29 Nov 2016 06:50:34 -0500 Subject: [PATCH 128/597] fall back to size_t for strides --- src/gpuarray_blas_cuda_cublas.c | 78 +++++++++++++++--------------- src/gpuarray_blas_opencl_clblas.c | 60 +++++++++++------------ src/gpuarray_blas_opencl_clblast.c | 66 ++++++++++++------------- src/private.h | 60 +++++++++++------------ 4 files changed, 132 insertions(+), 132 deletions(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 41e6e372c0..9e81805a15 100644 --- 
a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -39,8 +39,8 @@ typedef struct _blas_handle { static const char *code_sgemvBH_N_a1_b1_small = \ "extern \"C\"__global__ void sgemv(const float *A[], size_t lda, " \ - " const float *x[], int incx, " \ - " float *y[], int incy, " \ + " const float *x[], size_t incx, " \ + " float *y[], size_t incy, " \ " size_t b, size_t m, size_t n) {" \ " for (size_t p = blockIdx.y * blockDim.y + threadIdx.y; p < b;" \ " p += gridDim.y * blockDim.y) {" \ @@ -62,8 +62,8 @@ static const char *code_sgemvBH_N_a1_b1_small = \ static const char *code_sgemvBH_T_a1_b1_small = \ "extern \"C\" __global__ void sgemv(const float *A[], size_t lda, " \ - " const float *x[], int incx, " \ - " float *y[], int incy, " \ + " const float *x[], size_t incx, " \ + " float *y[], size_t incy, " \ " size_t b, size_t m, size_t n) {" \ " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ " size_t p = blockIdx.y * blockDim.y + threadIdx.y;" \ @@ -95,8 +95,8 @@ static const char *atomicadd_double = \ static const char *code_dgemvBH_N_a1_b1_small = \ "extern \"C\" __global__ void dgemv(const double *A[], size_t lda, " \ - " const double *x[], int incx, " \ - " double *y[], int incy, " \ + " const double *x[], size_t incx, " \ + " double *y[], size_t incy, " \ " size_t b, size_t m, size_t n) {" \ " for (size_t p = blockIdx.y * blockDim.y + threadIdx.y; p < b;" \ " p += gridDim.y * blockDim.y) {" \ @@ -118,8 +118,8 @@ static const char *code_dgemvBH_N_a1_b1_small = \ static const char *code_dgemvBH_T_a1_b1_small = \ "extern \"C\" __global__ void dgemv(const double *A[], size_t lda, " \ - " const double *x[], int incx, " \ - " double *y[], int incy, " \ + " const double *x[], size_t incx, " \ + " double *y[], size_t incy, " \ " size_t b, size_t m, size_t n) {" \ " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ " size_t p = blockIdx.y * blockDim.y + threadIdx.y;" \ @@ -137,8 +137,8 @@ static const char *code_dgemvBH_T_a1_b1_small = 
\ static const char *code_sgerBH_gen_small = \ "extern \"C\" __global__ void _sgerBH_gen_small(" \ - " const float *x[], int incx," \ - " const float *y[], int incy," \ + " const float *x[], size_t incx," \ + " const float *y[], size_t incy," \ " float alpha, float *A[], size_t lda," \ " size_t b, size_t m, size_t n) {" \ " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ @@ -152,8 +152,8 @@ static const char *code_sgerBH_gen_small = \ static const char *code_dgerBH_gen_small = \ "extern \"C\" __global__ void _dgerBH_gen_small(" \ - " const double *x[], int incx, " \ - " const double *y[], int incy," \ + " const double *x[], size_t incx, " \ + " const double *y[], size_t incy," \ " double alpha, double *A[], size_t lda," \ " size_t b, size_t m, size_t n) {" \ " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ @@ -788,16 +788,16 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, static int hdot( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { return GA_DEVSUP_ERROR; } static int sdot( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; @@ -836,8 +836,8 @@ static int sdot( static int ddot( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; @@ -876,15 +876,15 @@ static int ddot( static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, - float beta, gpudata *Y, 
size_t offY, int incY) { + gpudata *X, size_t offX, size_t incX, + float beta, gpudata *Y, size_t offY, size_t incY) { return GA_DEVSUP_ERROR; } static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, - float beta, gpudata *Y, size_t offY, int incY) { + gpudata *X, size_t offX, size_t incX, + float beta, gpudata *Y, size_t offY, size_t incY) { cuda_context *ctx = A->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; size_t t; @@ -938,8 +938,8 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, - double beta, gpudata *Y, size_t offY, int incY) { + gpudata *X, size_t offX, size_t incX, + double beta, gpudata *Y, size_t offY, size_t incY) { cuda_context *ctx = A->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; size_t t; @@ -994,8 +994,8 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, static int hgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - float beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } @@ -1003,8 +1003,8 @@ static int hgemvBatch(cb_order order, cb_transpose transA, static int sgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - float beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { /* Flags is there for possible future implementations where we might 
not use atomics or have some alternate implemntation. */ @@ -1129,8 +1129,8 @@ static int sgemvBatch(cb_order order, cb_transpose transA, static int dgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - double beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + double beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { cuda_context *ctx; size_t t, i; @@ -1251,13 +1251,13 @@ static int dgemvBatch(cb_order order, cb_transpose transA, static int hger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, - size_t offX, int incX, gpudata *Y, size_t offY, int incY, + size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda) { return GA_DEVSUP_ERROR; } static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, - size_t offX, int incX, gpudata *Y, size_t offY, int incY, + size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; @@ -1314,7 +1314,7 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, } static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, - size_t offX, int incX, gpudata *Y, size_t offY, int incY, + size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; @@ -1371,16 +1371,16 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, } static int hgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t 
batchCount, int flags) { return GA_DEVSUP_ERROR; } static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { cuda_context *ctx; @@ -1511,8 +1511,8 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, } static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { cuda_context *ctx; diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 5cb926c61b..228fed8d41 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -147,8 +147,8 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, static int hgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - float beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } @@ -156,8 +156,8 @@ static int hgemvBatch(cb_order order, cb_transpose transA, static int sgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - float beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } @@ -165,31 +165,31 @@ static int sgemvBatch(cb_order order, cb_transpose transA, 
static int dgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - double beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + double beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int hgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; @@ -197,16 +197,16 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, static int hdot( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { return GA_DEVSUP_ERROR; } static int sdot( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; clblasStatus err; @@ -247,8 +247,8 @@ static int sdot( static int ddot( size_t 
N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; clblasStatus err; @@ -288,15 +288,15 @@ static int ddot( static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, float beta, - gpudata *Y, size_t offY, int incY) { + gpudata *X, size_t offX, size_t incX, float beta, + gpudata *Y, size_t offY, size_t incY) { return GA_DEVSUP_ERROR; } static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, float beta, - gpudata *Y, size_t offY, int incY) { + gpudata *X, size_t offX, size_t incX, float beta, + gpudata *Y, size_t offY, size_t incY) { cl_ctx *ctx = A->ctx; clblasStatus err; cl_uint num_ev = 0; @@ -325,8 +325,8 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, double beta, - gpudata *Y, size_t offY, int incY) { + gpudata *X, size_t offX, size_t incX, double beta, + gpudata *Y, size_t offY, size_t incY) { cl_ctx *ctx = A->ctx; clblasStatus err; cl_uint num_ev = 0; @@ -424,15 +424,15 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, } static int hger(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda) { return GA_DEVSUP_ERROR; } static int sger(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, 
size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event evl[3]; @@ -460,8 +460,8 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, } static int dger(cb_order order, size_t M, size_t N, double alpha, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event evl[3]; diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c index 0827f936e7..0d220ac322 100644 --- a/src/gpuarray_blas_opencl_clblast.c +++ b/src/gpuarray_blas_opencl_clblast.c @@ -141,8 +141,8 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, static int hgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - float beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } @@ -150,8 +150,8 @@ static int hgemvBatch(cb_order order, cb_transpose transA, static int sgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - float beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } @@ -159,31 +159,31 @@ static int sgemvBatch(cb_order order, cb_transpose transA, static int dgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - double beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + double beta, gpudata **y, 
size_t *offY, size_t incY, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int hgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; } static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { return GA_DEVSUP_ERROR; @@ -191,8 +191,8 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, static int hdot( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; StatusCode err; @@ -222,8 +222,8 @@ static int hdot( static int sdot( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; StatusCode err; @@ -253,8 +253,8 @@ static int sdot( static int ddot( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; StatusCode err; @@ -284,8 +284,8 @@ 
static int ddot( static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, float beta, - gpudata *Y, size_t offY, int incY) { + gpudata *X, size_t offX, size_t incX, float beta, + gpudata *Y, size_t offY, size_t incY) { cl_ctx *ctx = A->ctx; StatusCode err; cl_event ev; @@ -311,8 +311,8 @@ static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, float beta, - gpudata *Y, size_t offY, int incY) { + gpudata *X, size_t offX, size_t incX, float beta, + gpudata *Y, size_t offY, size_t incY) { cl_ctx *ctx = A->ctx; StatusCode err; cl_event ev; @@ -338,8 +338,8 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, double beta, - gpudata *Y, size_t offY, int incY) { + gpudata *X, size_t offX, size_t incX, double beta, + gpudata *Y, size_t offY, size_t incY) { cl_ctx *ctx = A->ctx; StatusCode err; cl_event ev; @@ -448,8 +448,8 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, } static int hger(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event ev; @@ -474,8 +474,8 @@ static int hger(cb_order order, size_t M, size_t N, float alpha, } static int sger(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, 
size_t lda) { cl_ctx *ctx = X->ctx; cl_event ev; @@ -500,8 +500,8 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, } static int dger(cb_order order, size_t M, size_t N, double alpha, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event ev; @@ -529,9 +529,9 @@ GPUARRAY_LOCAL gpuarray_blas_ops clblast_ops = { setup, teardown, error, - hdot, /* TODO */ - sdot, /* TODO */ - ddot, /* TODO */ + hdot, + sdot, + ddot, hgemv, sgemv, dgemv, diff --git a/src/private.h b/src/private.h index ed8ce63293..bdc3f7bb9d 100644 --- a/src/private.h +++ b/src/private.h @@ -115,29 +115,29 @@ struct _gpuarray_blas_ops { const char *(*error)(gpucontext *ctx); int (*hdot)( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ); int (*sdot)( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ); int (*ddot)( size_t N, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ); int (*hgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, float beta, - gpudata *Y, size_t offY, int incY); + gpudata *X, size_t offX, size_t incX, float beta, + gpudata *Y, size_t offY, size_t incY); int (*sgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, float beta, - gpudata *Y, size_t offY, int incY); + gpudata *X, size_t offX, size_t incX, float 
beta, + gpudata *Y, size_t offY, size_t incY); int (*dgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, double beta, - gpudata *Y, size_t offY, int incY); + gpudata *X, size_t offX, size_t incX, double beta, + gpudata *Y, size_t offY, size_t incY); int (*hgemm)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, @@ -154,16 +154,16 @@ struct _gpuarray_blas_ops { gpudata *B, size_t offB, size_t ldb, double beta, gpudata *C, size_t offC, size_t ldc); int (*hger)(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda); int (*sger)(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda); int (*dger)(cb_order order, size_t M, size_t N, double alpha, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, + gpudata *X, size_t offX, size_t incX, + gpudata *Y, size_t offY, size_t incY, gpudata *A, size_t offA, size_t lda); int (*hgemmBatch)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, @@ -186,34 +186,34 @@ struct _gpuarray_blas_ops { int (*hgemvBatch)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - float beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags); int (*sgemvBatch)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, 
gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - float beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags); int (*dgemvBatch)(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, int incX, - double beta, gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + double beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags); int (*hgerBatch)(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags); int (*sgerBatch)(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags); int (*dgerBatch)(cb_order order, size_t M, size_t N, double alpha, - gpudata **x, size_t *offX, int incX, - gpudata **y, size_t *offY, int incY, + gpudata **x, size_t *offX, size_t incX, + gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags); }; From 3bf6a4144c6e788c5f47a6ca214ac95206d651fd Mon Sep 17 00:00:00 2001 From: khaotik Date: Tue, 29 Nov 2016 07:08:01 -0500 Subject: [PATCH 129/597] now use buffer_alloc to create working buffer --- src/gpuarray_blas_opencl_clblas.c | 36 +++++++++++++++++-------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 228fed8d41..f3c94a7741 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ 
b/src/gpuarray_blas_opencl_clblas.c @@ -7,6 +7,8 @@ #include "gpuarray/buffer_blas.h" #include "gpuarray/error.h" +extern const gpuarray_buffer_ops opencl_ops; + static inline clblasOrder convO(cb_order order) { switch (order) { case cb_row: @@ -210,16 +212,17 @@ static int sdot( gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; clblasStatus err; - cl_int cl_err; cl_uint num_ev = 0; cl_event evl[3]; cl_event ev; - cl_mem scratch_mem; + gpudata *wbuf; + int alloc_err; - scratch_mem = clCreateBuffer( - ctx->ctx, CL_MEM_READ_WRITE, N*sizeof(float), NULL, &cl_err); - if (cl_err != CL_SUCCESS) - return GA_MEMORY_ERROR; + wbuf = opencl_ops.buffer_alloc( + (gpucontext*)ctx, + N*sizeof(float), NULL, GA_BUFFER_READ_WRITE, &alloc_err); + if (alloc_err != GA_NO_ERROR) + return alloc_err; ARRAY_INIT(X); ARRAY_INIT(Y); @@ -230,7 +233,7 @@ static int sdot( N, Z->buf, offZ, X->buf, offX, incX, Y->buf, offY, incY, - scratch_mem, 1, &ctx->q, + wbuf->buf, 1, &ctx->q, num_ev, num_ev ? evl : NULL, &ev); if (err != clblasSuccess) return GA_BLAS_ERROR; @@ -239,7 +242,7 @@ static int sdot( ARRAY_FINI(Y); ARRAY_FINI(Z); - clReleaseMemObject(scratch_mem); + opencl_ops.buffer_release(wbuf); clReleaseEvent(ev); return GA_NO_ERROR; @@ -252,16 +255,17 @@ static int ddot( gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; clblasStatus err; - cl_int cl_err; cl_uint num_ev = 0; cl_event evl[3]; cl_event ev; - cl_mem scratch_mem; + gpudata *wbuf; + int alloc_err; - scratch_mem = clCreateBuffer( - ctx->ctx, CL_MEM_READ_WRITE, N*sizeof(double), NULL, &cl_err); - if (cl_err != CL_SUCCESS) - return GA_MEMORY_ERROR; + wbuf = opencl_ops.buffer_alloc( + (gpucontext*)ctx, + N*sizeof(double), NULL, GA_BUFFER_READ_WRITE, &alloc_err); + if (alloc_err != GA_NO_ERROR) + return alloc_err; ARRAY_INIT(X); ARRAY_INIT(Y); @@ -271,7 +275,7 @@ static int ddot( N, Z->buf, offZ, X->buf, offX, incX, Y->buf, offY, incY, - scratch_mem, 1, &ctx->q, + wbuf->buf, 1, &ctx->q, num_ev, num_ev ? 
evl : NULL, &ev); if (err != clblasSuccess) return GA_BLAS_ERROR; @@ -280,7 +284,7 @@ static int ddot( ARRAY_FINI(Y); ARRAY_FINI(Z); - clReleaseMemObject(scratch_mem); + opencl_ops.buffer_release(wbuf); clReleaseEvent(ev); return GA_NO_ERROR; From c74e8e6041278cbef473aa675f6c8220484ddeca Mon Sep 17 00:00:00 2001 From: khaotik Date: Tue, 29 Nov 2016 08:06:21 -0500 Subject: [PATCH 130/597] revert old int strides --- src/gpuarray_blas_cuda_cublas.c | 18 +++++++++--------- src/gpuarray_blas_opencl_clblas.c | 24 ++++++++++++------------ src/gpuarray_blas_opencl_clblast.c | 24 ++++++++++++------------ 3 files changed, 33 insertions(+), 33 deletions(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 9e81805a15..39cba704f5 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -876,15 +876,15 @@ static int ddot( static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, - float beta, gpudata *Y, size_t offY, size_t incY) { + gpudata *X, size_t offX, int incX, + float beta, gpudata *Y, size_t offY, int incY) { return GA_DEVSUP_ERROR; } static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, - float beta, gpudata *Y, size_t offY, size_t incY) { + gpudata *X, size_t offX, int incX, + float beta, gpudata *Y, size_t offY, int incY) { cuda_context *ctx = A->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; size_t t; @@ -938,8 +938,8 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, - double beta, gpudata *Y, size_t offY, size_t incY) { + gpudata *X, size_t offX, int incX, + double beta, gpudata *Y, size_t 
offY, int incY) { cuda_context *ctx = A->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; size_t t; @@ -1251,13 +1251,13 @@ static int dgemvBatch(cb_order order, cb_transpose transA, static int hger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, - size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, + size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { return GA_DEVSUP_ERROR; } static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, - size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, + size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; @@ -1314,7 +1314,7 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, } static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, - size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, + size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index f3c94a7741..d6e58ecbfd 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -292,15 +292,15 @@ static int ddot( static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, float beta, - gpudata *Y, size_t offY, size_t incY) { + gpudata *X, size_t offX, int incX, float beta, + gpudata *Y, size_t offY, int incY) { return GA_DEVSUP_ERROR; } static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, float beta, - gpudata *Y, size_t offY, size_t incY) { + gpudata 
*X, size_t offX, int incX, float beta, + gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; clblasStatus err; cl_uint num_ev = 0; @@ -329,8 +329,8 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, double beta, - gpudata *Y, size_t offY, size_t incY) { + gpudata *X, size_t offX, int incX, double beta, + gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; clblasStatus err; cl_uint num_ev = 0; @@ -428,15 +428,15 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, } static int hger(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { return GA_DEVSUP_ERROR; } static int sger(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event evl[3]; @@ -464,8 +464,8 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, } static int dger(cb_order order, size_t M, size_t N, double alpha, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event evl[3]; diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c index 0d220ac322..78cca10f20 100644 --- a/src/gpuarray_blas_opencl_clblast.c +++ b/src/gpuarray_blas_opencl_clblast.c @@ -284,8 +284,8 @@ static int ddot( static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, 
gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, float beta, - gpudata *Y, size_t offY, size_t incY) { + gpudata *X, size_t offX, int incX, float beta, + gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; StatusCode err; cl_event ev; @@ -311,8 +311,8 @@ static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, float beta, - gpudata *Y, size_t offY, size_t incY) { + gpudata *X, size_t offX, int incX, float beta, + gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; StatusCode err; cl_event ev; @@ -338,8 +338,8 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, double beta, - gpudata *Y, size_t offY, size_t incY) { + gpudata *X, size_t offX, int incX, double beta, + gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; StatusCode err; cl_event ev; @@ -448,8 +448,8 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, } static int hger(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event ev; @@ -474,8 +474,8 @@ static int hger(cb_order order, size_t M, size_t N, float alpha, } static int sger(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event ev; @@ -500,8 +500,8 @@ static int sger(cb_order order, 
size_t M, size_t N, float alpha, } static int dger(cb_order order, size_t M, size_t N, double alpha, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event ev; From d6c69b64a9434249812194e7eeadc58546b074ec Mon Sep 17 00:00:00 2001 From: khaotik Date: Tue, 29 Nov 2016 08:13:31 -0500 Subject: [PATCH 131/597] revert strides in private.h --- src/private.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/private.h b/src/private.h index bdc3f7bb9d..57d919be88 100644 --- a/src/private.h +++ b/src/private.h @@ -128,16 +128,16 @@ struct _gpuarray_blas_ops { gpudata *Z, size_t offZ); int (*hgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, float beta, - gpudata *Y, size_t offY, size_t incY); + gpudata *X, size_t offX, int incX, float beta, + gpudata *Y, size_t offY, int incY); int (*sgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, float beta, - gpudata *Y, size_t offY, size_t incY); + gpudata *X, size_t offX, int incX, float beta, + gpudata *Y, size_t offY, int incY); int (*dgemv)(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, size_t incX, double beta, - gpudata *Y, size_t offY, size_t incY); + gpudata *X, size_t offX, int incX, double beta, + gpudata *Y, size_t offY, int incY); int (*hgemm)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, @@ -154,16 +154,16 @@ struct _gpuarray_blas_ops { gpudata *B, size_t offB, size_t ldb, double beta, gpudata *C, size_t offC, size_t ldc); int 
(*hger)(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda); int (*sger)(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda); int (*dger)(cb_order order, size_t M, size_t N, double alpha, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, + gpudata *X, size_t offX, int incX, + gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda); int (*hgemmBatch)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, From cae367115ae35f540dd62580ca8418e32ddff60b Mon Sep 17 00:00:00 2001 From: khaotik Date: Tue, 29 Nov 2016 08:23:26 -0500 Subject: [PATCH 132/597] mini cleanup --- pygpu/tests/test_blas.py | 1 - src/gpuarray_blas_opencl_clblas.c | 1 - 2 files changed, 2 deletions(-) diff --git a/pygpu/tests/test_blas.py b/pygpu/tests/test_blas.py index 6b24ceea5d..8ce7d7aebe 100644 --- a/pygpu/tests/test_blas.py +++ b/pygpu/tests/test_blas.py @@ -58,7 +58,6 @@ def test_gemv(): yield gemv, (32, 32), 'float32', 'f', False, False, 1, \ overwrite, True, alpha, beta - @guard_devsup def gemv(shp, dtype, order, trans, offseted_i, sliced, overwrite, init_y, alpha=1.0, beta=0.0): diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index d6e58ecbfd..2041710735 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -1,7 +1,6 @@ #include "private.h" #include "private_opencl.h" -#include "loaders/libopencl.h" #include "loaders/libclblas.h" #include "gpuarray/buffer_blas.h" From 2d22306a56770fc8be95b8f33483f22d6462758f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: 
Tue, 29 Nov 2016 12:25:57 -0500 Subject: [PATCH 133/597] Fix leak in error path for GA_CTX_PROP_DEVNAME --- src/gpuarray_buffer_cuda.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index d5c1111733..3085d112a8 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1393,6 +1393,7 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, } ctx->err = cuDeviceGetName(s, 256, id); if (ctx->err != CUDA_SUCCESS) { + free(s); cuda_exit(ctx); return GA_IMPL_ERROR; } @@ -1414,8 +1415,6 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, } ctx->err = cuDeviceGetPCIBusId(s, 13, id); if (ctx->err != CUDA_SUCCESS) { - /* PS: in GA_CTX_PROP_DEVNAME above, s is not freed here. - * I think it should be freed, isn't it ? */ free(s); cuda_exit(ctx); return GA_IMPL_ERROR; From c18bdafefebdd750059e17d23fc914984a1d2f6b Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 29 Nov 2016 12:55:44 -0500 Subject: [PATCH 134/597] Add a way to query the largest allocatable size. --- src/gpuarray/buffer.h | 7 +++++++ src/gpuarray_buffer_cuda.c | 19 +++++++++++++++++++ src/gpuarray_buffer_opencl.c | 5 +++-- 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index bd33e9f69c..aedb669d1e 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -689,6 +689,13 @@ GPUARRAY_PUBLIC gpucontext *gpukernel_context(gpukernel *k); */ #define GA_CTX_PROP_PCIBUSID 19 +/** + * Get the largest single block of memory that can be allocted. 
+ * + * Type: `size_t` + */ +#define GA_CTX_PROP_LARGEST_MEMBLOCK 20 + /* Start at 512 for GA_BUFFER_PROP_ */ #define GA_BUFFER_PROP_START 512 diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 3085d112a8..9e76475448 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -443,6 +443,21 @@ static void find_best(cuda_context *ctx, gpudata **best, gpudata **prev, } } +static size_t largest_size(cuda_context *ctx) { + gpudata *temp; + size_t sz, dummy; + cuda_enter(ctx); + ctx->err = cuMemGetInfo(&sz, &dummy); + cuda_exit(ctx); + /* We guess that we can allocate at least a quarter of the free size + in a single block. This might be wrong though. */ + sz /= 4; + for (temp = ctx->freeblocks; temp; temp = temp->next) { + if (temp->sz > sz) sz = temp->sz; + } + return sz; +} + /* * Allocate a new block and place in on the freelist. Will allocate * the bigger of the requested size and BLOCK_SIZE to avoid allocating @@ -1423,6 +1438,10 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, cuda_exit(ctx); return GA_NO_ERROR; + case GA_CTX_PROP_LARGEST_MEMBLOCK: + *((size_t *)res) = largest_size(ctx); + return GA_NO_ERROR; + case GA_CTX_PROP_MAXLSIZE: cuda_enter(ctx); ctx->err = cuCtxGetDevice(&id); diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 4b3f5fbfe2..2afa6e962d 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -1292,12 +1292,13 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, return GA_NO_ERROR; case GA_CTX_PROP_FREE_GMEM: + /* There is no way to query free memory so we just return the + largest block size */ + case GA_CTX_PROP_LARGEST_MEMBLOCK: ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id, NULL); if (ctx->err != GA_NO_ERROR) return GA_IMPL_ERROR; - /* XXX: This is not exaclty the amount of free memory but there is - no way to query that in the OpenCL API. 
*/ ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(sz), &sz, NULL); if (ctx->err != GA_NO_ERROR) From cd44245d4151d375d0b52ed540b5955686362c65 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 29 Nov 2016 11:06:30 -0800 Subject: [PATCH 135/597] Fix typo in take1 offset test. --- tests/check_array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/check_array.c b/tests/check_array.c index 6d3aaf3cd6..59de293521 100644 --- a/tests/check_array.c +++ b/tests/check_array.c @@ -244,7 +244,7 @@ START_TEST(test_take1_ok) { END_TEST START_TEST(test_take1_offset) { - const uint32_t data[24] = {0, 1, 2, 3}; + const uint32_t data[4] = {0, 1, 2, 3}; const size_t data_dims[1] = {4}; const size_t out_dims[1] = {2}; const uint32_t idx[4] = {20, 3, 3, 2}; From bfab391dbea5567ba4a21b044da408dbe4815328 Mon Sep 17 00:00:00 2001 From: Ray Donnelly Date: Fri, 25 Nov 2016 12:40:56 +0000 Subject: [PATCH 136/597] Include stdint.h via gpuarray/config.h Because for Visual Studio < 2010, we use a bundled version instead and the compile-guards that implement that logic are in gpuarray/config.h --- src/util/integerfactoring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/util/integerfactoring.c b/src/util/integerfactoring.c index e58763761e..cb785bb080 100644 --- a/src/util/integerfactoring.c +++ b/src/util/integerfactoring.c @@ -1,6 +1,7 @@ /* Includes */ #include #include +#include #include #include "integerfactoring.h" From ebf5edd3ac67a2bfc232c17e7f01ee4f7cdabc38 Mon Sep 17 00:00:00 2001 From: Ray Donnelly Date: Fri, 25 Nov 2016 12:53:15 +0000 Subject: [PATCH 137/597] Win32: Use -W4, not -Wall on MSVC --- CMakeLists.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f6ec583a54..382c064e9c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,13 @@ PROJECT(libgpuarray C) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/CMakeModules/") 
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") +# -Wall is unbelieveably noisy with Visual Studio: +# http://stackoverflow.com/q/4001736/3257826 +if(MSVC) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W4") +else() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") +endif() enable_testing() From 6d1bdbdf9f6be0b272564b2040d83b158539e7cf Mon Sep 17 00:00:00 2001 From: Ray Donnelly Date: Fri, 25 Nov 2016 14:24:56 +0000 Subject: [PATCH 138/597] Win32: Actually print default_bin_dir in exception --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 105461deb7..767e4cf5f3 100644 --- a/setup.py +++ b/setup.py @@ -84,7 +84,7 @@ def __init__(self, *args, **kwargs): default_bin_dir = os.path.join(current_dir, 'lib', 'Release') if not os.path.isdir(default_bin_dir): - raise RuntimeError('default binary dir {} does not exist, you may need to build the C library in release mode') + raise RuntimeError('default binary dir {} does not exist, you may need to build the C library in release mode'.format(default_bin_dir)) library_dirs += [default_bin_dir] with open('pygpu/version.py', 'w') as f: From 9226f57fc408b8125e3a2ae5add035515d74fef9 Mon Sep 17 00:00:00 2001 From: Ray Donnelly Date: Fri, 25 Nov 2016 14:25:29 +0000 Subject: [PATCH 139/597] Win32 + conda: Remove default binary dir hack --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 767e4cf5f3..f1150de5c9 100644 --- a/setup.py +++ b/setup.py @@ -76,7 +76,7 @@ def __init__(self, *args, **kwargs): include_dirs = [np.get_include()] library_dirs = [] -if sys.platform == 'win32': +if sys.platform == 'win32' and not os.getenv('CONDA_BUILD'): # This is a hack so users don't need to do many steps for windows install # Just use the default location. 
current_dir = os.path.abspath(os.path.dirname(__file__)) From 7a4af36d6712b20604704f243e1d2ef95da4e9c4 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 29 Nov 2016 15:25:08 -0500 Subject: [PATCH 140/597] Bump the SOVERSION --- src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 622146508c..ec560d217a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -88,7 +88,7 @@ set_target_properties(gpuarray PROPERTIES INSTALL_NAME_DIR ${CMAKE_INSTALL_PREFIX}/lib MACOSX_RPATH OFF # This is the shared library version - VERSION 0.0 + VERSION 0.1 ) add_library(gpuarray-static STATIC ${GPUARRAY_SRC}) From de0ab99bc2e47232f481ece2672e7555c8099baf Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 29 Nov 2016 15:35:22 -0500 Subject: [PATCH 141/597] Move the loading of libcublas to when you fetch the blas ops. --- src/gpuarray_blas_cuda_cublas.c | 5 ----- src/gpuarray_buffer_cuda.c | 6 ++++++ 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 8d6ac5ed53..2acabff154 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -172,16 +172,11 @@ static int setup(gpucontext *c) { blas_handle *handle; const char *tmp[2]; cublasStatus_t err; - int e; int types[10]; if (ctx->blas_handle != NULL) return GA_NO_ERROR; - e = load_libcublas(ctx->major, ctx->minor); - if (e != GA_NO_ERROR) - return e; - handle = calloc(1, sizeof(*handle)); if (handle == NULL) return GA_MEMORY_ERROR; diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 9e76475448..736ce31e1d 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -3,6 +3,7 @@ #include "private.h" #include "private_cuda.h" #include "loaders/libnvrtc.h" +#include "loaders/libcublas.h" #include @@ -1512,6 +1513,11 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, 
return GA_NO_ERROR; case GA_CTX_PROP_BLAS_OPS: + { + int e = load_libcublas(major, minor); + if (e != GA_NO_ERROR) + return e; + } *((gpuarray_blas_ops **)res) = &cublas_ops; return GA_NO_ERROR; From 9f789689efa5a81b12234fb624301d7e86065c45 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 30 Nov 2016 19:06:24 -0500 Subject: [PATCH 142/597] Fix compile error. --- src/gpuarray_blas_cuda_cublas.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 2acabff154..cd1a0c053f 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -173,6 +173,7 @@ static int setup(gpucontext *c) { const char *tmp[2]; cublasStatus_t err; int types[10]; + int e; if (ctx->blas_handle != NULL) return GA_NO_ERROR; From 3002f622c4accaa34504bc1aaf42d78b4b7b7eaf Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 30 Nov 2016 19:24:02 -0500 Subject: [PATCH 143/597] Add an API version since that is a completely different notion to the ABI version. --- src/gpuarray/config.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gpuarray/config.h b/src/gpuarray/config.h index 5e43074d79..313044c01f 100644 --- a/src/gpuarray/config.h +++ b/src/gpuarray/config.h @@ -1,6 +1,8 @@ #ifndef GPUARRAY_CONFIG #define GPUARRAY_CONFIG +#define GPUARRAY_API_VERSION 0 + #ifdef GPUARRAY_SHARED #ifdef _WIN32 #ifdef GPUARRAY_BUILDING_DLL From 73f617e22192d9f037541bf898d4609ed912b3eb Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 30 Nov 2016 19:35:07 -0500 Subject: [PATCH 144/597] Excpose the API version with the existing api_version() function. 
--- pygpu/gpuarray.pxd | 3 +++ pygpu/gpuarray.pyx | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd index c205d8f484..070eef96e2 100644 --- a/pygpu/gpuarray.pxd +++ b/pygpu/gpuarray.pxd @@ -21,6 +21,9 @@ cdef extern from "Python.h": Py_ssize_t *step, Py_ssize_t *slicelength) except -1 +cdef extern from "gpuarray/config.h": + int GPUARRAY_API_VERSION + cdef extern from "gpuarray/types.h": ctypedef struct gpuarray_type: const char *cluda_name diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 8deb2d279a..bb840ea8b2 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -10,8 +10,8 @@ from cpython cimport Py_INCREF, PyNumber_Index from cpython.object cimport Py_EQ, Py_NE def api_version(): - # Those where the last defined numbers. - return (-9997, 1, 0) + # (library version, module version) + return (GPUARRAY_API_VERSION, 0) np.import_array() From 588228903401fa4194fba3cd6e528f01692b70b1 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 30 Nov 2016 19:56:35 -0500 Subject: [PATCH 145/597] Add an object dictionary to GpuContext. --- pygpu/gpuarray.pxd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd index 070eef96e2..629029fe08 100644 --- a/pygpu/gpuarray.pxd +++ b/pygpu/gpuarray.pxd @@ -321,8 +321,10 @@ cdef api GpuArray pygpu_concatenate(const _GpuArray **a, size_t n, object cls, GpuContext context) cdef api class GpuContext [type PyGpuContextType, object PyGpuContextObject]: + cdef dict __dict__ cdef gpucontext* ctx cdef readonly bytes kind + cdef object __weakref__ cdef GpuArray new_GpuArray(object cls, GpuContext ctx, object base) From 1732459da2b469a29c9eda55bbd35195531a18d9 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 1 Dec 2016 10:57:53 -0500 Subject: [PATCH 146/597] Expose the LARGEST_MEMBLOCK property in python. 
--- pygpu/gpuarray.pxd | 1 + pygpu/gpuarray.pyx | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd index 629029fe08..29193ae9fb 100644 --- a/pygpu/gpuarray.pxd +++ b/pygpu/gpuarray.pxd @@ -103,6 +103,7 @@ cdef extern from "gpuarray/buffer.h": int GA_CTX_PROP_MAXGSIZE0 int GA_CTX_PROP_MAXGSIZE1 int GA_CTX_PROP_MAXGSIZE2 + int GA_CTX_PROP_LARGEST_MEMBLOCK int GA_BUFFER_PROP_SIZE diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index bb840ea8b2..7b05152a43 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -1146,6 +1146,13 @@ cdef class GpuContext: ctx_property(self, GA_CTX_PROP_MAXGSIZE2, &res) return res + property largest_memblock: + "Size of the largest memory block you can allocate" + def __get__(self): + cdef size_t res + ctx_property(self, GA_CTX_PROP_LARGEST_MEMBLOCK, &res) + return res + cdef class flags(object): cdef int fl From 637783aea48a2c5d60b2192c34fd5b2f3f83bd01 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 1 Dec 2016 11:19:42 -0500 Subject: [PATCH 147/597] Get rid of most of the cython C code build warnings. 
--- pygpu/gpuarray.pxd | 2 +- pygpu/gpuarray.pyx | 21 +++++++++++---------- setup.py | 9 +++++++++ 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd index 29193ae9fb..260a4b44bc 100644 --- a/pygpu/gpuarray.pxd +++ b/pygpu/gpuarray.pxd @@ -16,7 +16,7 @@ cdef extern from "numpy/arrayobject.h": cdef object PyArray_Empty(int a, np.npy_intp *b, np.dtype c, int d) cdef extern from "Python.h": - int PySlice_GetIndicesEx(slice_object slice, Py_ssize_t length, + int PySlice_GetIndicesEx(object slice, Py_ssize_t length, Py_ssize_t *start, Py_ssize_t *stop, Py_ssize_t *step, Py_ssize_t *slicelength) except -1 diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 7b05152a43..2d2862fc7d 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -235,7 +235,7 @@ cdef int strides_ok(GpuArray a, strides): return 0 upper += max_axis_offset else: - if lower < -max_axis_offset: + if lower < (-max_axis_offset): return 0 lower += max_axis_offset return (upper + itemsize) <= size @@ -874,7 +874,7 @@ def from_gpudata(size_t data, offset, dtype, shape, GpuContext context=None, free(cdims) free(cstrides) -def array(proto, dtype=None, copy=True, order=None, int ndmin=0, +def array(proto, dtype=None, copy=True, order=None, unsigned int ndmin=0, GpuContext context=None, cls=None): """ array(obj, dtype='float64', copy=True, order=None, ndmin=0, context=None, cls=None) @@ -890,7 +890,7 @@ def array(proto, dtype=None, copy=True, order=None, int ndmin=0, :param order: memory layout of the result :type order: string :param ndmin: minimum number of result dimensions - :type ndmin: int + :type ndmin: unsigned int :param context: allocation context :type context: GpuContext :param cls: result class (must inherit from GpuArray) @@ -1384,21 +1384,24 @@ cdef GpuArray pygpu_reshape(GpuArray a, unsigned int nd, const size_t *newdims, if compute_axis < 0: array_reshape(res, a, nd, newdims, ord, nocopy) return res - if compute_axis >= nd: + cdef 
unsigned int caxis = compute_axis + if caxis >= nd: raise ValueError("You wanted us to compute the shape of a dimensions that don't exist") cdef size_t *cdims cdef size_t tot = 1 + cdef unsigned int i for i in range(nd): - if i != compute_axis: + if i != caxis: tot *= newdims[i] cdims = calloc(nd, sizeof(size_t)) if cdims == NULL: raise MemoryError, "could not allocate cdims" + cdef size_t d for i in range(nd): d = newdims[i] - if i == compute_axis: + if i == caxis: d = a.size // tot if d * tot != a.size: @@ -1537,7 +1540,7 @@ cdef class GpuArray: k = PyNumber_Index(key) if k < 0: k += self.ga.dimensions[i] - if k < 0 or k >= self.ga.dimensions[i]: + if k < 0 or (k) >= self.ga.dimensions[i]: raise IndexError, "index %d out of bounds" % (i,) start[0] = k step[0] = 0 @@ -1546,9 +1549,7 @@ cdef class GpuArray: pass if isinstance(key, slice): - # C compiler complains about argument 1 (key) because it's - # declared as a PyObject. But we know it's a slice so it's ok. - PySlice_GetIndicesEx(key, self.ga.dimensions[i], + PySlice_GetIndicesEx(key, self.ga.dimensions[i], start, stop, step, &dummy) if stop[0] < start[0] and step[0] > 0: stop[0] = start[0] diff --git a/setup.py b/setup.py index 105461deb7..3d73ca364b 100644 --- a/setup.py +++ b/setup.py @@ -97,11 +97,17 @@ def __init__(self, *args, **kwargs): fullversion = "%s" """ % (MAJOR, MINOR, PATCH, SUFFIX, FULLVERSION)) +ea = [] +if sys.platform in ('darwin', 'linux'): + # Silence unused stuff warnings + ea = ["-Wno-unused-variable", "-Wno-unused-function"] + exts = [Extension('pygpu.gpuarray', sources=['pygpu/gpuarray.pyx'], include_dirs=include_dirs, libraries=['gpuarray'], library_dirs=library_dirs, + extra_compile_args=ea, define_macros=[('GPUARRAY_SHARED', None)] ), Extension('pygpu.blas', @@ -109,6 +115,7 @@ def __init__(self, *args, **kwargs): include_dirs=include_dirs, libraries=['gpuarray'], library_dirs=library_dirs, + extra_compile_args=ea, define_macros=[('GPUARRAY_SHARED', None)] ), 
Extension('pygpu._elemwise', @@ -116,6 +123,7 @@ def __init__(self, *args, **kwargs): include_dirs=include_dirs, libraries=['gpuarray'], library_dirs=library_dirs, + extra_compile_args=ea, define_macros=[('GPUARRAY_SHARED', None)] ), Extension('pygpu.collectives', @@ -123,6 +131,7 @@ def __init__(self, *args, **kwargs): include_dirs=include_dirs, libraries=['gpuarray'], library_dirs=library_dirs, + extra_compile_args=ea, define_macros=[('GPUARRAY_SHARED', None)] )] From ef5046b00c0b3a149ebb5c47a22c8f82dee7e2b9 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 2 Dec 2016 11:37:39 -0500 Subject: [PATCH 148/597] Bump the required version of cython for the dict change. --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 3d73ca364b..7e2b2e2f03 100644 --- a/setup.py +++ b/setup.py @@ -11,9 +11,9 @@ try: import Cython - if Cython.__version__ < '0.21': + if Cython.__version__ < '0.25': raise Exception('cython is too old or not installed ' - '(at least 0.21 required)') + '(at least 0.25 required)') from Cython.Build import cythonize have_cython = True except Exception: From 22504ebe4a64dc4f9346e1e174941b029ff2f3ab Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 2 Dec 2016 11:40:42 -0500 Subject: [PATCH 149/597] Also bump requirement in the docs. --- doc/installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/installation.rst b/doc/installation.rst index add2fbe675..125697169d 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -21,7 +21,7 @@ Build Requirements - (optional) libcheck (check_) to run the C tests. - (optional) python (python_) for the python bindings. - (optional) mako (mako_) for development or running the python bindings. - - (optional) Cython >= 0.21 (cython_) for the python bindings. + - (optional) Cython >= 0.25 (cython_) for the python bindings. - (optional) nosetests (nosetests_) to run the python tests. 
Run Requirements From adfb8bfbe7e85509d43ccbd6833916691d79486b Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 2 Dec 2016 15:37:05 -0500 Subject: [PATCH 150/597] Speedup the cython implementation of setitem. --- pygpu/gpuarray.pxd | 1 + pygpu/gpuarray.pyx | 13 ++++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd index 260a4b44bc..2f4b5788f3 100644 --- a/pygpu/gpuarray.pxd +++ b/pygpu/gpuarray.pxd @@ -337,6 +337,7 @@ cdef api class GpuArray [type PyGpuArrayType, object PyGpuArrayObject]: cdef __index_helper(self, key, unsigned int i, ssize_t *start, ssize_t *stop, ssize_t *step) + cdef __cgetitem__(self, idx) cdef api class GpuKernel [type PyGpuKernelType, object PyGpuKernelObject]: cdef _GpuKernel k diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 2d2862fc7d..51fa9da0ec 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -907,6 +907,10 @@ def array(proto, dtype=None, copy=True, order=None, unsigned int ndmin=0, This function is similar to :meth:`numpy.array` except that it returns GpuArrays. 
""" + return carray(proto, dtype, copy, order, ndmin, context, cls) + +cdef carray(proto, dtype, copy, order, unsigned int ndmin, + GpuContext context, cls): cdef GpuArray res cdef GpuArray arg cdef GpuArray tmp @@ -1823,6 +1827,9 @@ cdef class GpuArray: raise TypeError, "len() of unsized object" def __getitem__(self, key): + return self.__cgetitem__(key) + + cdef __cgetitem__(self, key): cdef ssize_t *starts cdef ssize_t *stops cdef ssize_t *steps @@ -1886,9 +1893,9 @@ cdef class GpuArray: free(steps) def __setitem__(self, idx, v): - cdef GpuArray tmp = self.__getitem__(idx) - cdef GpuArray gv = asarray(v, dtype=self.dtype, - context=self.context) + cdef GpuArray tmp = self.__cgetitem__(idx) + cdef GpuArray gv = carray(v, self.ga.typecode, False, 'A', 0, + self.context, GpuArray) array_setarray(tmp, gv) From fe392cf116bae9768e72f91d18c08b02ad7bf58e Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Sat, 10 Dec 2016 09:24:03 +0100 Subject: [PATCH 151/597] Stop returning allocated memory from the library to avoid problems on windows. 
--- pygpu/gpuarray.pyx | 22 ++++++---------------- src/gpuarray/buffer.h | 12 +++++------- src/gpuarray_buffer_cuda.c | 32 ++++---------------------------- src/gpuarray_buffer_opencl.c | 17 ++--------------- 4 files changed, 17 insertions(+), 66 deletions(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 51fa9da0ec..83acb4084f 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -1036,28 +1036,18 @@ cdef class GpuContext: property devname: "Device name for this context" def __get__(self): - cdef char *tmp - cdef unicode res + cdef char tmp[256] - ctx_property(self, GA_CTX_PROP_DEVNAME, &tmp) - try: - res = tmp.decode('ascii') - finally: - free(tmp) - return res + ctx_property(self, GA_CTX_PROP_DEVNAME, tmp) + return tmp.decode('ascii') property pcibusid: "Device PCI Bus ID for this context" def __get__(self): - cdef char *tmp - cdef unicode res + cdef char tmp[16] - ctx_property(self, GA_CTX_PROP_PCIBUSID, &tmp) - try: - res = tmp.decode('ascii') - finally: - free(tmp) - return res + ctx_property(self, GA_CTX_PROP_PCIBUSID, tmp) + return tmp.decode('ascii') property maxlsize: "Maximum size of thread block (local size) for this context" diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index aedb669d1e..fb8970781a 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -494,7 +494,9 @@ GPUARRAY_PUBLIC int gpukernel_call(gpukernel *k, unsigned int n, size_t shared, void **args); /** - * Get the kernel binary. + * (Deprecated) Get the kernel binary. + * + * This function is deprecated and will be removed in the next release. * * This can be use to cache kernel binaries after compilation of a * specific device. The kernel can be recreated by calling @@ -537,9 +539,7 @@ GPUARRAY_PUBLIC gpucontext *gpukernel_context(gpukernel *k); /** * Get the device name for the context. * - * \note The returned string is allocated and must be freed by the caller. 
- * - * Type: `char *` + * Type: `char [256]` */ #define GA_CTX_PROP_DEVNAME 1 @@ -683,9 +683,7 @@ GPUARRAY_PUBLIC gpucontext *gpukernel_context(gpukernel *k); /** * Get the device PCI Bus ID for the context. * - * \note The returned string is allocated and must be freed by the caller. - * - * Type: `char *` + * Type: `char [16]` */ #define GA_CTX_PROP_PCIBUSID 19 diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 736ce31e1d..500eb147c1 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1389,7 +1389,6 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, } switch (prop_id) { - char *s; CUdevice id; int i; size_t sz; @@ -1401,21 +1400,9 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, cuda_exit(ctx); return GA_IMPL_ERROR; } - /* 256 is what the CUDA API uses so it's good enough for me */ - s = malloc(256); - if (s == NULL) { - cuda_exit(ctx); - return GA_MEMORY_ERROR; - } - ctx->err = cuDeviceGetName(s, 256, id); - if (ctx->err != CUDA_SUCCESS) { - free(s); - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - *((char **)res) = s; + ctx->err = cuDeviceGetName((char *)res, 256, id); cuda_exit(ctx); - return GA_NO_ERROR; + return (ctx->err != CUDA_SUCCESS) ? GA_IMPL_ERROR : GA_NO_ERROR; case GA_CTX_PROP_PCIBUSID: cuda_enter(ctx); @@ -1424,20 +1411,9 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, cuda_exit(ctx); return GA_IMPL_ERROR; } - s = malloc(13); - if (s == NULL) { - cuda_exit(ctx); - return GA_MEMORY_ERROR; - } - ctx->err = cuDeviceGetPCIBusId(s, 13, id); - if (ctx->err != CUDA_SUCCESS) { - free(s); - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - *((char **)res) = s; + ctx->err = cuDeviceGetPCIBusId((char *)res, 13, id); cuda_exit(ctx); - return GA_NO_ERROR; + return (ctx->err != CUDA_SUCCESS) ? 
GA_IMPL_ERROR : GA_NO_ERROR; case GA_CTX_PROP_LARGEST_MEMBLOCK: *((size_t *)res) = largest_size(ctx); diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 2afa6e962d..f744084af3 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -1157,7 +1157,6 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, } switch (prop_id) { - char *s; size_t sz; size_t *psz; cl_device_id id; @@ -1168,23 +1167,11 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, &id, NULL); if (ctx->err != CL_SUCCESS) return GA_IMPL_ERROR; - ctx->err = clGetDeviceInfo(id, CL_DEVICE_NAME, 0, NULL, &sz); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; - s = malloc(sz); - if (s == NULL) - return GA_MEMORY_ERROR; - ctx->err = clGetDeviceInfo(id, CL_DEVICE_NAME, sz, s, NULL); - if (ctx->err != CL_SUCCESS) { - free(s); - return GA_IMPL_ERROR; - } - *((char **)res) = s; - return GA_NO_ERROR; + ctx->err = clGetDeviceInfo(id, CL_DEVICE_NAME, 256, (char *)res, NULL); + return (ctx->err != CL_SUCCESS) ? GA_IMPL_ERROR : GA_NO_ERROR; case GA_CTX_PROP_PCIBUSID: /* For the moment, PCI Bus ID is not supported for OpenCL. */ - *((void **)res) = NULL; return GA_DEVSUP_ERROR; case GA_CTX_PROP_MAXLSIZE: From 29c8e1e40cc8d8d300297cdcae42731af457ad89 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Sun, 11 Dec 2016 15:24:55 +0100 Subject: [PATCH 152/597] Bump the shared library version. 
--- src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ec560d217a..88514ef683 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -88,7 +88,7 @@ set_target_properties(gpuarray PROPERTIES INSTALL_NAME_DIR ${CMAKE_INSTALL_PREFIX}/lib MACOSX_RPATH OFF # This is the shared library version - VERSION 0.1 + VERSION 1.0 ) add_library(gpuarray-static STATIC ${GPUARRAY_SRC}) From 051ee13a61a7c4ff8260a6b6275eb76639e04662 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 5 Jan 2017 14:20:01 -0500 Subject: [PATCH 153/597] Add a travis.yml that build the project. --- .travis.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000000..d188d7ac90 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,17 @@ +language: c + +compiler: + - clang + - gcc + +# Build with Debug and Release to flush out build problems +script: + - mkdir Debug + - cd Debug + - cmake .. -DCMAKE_BUILD_TYPE=Debug + - make + - cd .. + - mkdir Release + - cd Release + - cmake .. -DCMAKE_BUILD_TYPE=Release + - make From 892caede5963b173070d6129f3bbd76dcfeb2246 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 5 Jan 2017 14:28:44 -0500 Subject: [PATCH 154/597] Install cmake >= 3.0 --- .travis.yml | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index d188d7ac90..a583321fbb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,17 +1,26 @@ language: c +addons: + apt: + sources: + - kalakris-cmake + - george-edison55-precise-backports + packages: + - cmake + - cmake-data + compiler: - clang - gcc # Build with Debug and Release to flush out build problems script: - - mkdir Debug - - cd Debug - - cmake .. -DCMAKE_BUILD_TYPE=Debug - - make - - cd .. - - mkdir Release - - cd Release - - cmake .. 
-DCMAKE_BUILD_TYPE=Release - - make + - mkdir Debug + - cd Debug + - cmake .. -DCMAKE_BUILD_TYPE=Debug + - make + - cd .. + - mkdir Release + - cd Release + - cmake .. -DCMAKE_BUILD_TYPE=Release + - make From ed1ac6a613a439b17058bc5292af468062607572 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 5 Jan 2017 14:36:13 -0500 Subject: [PATCH 155/597] Add testing on macOS. --- .travis.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.travis.yml b/.travis.yml index a583321fbb..6635b2cd43 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,9 @@ language: c +os: + - linux + - osx + addons: apt: sources: @@ -9,6 +13,9 @@ addons: - cmake - cmake-data +#before_install: +# - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update && brew install cmake; fi + compiler: - clang - gcc From 0bcf2984f265bd3de91ab720c1b3d121c65b8e91 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 5 Jan 2017 14:56:06 -0500 Subject: [PATCH 156/597] Don't try gcc on macOS. --- .travis.yml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6635b2cd43..6802500f08 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,8 +1,13 @@ language: c -os: - - linux - - osx +matrix: + include: + - os: linux + compiler: gcc + - os: linux + compiler: clang + - os: osx + compiler: clang addons: apt: @@ -16,10 +21,6 @@ addons: #before_install: # - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update && brew install cmake; fi -compiler: - - clang - - gcc - # Build with Debug and Release to flush out build problems script: - mkdir Debug From 6c6c5bf44f70b29dc60974d7e90bc06ee4665435 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 9 Jan 2017 14:30:29 -0500 Subject: [PATCH 157/597] Add message for /usr/local for macOS. 
--- doc/installation.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/installation.rst b/doc/installation.rst index 125697169d..3894c7cf12 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -131,6 +131,11 @@ macOS. It might be possible to use a version of gcc built using Homebrew or MacPorts, but this is untested and unsupported. +It appears that on some versions, /usr/local is not in the default +compiler paths so you might need to add ``-L /usr/local/lib -I +/usr/local/include`` to the command line to build the python module. + + Windows-specific instructions ----------------------------- From 6ad7503f4bb2c022043c7ff1f799b7390bf7aaa5 Mon Sep 17 00:00:00 2001 From: J Rao Date: Wed, 11 Jan 2017 10:38:49 +0800 Subject: [PATCH 158/597] Fix vs2013 syntax error --- src/gpuarray/config.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gpuarray/config.h b/src/gpuarray/config.h index 313044c01f..2da491ff11 100644 --- a/src/gpuarray/config.h +++ b/src/gpuarray/config.h @@ -29,6 +29,8 @@ #include #if _MSC_VER < 1600 #include +#else +#include #endif #define ssize_t intptr_t #define SSIZE_MAX INTPTR_MAX From 61fe4798259d70f7a019176bf923b526ecb13157 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Wed, 11 Jan 2017 17:04:56 -0500 Subject: [PATCH 159/597] Add ABI version handling in C code and Python code (#323) * Add GPUARRAY_ABI_VERSION constant and pygpu.gpuarray.abi_version() method. A new header file (abi_version.h) auto-generated by CMake has been added. * Add auto-generated file `src/gpuarray/abi_version.h` to .gitignore. * Make GPUARRAY_ABI_VERSION a number (1000*major + minor) Change ABI minor version (1.0 -> 1.1) to make some tests. * Add auto-generated CMake SOVERSION variable (equals to the first part of VERSION). Now the built library file is named libgpuarray.so.SOVERSION, that is libgpuarray.so.ABIMAJOR. * Back to right current version. 
--- .gitignore | 1 + pygpu/gpuarray.pxd | 1 + pygpu/gpuarray.pyx | 5 +++++ src/CMakeLists.txt | 14 ++++++++++++++ src/gpuarray/config.h | 2 ++ 5 files changed, 23 insertions(+) diff --git a/.gitignore b/.gitignore index fe902dc679..164503e2ae 100644 --- a/.gitignore +++ b/.gitignore @@ -20,5 +20,6 @@ distribute*tar.gz pygpu/*.c pygpu/*.h pygpu/version.py +src/gpuarray/abi_version.h src/private_config.h Makefile.conf diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd index 2f4b5788f3..c305d1dcfe 100644 --- a/pygpu/gpuarray.pxd +++ b/pygpu/gpuarray.pxd @@ -23,6 +23,7 @@ cdef extern from "Python.h": cdef extern from "gpuarray/config.h": int GPUARRAY_API_VERSION + int GPUARRAY_ABI_VERSION cdef extern from "gpuarray/types.h": ctypedef struct gpuarray_type: diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 83acb4084f..ef2fb818e7 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -13,6 +13,11 @@ def api_version(): # (library version, module version) return (GPUARRAY_API_VERSION, 0) +def abi_version(): + major_version = GPUARRAY_ABI_VERSION / 1000 + minor_version = GPUARRAY_ABI_VERSION % 1000 + return (major_version, minor_version) + np.import_array() # to export the numeric value diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 88514ef683..029148050d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -96,6 +96,19 @@ add_library(gpuarray-static STATIC ${GPUARRAY_SRC}) target_link_libraries(gpuarray ${CMAKE_DL_LIBS}) target_link_libraries(gpuarray-static ${CMAKE_DL_LIBS}) +# Generate gpuarray/abi_version.h that contains the ABI version number. +get_target_property(GPUARRAY_ABI_VERSION gpuarray VERSION) +string(REPLACE "." 
";" GPUARRAY_ABI_VERSION_NUMBERS ${GPUARRAY_ABI_VERSION}) +list(GET GPUARRAY_ABI_VERSION_NUMBERS 0 GPUARRAY_ABI_VERSION_MAJOR) +list(GET GPUARRAY_ABI_VERSION_NUMBERS 1 GPUARRAY_ABI_VERSION_MINOR) +math(EXPR GPUARRAY_ABI_NUMBER "1000*${GPUARRAY_ABI_VERSION_MAJOR} + ${GPUARRAY_ABI_VERSION_MINOR}") +FILE(WRITE gpuarray/abi_version.h +"\#ifndef GPUARRAY_ABI_VERSION\n\#define GPUARRAY_ABI_VERSION ${GPUARRAY_ABI_NUMBER}\n\#endif\n" +) + +# set SOVERSION and ensure it is the first part of VERSION. +set_property(TARGET gpuarray PROPERTY SOVERSION ${GPUARRAY_ABI_VERSION_MAJOR}) + set(headers gpuarray/array.h gpuarray/blas.h @@ -103,6 +116,7 @@ set(headers gpuarray/buffer.h gpuarray/buffer_blas.h gpuarray/buffer_collectives.h + gpuarray/abi_version.h gpuarray/config.h gpuarray/elemwise.h gpuarray/error.h diff --git a/src/gpuarray/config.h b/src/gpuarray/config.h index 2da491ff11..4eb58b401d 100644 --- a/src/gpuarray/config.h +++ b/src/gpuarray/config.h @@ -1,6 +1,8 @@ #ifndef GPUARRAY_CONFIG #define GPUARRAY_CONFIG +/* The following included file should have been generated by CMake. */ +#include #define GPUARRAY_API_VERSION 0 #ifdef GPUARRAY_SHARED From 7d7bf1dbc005fd2a4d016f5f277b5f121c2d93da Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 9 Jan 2017 15:45:41 -0500 Subject: [PATCH 160/597] Add a mention to setup.py. --- doc/installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/installation.rst b/doc/installation.rst index 3894c7cf12..a8a81111e3 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -133,7 +133,7 @@ MacPorts, but this is untested and unsupported. It appears that on some versions, /usr/local is not in the default compiler paths so you might need to add ``-L /usr/local/lib -I -/usr/local/include`` to the command line to build the python module. +/usr/local/include`` to the ``setup.py build`` command. 
Windows-specific instructions From 6d449061c3ce0369474e6b39cbe14dc4d070cab9 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 12 Jan 2017 13:32:32 -0500 Subject: [PATCH 161/597] Switch the order of gs and ls to conform to what the underlying APIs use. --- src/CMakeLists.txt | 2 +- src/gpuarray/buffer.h | 4 ++-- src/gpuarray/config.h | 2 +- src/gpuarray/kernel.h | 8 ++++---- src/gpuarray_array.c | 4 ++-- src/gpuarray_blas_cuda_cublas.c | 12 ++++++------ src/gpuarray_buffer.c | 6 +++--- src/gpuarray_buffer_cuda.c | 8 ++++---- src/gpuarray_buffer_opencl.c | 6 +++--- src/gpuarray_elemwise.c | 8 ++++---- src/gpuarray_kernel.c | 6 +++--- src/gpuarray_reduction.c | 2 +- src/private.h | 2 +- 13 files changed, 35 insertions(+), 35 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 029148050d..02e32eccd4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -88,7 +88,7 @@ set_target_properties(gpuarray PROPERTIES INSTALL_NAME_DIR ${CMAKE_INSTALL_PREFIX}/lib MACOSX_RPATH OFF # This is the shared library version - VERSION 1.0 + VERSION 2.0 ) add_library(gpuarray-static STATIC ${GPUARRAY_SRC}) diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index fb8970781a..d6d3dd8a09 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -482,15 +482,15 @@ GPUARRAY_PUBLIC int gpukernel_setarg(gpukernel *k, unsigned int i, void *a); * * \param k kernel * \param n number of dimensions of grid/block - * \param bs block sizes for this call (also known as local size) * \param gs grid sizes for this call (also known as global size) + * \param ls block sizes for this call (also known as local size) * \param shared amount of dynamic shared memory to reserve * \param args table of pointers to each argument (optional). * * \returns GA_NO_ERROR or an error code if an error occurred. 
*/ GPUARRAY_PUBLIC int gpukernel_call(gpukernel *k, unsigned int n, - const size_t *ls, const size_t *gs, + const size_t *gs, const size_t *ls, size_t shared, void **args); /** diff --git a/src/gpuarray/config.h b/src/gpuarray/config.h index 4eb58b401d..f8fc86a01d 100644 --- a/src/gpuarray/config.h +++ b/src/gpuarray/config.h @@ -3,7 +3,7 @@ /* The following included file should have been generated by CMake. */ #include <gpuarray/abi_version.h> -#define GPUARRAY_API_VERSION 0 +#define GPUARRAY_API_VERSION 1 #ifdef GPUARRAY_SHARED #ifdef _WIN32 diff --git a/src/gpuarray/kernel.h b/src/gpuarray/kernel.h index 82d4f74edf..f88d74ffc6 100644 --- a/src/gpuarray/kernel.h +++ b/src/gpuarray/kernel.h @@ -87,24 +87,24 @@ GPUARRAY_PUBLIC int GpuKernel_setarg(GpuKernel *k, unsigned int i, void *val); * * \param k the kernel to schedule for * \param n number of elements to handle - * \param ls local size (in/out) * \param gs grid size (in/out) + * \param ls local size (in/out) */ GPUARRAY_PUBLIC int GpuKernel_sched(GpuKernel *k, size_t n, - size_t *ls, size_t *gs); + size_t *gs, size_t *ls); /** * Launch the execution of a kernel.
* * \param k the kernel to launch * \param n dimensionality of the grid/blocks - * \param ls sizes of launch blocks * \param gs sizes of launch grid + * \param ls sizes of launch blocks * \param amount of dynamic shared memory to allocate * \param args table of pointers to arguments */ GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, unsigned int n, - const size_t *ls, const size_t *gs, + const size_t *gs, const size_t *ls, size_t shared, void **args); GPUARRAY_PUBLIC int GpuKernel_binary(const GpuKernel *k, size_t *sz, diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index eef077e6d5..434c641ae2 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -487,7 +487,7 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i, if (err != GA_NO_ERROR) return err; - err = GpuKernel_sched(&k, n[0]*n[1], &ls[1], &gs[1]); + err = GpuKernel_sched(&k, n[0]*n[1], &gs[1], &ls[1]); if (err != GA_NO_ERROR) goto out; @@ -521,7 +521,7 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i, GpuKernel_setarg(&k, argp++, &n[1]); GpuKernel_setarg(&k, argp++, errbuf); - err = GpuKernel_call(&k, 2, ls, gs, 0, NULL); + err = GpuKernel_call(&k, 2, gs, ls, 0, NULL); if (check_error && err == GA_NO_ERROR) { err = gpudata_read(&kerr, errbuf, 0, sizeof(int)); if (err == GA_NO_ERROR && kerr != 0) { diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 9354d057a1..6d4648e232 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -1099,9 +1099,9 @@ static int sgemvBatch(cb_order order, cb_transpose transA, args[8] = &N; if (transA == cb_no_trans) { - err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_N_a1_b1_small, 2, ls, gs, 0, args); + err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_N_a1_b1_small, 2, gs, ls, 0, args); } else { - err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_T_a1_b1_small, 2, ls, gs, 0, args); + err = 
GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_T_a1_b1_small, 2, gs, ls, 0, args); } cuda_ops.buffer_release(Aa); @@ -1223,9 +1223,9 @@ static int dgemvBatch(cb_order order, cb_transpose transA, args[8] = &N; if (transA == cb_no_trans) { - err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small, 2, ls, gs, 0, args); + err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small, 2, gs, ls, 0, args); } else { - err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small, 2, ls, gs, 0, args); + err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small, 2, gs, ls, 0, args); } cuda_ops.buffer_release(Aa); @@ -1486,7 +1486,7 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, args[8] = &M; args[9] = &N; - err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, ls, gs, 0, args); + err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, gs, ls, 0, args); cuda_ops.buffer_release(Aa); cuda_ops.buffer_release(xa); @@ -1618,7 +1618,7 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, args[8] = &M; args[9] = &N; - err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, ls, gs, 0, args); + err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, gs, ls, 0, args); cuda_ops.buffer_release(Aa); cuda_ops.buffer_release(xa); diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index 14f792e453..a4dfd3329b 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -180,9 +180,9 @@ int gpukernel_setarg(gpukernel *k, unsigned int i, void *a) { return ((partial_gpukernel *)k)->ctx->ops->kernel_setarg(k, i, a); } -int gpukernel_call(gpukernel *k, unsigned int n, const size_t *ls, - const size_t *gs, size_t shared, void **args) { - return ((partial_gpukernel *)k)->ctx->ops->kernel_call(k, n, ls, gs, +int gpukernel_call(gpukernel *k, unsigned int n, const 
size_t *gs, + const size_t *ls, size_t shared, void **args) { + return ((partial_gpukernel *)k)->ctx->ops->kernel_call(k, n, gs, ls, shared, args); } diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 500eb147c1..120919c72a 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1237,7 +1237,7 @@ static int cuda_kernelsetarg(gpukernel *k, unsigned int i, void *arg) { } static int cuda_callkernel(gpukernel *k, unsigned int n, - const size_t *bs, const size_t *gs, + const size_t *gs, const size_t *ls, size_t shared, void **args) { cuda_context *ctx = k->ctx; unsigned int i; @@ -1258,15 +1258,15 @@ static int cuda_callkernel(gpukernel *k, unsigned int n, switch (n) { case 1: - ctx->err = cuLaunchKernel(k->k, gs[0], 1, 1, bs[0], 1, 1, shared, + ctx->err = cuLaunchKernel(k->k, gs[0], 1, 1, ls[0], 1, 1, shared, ctx->s, args, NULL); break; case 2: - ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, bs[0], bs[1], 1, shared, + ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, ls[0], ls[1], 1, shared, ctx->s, args, NULL); break; case 3: - ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], gs[2], bs[0], bs[1], bs[2], + ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], gs[2], ls[0], ls[1], ls[2], shared, ctx->s, args, NULL); break; default: diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index f744084af3..7b8d684c7c 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -274,7 +274,7 @@ cl_mem cl_get_buf(gpudata *g) { ASSERT_BUF(g); return g->buf; } static void cl_releasekernel(gpukernel *k); static int cl_callkernel(gpukernel *k, unsigned int n, - const size_t *bs, const size_t *gs, + const size_t *gs, const size_t *ls, size_t shared, void **args); static const char CL_PREAMBLE[] = @@ -748,7 +748,7 @@ static int cl_memset(gpudata *dst, size_t offset, int data) { if (res != GA_NO_ERROR) goto fail; gs = ((n-1) / ls) + 1; args[0] = dst; - res = cl_callkernel(m, 1, &ls, &gs, 0, args); + res = 
cl_callkernel(m, 1, &gs, &ls, 0, args); fail: cl_releasekernel(m); @@ -998,7 +998,7 @@ static int cl_setkernelarg(gpukernel *k, unsigned int i, void *a) { } static int cl_callkernel(gpukernel *k, unsigned int n, - const size_t *ls, const size_t *gs, + const size_t *gs, const size_t *ls, size_t shared, void **args) { cl_ctx *ctx = k->ctx; size_t _gs[3]; diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index 1bb05bbb7e..f3ce7ee261 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -414,10 +414,10 @@ static int call_basic(GpuElemwise *ge, void **args, size_t n, unsigned int nd, } } - err = GpuKernel_sched(k, n, &ls, &gs); + err = GpuKernel_sched(k, n, &gs, &ls); if (err != GA_NO_ERROR) goto error; - err = GpuKernel_call(k, 1, &ls, &gs, 0, NULL); + err = GpuKernel_call(k, 1, &gs, &ls, 0, NULL); error: return err; } @@ -572,9 +572,9 @@ static int call_contig(GpuElemwise *ge, void **args, size_t n) { if (err != GA_NO_ERROR) return err; } } - err = GpuKernel_sched(&ge->k_contig, n, &ls, &gs); + err = GpuKernel_sched(&ge->k_contig, n, &gs, &ls); if (err != GA_NO_ERROR) return err; - return GpuKernel_call(&ge->k_contig, 1, &ls, &gs, 0, NULL); + return GpuKernel_call(&ge->k_contig, 1, &gs, &ls, 0, NULL); } GpuElemwise *GpuElemwise_new(gpucontext *ctx, diff --git a/src/gpuarray_kernel.c b/src/gpuarray_kernel.c index 8beea94150..58311c86bb 100644 --- a/src/gpuarray_kernel.c +++ b/src/gpuarray_kernel.c @@ -32,7 +32,7 @@ gpucontext *GpuKernel_context(GpuKernel *k) { return gpukernel_context(k->k); } -int GpuKernel_sched(GpuKernel *k, size_t n, size_t *ls, size_t *gs) { +int GpuKernel_sched(GpuKernel *k, size_t n, size_t *gs, size_t *ls) { size_t min_l; size_t max_l; size_t target_l; @@ -90,9 +90,9 @@ int GpuKernel_setarg(GpuKernel *k, unsigned int i, void *a) { } int GpuKernel_call(GpuKernel *k, unsigned int n, - const size_t *bs, const size_t *gs, + const size_t *gs, const size_t *ls, size_t shared, void **args) { - return gpukernel_call(k->k, 
n, bs, gs, shared, args); + return gpukernel_call(k->k, n, gs, ls, shared, args); } int GpuKernel_binary(const GpuKernel *k, size_t *sz, void **bin) { diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 15391bad69..12eedb24a9 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -815,8 +815,8 @@ static int maxandargmaxInvoke (maxandargmax_ctx* ctx){ ctx->dstArgmaxStepsGD){ ctx->ret = GpuKernel_call(&ctx->kernel, ctx->ndh>0 ? ctx->ndh : 1, - ctx->blockSize, ctx->gridSize, + ctx->blockSize, 0, args); }else{ diff --git a/src/private.h b/src/private.h index 57d919be88..0513df8605 100644 --- a/src/private.h +++ b/src/private.h @@ -97,7 +97,7 @@ struct _gpuarray_buffer_ops { void (*kernel_release)(gpukernel *k); int (*kernel_setarg)(gpukernel *k, unsigned int i, void *a); int (*kernel_call)(gpukernel *k, unsigned int n, - const size_t *bs, const size_t *gs, + const size_t *gs, const size_t *ls, size_t shared, void **args); int (*kernel_binary)(gpukernel *k, size_t *sz, void **obj); From 67cb9fccf4c6899d2d0d3194b7dc84ad302c0e00 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 12 Jan 2017 13:40:43 -0500 Subject: [PATCH 162/597] Adapt pygpu to the order change. 
--- pygpu/gpuarray.pxd | 10 +++++----- pygpu/gpuarray.pyx | 24 ++++++++++++------------ pygpu/reduction.py | 2 +- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd index c305d1dcfe..a4c83b0e2e 100644 --- a/pygpu/gpuarray.pxd +++ b/pygpu/gpuarray.pxd @@ -127,9 +127,9 @@ cdef extern from "gpuarray/kernel.h": unsigned int argcount, const int *types, int flags, char **err_str) void GpuKernel_clear(_GpuKernel *k) gpucontext *GpuKernel_context(_GpuKernel *k) - int GpuKernel_sched(_GpuKernel *k, size_t n, size_t *ls, size_t *gs) + int GpuKernel_sched(_GpuKernel *k, size_t n, size_t *gs, size_t *ls) int GpuKernel_call(_GpuKernel *k, unsigned int n, - const size_t *ls, const size_t *gs, + const size_t *gs, const size_t *ls, size_t shared, void **args) int GpuKernel_binary(_GpuKernel *, size_t *, void **) @@ -265,9 +265,9 @@ cdef int kernel_init(GpuKernel k, gpucontext *ctx, int flags) except -1 cdef int kernel_clear(GpuKernel k) except -1 cdef gpucontext *kernel_context(GpuKernel k) except NULL -cdef int kernel_sched(GpuKernel k, size_t n, size_t *ls, size_t *gs) except -1 +cdef int kernel_sched(GpuKernel k, size_t n, size_t *gs, size_t *ls) except -1 cdef int kernel_call(GpuKernel k, unsigned int n, - const size_t *ls, const size_t *gs, + const size_t *gs, const size_t *ls, size_t shared, void **args) except -1 cdef int kernel_binary(GpuKernel k, size_t *, void **) except -1 cdef int kernel_property(GpuKernel k, int prop_id, void *res) except -1 @@ -346,5 +346,5 @@ cdef api class GpuKernel [type PyGpuKernelType, object PyGpuKernelObject]: cdef void **callbuf cdef object __weakref__ - cdef do_call(self, py_n, py_ls, py_gs, py_args, size_t shared) + cdef do_call(self, py_n, py_gs, py_ls, py_args, size_t shared) cdef _setarg(self, unsigned int index, int typecode, object o) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index ef2fb818e7..ca19ed907a 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -446,16 
+446,16 @@ cdef gpucontext *kernel_context(GpuKernel k) except NULL: raise GpuArrayException, "Invalid kernel or destroyed context" return res -cdef int kernel_sched(GpuKernel k, size_t n, size_t *ls, size_t *gs) except -1: +cdef int kernel_sched(GpuKernel k, size_t n, size_t *gs, size_t *ls) except -1: cdef int err - err = GpuKernel_sched(&k.k, n, ls, gs) + err = GpuKernel_sched(&k.k, n, gs, ls) if err != GA_NO_ERROR: raise get_exc(err), kernel_error(k, err) -cdef int kernel_call(GpuKernel k, unsigned int n, const size_t *ls, - const size_t *gs, size_t shared, void **args) except -1: +cdef int kernel_call(GpuKernel k, unsigned int n, const size_t *gs, + const size_t *ls, size_t shared, void **args) except -1: cdef int err - err = GpuKernel_call(&k.k, n, ls, gs, shared, args) + err = GpuKernel_call(&k.k, n, gs, ls, shared, args) if err != GA_NO_ERROR: raise get_exc(err), kernel_error(k, err) @@ -2105,10 +2105,10 @@ cdef class GpuKernel: sure to test against the size of your data. If you want more control over thread allocation you can use the - `ls` and `gs` parameters like so:: + `gs` and `ls` parameters like so:: k = GpuKernel(...) - k(param1, param2, ls=ls, gs=gs) + k(param1, param2, gs=gs, ls=ls) If you choose to use this interface, make sure to stay within the limits of `k.maxlsize` and `ctx.maxgsize` or the call will fail. 
@@ -2192,12 +2192,12 @@ cdef class GpuKernel: finally: free(_types) - def __call__(self, *args, n=None, ls=None, gs=None, shared=0): + def __call__(self, *args, n=None, gs=None, ls=None, shared=0): if n == None and (ls == None or gs == None): raise ValueError, "Must specify size (n) or both gs and ls" - self.do_call(n, ls, gs, args, shared) + self.do_call(n, gs, ls, args, shared) - cdef do_call(self, py_n, py_ls, py_gs, py_args, size_t shared): + cdef do_call(self, py_n, py_gs, py_ls, py_args, size_t shared): cdef size_t n cdef size_t gs[3] cdef size_t ls[3] @@ -2264,8 +2264,8 @@ cdef class GpuKernel: if nd != 1: raise ValueError, "n is specified and nd != 1" n = py_n - kernel_sched(self, n, &ls[0], &gs[0]) - kernel_call(self, nd, ls, gs, shared, self.callbuf) + kernel_sched(self, n, &gs[0], &ls[0]) + kernel_call(self, nd, gs, ls, shared, self.callbuf) cdef _setarg(self, unsigned int index, int typecode, object o): if typecode == GA_BUFFER: diff --git a/pygpu/reduction.py b/pygpu/reduction.py index 22f5a9c927..441380dbb5 100644 --- a/pygpu/reduction.py +++ b/pygpu/reduction.py @@ -277,7 +277,7 @@ def __call__(self, *args, **kwargs): kargs.append(offsets[i]) kargs.extend(strs[i]) - k(*kargs, ls=ls, gs=gs) + k(*kargs, gs=gs, ls=ls) return out From 57b4310c46c0fdb8aa9c15ff9aeaf48a8225b6ef Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 12 Jan 2017 14:28:34 -0500 Subject: [PATCH 163/597] Add proper conversion to bool. 
--- pygpu/gpuarray.pyx | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index ef2fb818e7..9d75f97539 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -1657,6 +1657,16 @@ cdef class GpuArray: """ return pygpu_as_ndarray(self) + def __bool__(self): + if self.ga.nd == 0: + return True + if self.ga.nd == 1: + if self.ga.dimensions[0] == 0: + return False + if self.ga.dimensions[0] == 1: + return bool(numpy.asarray(self)) + raise ValueError('The truth value of a multi-element array is ambiguous') + def _empty_like_me(self, dtype=None, order='C'): """ _empty_like_me(dtype=None, order='C') From 8dd3be8e68cf36d308e74c9ccf47d4774abc5ea4 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 12 Jan 2017 17:07:06 -0500 Subject: [PATCH 164/597] Raise an error for float16 in ReductionKernel. --- pygpu/reduction.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pygpu/reduction.py b/pygpu/reduction.py index 22f5a9c927..6a68194952 100644 --- a/pygpu/reduction.py +++ b/pygpu/reduction.py @@ -150,6 +150,11 @@ def __init__(self, context, dtype_out, neutral, reduce_expr, redux, else: self.arguments = arguments + if (self.dtype_out == np.dtype('float16') or + any(ar.dtype == numpy.dtype('float16') + for ar in self.arguments)): + raise UnsupportedError('float16 not supported for the reduction interface') + self.reduce_expr = reduce_expr if map_expr is None: if len(self.arguments) != 1: From 6572d18d4da42f816a47b6c47952389c2d707b90 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 13 Jan 2017 11:14:13 -0500 Subject: [PATCH 165/597] Fix typos/errors. 
--- pygpu/reduction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pygpu/reduction.py b/pygpu/reduction.py index 6a68194952..695e23c813 100644 --- a/pygpu/reduction.py +++ b/pygpu/reduction.py @@ -150,10 +150,10 @@ def __init__(self, context, dtype_out, neutral, reduce_expr, redux, else: self.arguments = arguments - if (self.dtype_out == np.dtype('float16') or + if (self.dtype_out == numpy.dtype('float16') or any(ar.dtype == numpy.dtype('float16') for ar in self.arguments)): - raise UnsupportedError('float16 not supported for the reduction interface') + raise NotImplementedError('float16 not supported for the reduction interface') self.reduce_expr = reduce_expr if map_expr is None: From 612ae2f16c978be3a48156905c10c32bc632946b Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 13 Jan 2017 11:19:06 -0500 Subject: [PATCH 166/597] Add a test for float16 data. --- pygpu/tests/test_reduction.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pygpu/tests/test_reduction.py b/pygpu/tests/test_reduction.py index e24cf9b11e..aad9d556ee 100644 --- a/pygpu/tests/test_reduction.py +++ b/pygpu/tests/test_reduction.py @@ -1,5 +1,7 @@ import numpy +from nose.tools import assert_raises + from pygpu import gpuarray, ndgpuarray as elemary from pygpu.reduction import ReductionKernel @@ -130,3 +132,9 @@ def test_reduction_0d(): rg = g.all() assert numpy.all(rc == numpy.asarray(rg)) + + +def test_reduction_f16(): + c, g = gen_gpuarray((3,), dtype='float16', ctx=context, cls=elemary) + + assert_raises(NotImplementedError, g.sum) From f1f6f1cb2051d0f9999557cbee075d388224d20e Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 16 Jan 2017 11:31:11 -0500 Subject: [PATCH 167/597] =?UTF-8?q?Now=20with=20tests=E2=84=A2=20And=20bet?= =?UTF-8?q?ter=20code.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pygpu/gpuarray.pyx | 14 ++++++-------- pygpu/tests/test_gpu_ndarray.py | 5 +++++ 2 files changed, 
11 insertions(+), 8 deletions(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 9d75f97539..425b332888 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -1658,14 +1658,12 @@ cdef class GpuArray: return pygpu_as_ndarray(self) def __bool__(self): - if self.ga.nd == 0: - return True - if self.ga.nd == 1: - if self.ga.dimensions[0] == 0: - return False - if self.ga.dimensions[0] == 1: - return bool(numpy.asarray(self)) - raise ValueError('The truth value of a multi-element array is ambiguous') + if self.size == 0: + return False + elif self.size == 1: + return bool(numpy.asarray(self)) + else: + raise ValueError('The truth value of a multi-element array is ambiguous') def _empty_like_me(self, dtype=None, order='C'): """ diff --git a/pygpu/tests/test_gpu_ndarray.py b/pygpu/tests/test_gpu_ndarray.py index 34222b4e37..80b7ba0969 100644 --- a/pygpu/tests/test_gpu_ndarray.py +++ b/pygpu/tests/test_gpu_ndarray.py @@ -47,6 +47,11 @@ def test_hash(): assert exc is not None +def test_bool(): + for data in [numpy.empty((0, 33)), [[1]], [[0]], [], [0], [1], 0, 1]: + assert bool(pygpu.asarray(data)) == bool(numpy.asarray(data)) + + def test_transfer(): for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: From ebc7b17e35f1c58d75b2e76d776b66a63139ac42 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 16 Jan 2017 12:05:20 -0500 Subject: [PATCH 168/597] FIx the install of the headers (hopefully!). 
--- setup.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 54b391c71a..0895aa61e3 100755 --- a/setup.py +++ b/setup.py @@ -139,9 +139,10 @@ def __init__(self, *args, **kwargs): version=FULLVERSION, description='numpy-like wrapper on libgpuarray for GPU computations', packages=['pygpu', 'pygpu/tests'], - data_files=[('pygpu', ['pygpu/gpuarray.h', 'pygpu/gpuarray_api.h', - 'pygpu/blas_api.h', 'pygpu/numpy_compat.h', - 'pygpu/collectives.h', 'pygpu/collectives_api.h'])], + include_package_data=True, + package_data={'pygpu': ['gpuarray.h', 'gpuarray_api.h', + 'blas_api.h', 'numpy_compat.h', + 'collectives.h', 'collectives_api.h']}, ext_modules=cythonize(exts), install_requires=['mako>=0.7'], ) From 68b022256f87c6b27a165e2e5db1b489aef926d1 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 16 Jan 2017 12:09:39 -0500 Subject: [PATCH 169/597] Fix the test. --- pygpu/tests/test_gpu_ndarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygpu/tests/test_gpu_ndarray.py b/pygpu/tests/test_gpu_ndarray.py index 80b7ba0969..5853f94dd0 100644 --- a/pygpu/tests/test_gpu_ndarray.py +++ b/pygpu/tests/test_gpu_ndarray.py @@ -49,7 +49,7 @@ def test_hash(): def test_bool(): for data in [numpy.empty((0, 33)), [[1]], [[0]], [], [0], [1], 0, 1]: - assert bool(pygpu.asarray(data)) == bool(numpy.asarray(data)) + assert bool(pygpu.asarray(data, context=ctx)) == bool(numpy.asarray(data)) def test_transfer(): From 7c1b19819cf8a133f358a58e38333729080d9f30 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 16 Jan 2017 13:12:00 -0500 Subject: [PATCH 170/597] Bump the rc version. 
--- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 54b391c71a..c6c8139c9f 100755 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ MAJOR = 0 MINOR = 6 PATCH = 0 -SUFFIX = 'rc1' +SUFFIX = 'rc2' FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX) try: From af65953afaf7d70103332bd83421fba23bfa7dd3 Mon Sep 17 00:00:00 2001 From: Masahiro Wada Date: Tue, 24 Jan 2017 17:00:26 +0900 Subject: [PATCH 171/597] fix GA_DECL_SHARE_PARAM for apple opencl --- src/gpuarray_buffer_opencl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 7b8d684c7c..8e02efd3c3 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -316,7 +316,7 @@ static const char CL_PREAMBLE[] = "#define ga_ssize long\n" "#define load_half(p) vload_half(0, p)\n" "#define store_half(p, v) vstore_half_rtn(v, 0, p)\n" - "#define GA_DECL_SHARED_PARAM(type, name) , __local type name[]\n" + "#define GA_DECL_SHARED_PARAM(type, name) , __local type *name\n" "#define GA_DECL_SHARED_BODY(type, name)\n"; /* XXX: add complex types, quad types, and longlong */ From 1b646c369152297d8ebb91e0564bc557f568738f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 26 Jan 2017 16:20:29 -0500 Subject: [PATCH 172/597] Release 0.6.0 final --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index dd8a078690..7cd81a8983 100755 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ MAJOR = 0 MINOR = 6 PATCH = 0 -SUFFIX = 'rc2' +SUFFIX = '' FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX) try: From 7a905a61ac117e9b95771e23210638e20bf1548c Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 30 Jan 2017 14:01:32 -0500 Subject: [PATCH 173/597] Fix segfault in OpenCL when the library can't be loaded. 
--- src/gpuarray_buffer_opencl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 7b8d684c7c..af12c8dc66 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -432,7 +432,7 @@ static gpucontext *cl_init(int devno, int flags, int *ret) { e = setup_lib(); if (e != GA_NO_ERROR) - return NULL; + FAIL(NULL, e); err = clGetPlatformIDs(0, NULL, &nump); CHKFAIL(NULL); From 4ffe06f8dee71b73998be6e6180a7cd7ea079c6c Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 2 Feb 2017 23:40:07 -0500 Subject: [PATCH 174/597] Add warning about the system python on recent mac os. --- doc/installation.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/installation.rst b/doc/installation.rst index a8a81111e3..4105c7d0d1 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -131,6 +131,10 @@ macOS. It might be possible to use a version of gcc built using Homebrew or MacPorts, but this is untested and unsupported. +If on OS X 10.11 or macOS 10.12 and later and using the system python, +you will have to use a virtualenv to use the python module. This is +due to a restriction of the new SIP feature about loading libraries. + It appears that on some versions, /usr/local is not in the default compiler paths so you might need to add ``-L /usr/local/lib -I /usr/local/include`` to the ``setup.py build`` command. From 484b165ddb8c5ced53ae4a2914d51ee8bbab48a2 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 7 Feb 2017 10:42:22 -0500 Subject: [PATCH 175/597] Add the extra ldtype argument to __array__. 
--- pygpu/gpuarray.pxd | 2 +- pygpu/gpuarray.pyx | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd index a4c83b0e2e..494cc91625 100644 --- a/pygpu/gpuarray.pxd +++ b/pygpu/gpuarray.pxd @@ -307,7 +307,7 @@ cdef api int pygpu_sync(GpuArray a) except -1 cdef api GpuArray pygpu_empty_like(GpuArray a, ga_order ord, int typecode) -cdef api np.ndarray pygpu_as_ndarray(GpuArray a) +cdef api np.ndarray pygpu_as_ndarray(GpuArray a, np.dtype ldtype) cdef api GpuArray pygpu_index(GpuArray a, const ssize_t *starts, const ssize_t *stops, const ssize_t *steps) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 76b38d5890..b0c2bc7375 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -1352,15 +1352,18 @@ cdef GpuArray pygpu_empty_like(GpuArray a, ga_order ord, int typecode): a.ga.nd, a.ga.dimensions, ord) return res -cdef np.ndarray pygpu_as_ndarray(GpuArray a): +cdef np.ndarray pygpu_as_ndarray(GpuArray a, np.dtype ldtype): cdef np.ndarray res if not py_ISONESEGMENT(a): a = pygpu_copy(a, GA_ANY_ORDER) + if ldtype is None: + ldtype = a.dtype + res = PyArray_Empty(a.ga.nd, a.ga.dimensions, - a.dtype, (py_CHKFLAGS(a, GA_F_CONTIGUOUS) and - not py_CHKFLAGS(a, GA_C_CONTIGUOUS))) + ldtype, (py_CHKFLAGS(a, GA_F_CONTIGUOUS) and + not py_CHKFLAGS(a, GA_C_CONTIGUOUS))) array_read(np.PyArray_DATA(res), np.PyArray_NBYTES(res), a) @@ -1647,15 +1650,15 @@ cdef class GpuArray: res = (&h)[:sizeof(h)] return res - def __array__(self): + def __array__(self, ldtype=None): """ - __array__() + __array__(ldtype=None) Return a :class:`numpy.ndarray` with the same content. Automatically used by :meth:`numpy.asarray`. 
""" - return pygpu_as_ndarray(self) + return pygpu_as_ndarray(self, ldtype) def __bool__(self): if self.size == 0: From 380c839778ea0076d72e653ddecc2a8e7a6a0e84 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 7 Feb 2017 13:10:57 -0500 Subject: [PATCH 176/597] Add a test for the dtype argument. --- pygpu/tests/test_gpu_ndarray.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pygpu/tests/test_gpu_ndarray.py b/pygpu/tests/test_gpu_ndarray.py index 5853f94dd0..d98436af58 100644 --- a/pygpu/tests/test_gpu_ndarray.py +++ b/pygpu/tests/test_gpu_ndarray.py @@ -61,6 +61,8 @@ def test_transfer(): def transfer(shp, dtype, offseted): a, b = gen_gpuarray(shp, dtype, offseted, ctx=ctx) + # Test that passing dtype doesn't break. + c = numpy.asarray(b, dtype=dtype) c = numpy.asarray(b) assert numpy.allclose(c, a) From 6afe39321efffb82abf145ec4845d73240c418d3 Mon Sep 17 00:00:00 2001 From: slefrancois Date: Tue, 7 Feb 2017 16:36:45 -0500 Subject: [PATCH 177/597] update install doc --- doc/installation.rst | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/doc/installation.rst b/doc/installation.rst index 4105c7d0d1..a9e0bb5ffd 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -48,8 +48,8 @@ Download git clone https://github.com/Theano/libgpuarray.git cd libgpuarray -Step-by-step install --------------------- +Step-by-step install: system library (as admin) +----------------------------------------------- extract/clone the source to @@ -79,6 +79,18 @@ like this: python setup.py build_ext -L $MY_PREFIX/lib -I $MY_PREFIX/include +If installed globally under Linux (in /usr/local), you might have to run: + +.. code-block:: bash + + $ sudo ldconfig + +to make the linker know that there are new libraries available. You +can also reboot the machine to do that. 
+ + +Step-by-step install: user library +---------------------------------- If you can not or do not want to install it for every user of that computer, you can install them in your home directory like this: @@ -91,13 +103,11 @@ computer, you can install them in your home directory like this: cmake .. -DCMAKE_INSTALL_PREFIX=~/.local -DCMAKE_BUILD_TYPE=Release make make install - make test + DEVICE="" make test cd .. # Run the following export and add them in your ~/.bashrc file - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.local/lib64/ - export LIBRARY_PATH=$LIBRARY_PATH:~/.local/lib64/ export CPATH=$CPATH:~/.local/include export LIBRARY_PATH=$LIBRARY_PATH:~/.local/lib export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.local/lib @@ -105,21 +115,9 @@ computer, you can install them in your home directory like this: python setup.py build python setup.py install --user cd - python -c "import pygpu;pygpu.test()" - - -Linux-specific instructions ---------------------------- - -If installed globally (in /usr/local), you might have to run: - -.. code-block:: bash - - $ sudo ldconfig - -to make the linker know that there are new libraries available. You -can also reboot the machine to do that. + DEVICE="" python -c "import pygpu;pygpu.test()" +Change ``DEVICE=""`` to the GPU device you want to use for testing. Mac-specific instructions ------------------------- From 331af03d02714f290a3b0f9898c55d631a8bb209 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 7 Feb 2017 18:10:48 -0500 Subject: [PATCH 178/597] Make sure not to change the public cython C api. 
--- pygpu/gpuarray.pxd | 3 ++- pygpu/gpuarray.pyx | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd index 494cc91625..2db9e4c270 100644 --- a/pygpu/gpuarray.pxd +++ b/pygpu/gpuarray.pxd @@ -307,7 +307,8 @@ cdef api int pygpu_sync(GpuArray a) except -1 cdef api GpuArray pygpu_empty_like(GpuArray a, ga_order ord, int typecode) -cdef api np.ndarray pygpu_as_ndarray(GpuArray a, np.dtype ldtype) +cdef api np.ndarray pygpu_as_ndarray(GpuArray a) +cdef np.ndarray _pygpu_as_ndarray(GpuArray a, np.dtype ldtype) cdef api GpuArray pygpu_index(GpuArray a, const ssize_t *starts, const ssize_t *stops, const ssize_t *steps) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index b0c2bc7375..a131953a49 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -1352,7 +1352,10 @@ cdef GpuArray pygpu_empty_like(GpuArray a, ga_order ord, int typecode): a.ga.nd, a.ga.dimensions, ord) return res -cdef np.ndarray pygpu_as_ndarray(GpuArray a, np.dtype ldtype): +cdef np.ndarray pygpu_as_ndarray(GpuArray a): + return _pygpu_as_ndarray(a, None) + +cdef np.ndarray _pygpu_as_ndarray(GpuArray a, np.dtype ldtype): cdef np.ndarray res if not py_ISONESEGMENT(a): @@ -1658,7 +1661,7 @@ cdef class GpuArray: Automatically used by :meth:`numpy.asarray`. """ - return pygpu_as_ndarray(self, ldtype) + return _pygpu_as_ndarray(self, ldtype) def __bool__(self): if self.size == 0: From af357215cc0b266ba6b2130f1eaa36203ba56834 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 10 Feb 2017 16:50:04 -0500 Subject: [PATCH 179/597] Stop 'make rel' and 'make debug' always rebuilding everything. 
--- Makefile | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 6d9ebdd85c..e0cb7d3bf8 100644 --- a/Makefile +++ b/Makefile @@ -11,10 +11,8 @@ debug: install-debugc py .PHONY: install-debugc py debug install-relc rel config -Debug: - mkdir Debug - -Debug/Makefile: Debug config +Debug/Makefile: Debug Makefile.conf + mkdir -p Debug ifndef INSTALL_PREFIX (cd Debug && NUM_DEVS=${NUM_DEVS} DEV_NAMES=${DEV_NAMES} cmake .. -DCMAKE_BUILD_TYPE=Debug) else @@ -34,10 +32,8 @@ endif install-debugc: debugc (cd Debug && ${SUDO} make install) -Release: - mkdir Release - -Release/Makefile: Release config +Release/Makefile: Makefile.conf + mkdir -p Release ifndef INSTALL_PREFIX (cd Release && NUM_DEVS=${NUM_DEVS} DEV_NAMES=${DEV_NAMES} cmake .. -DCMAKE_BUILD_TYPE=Release) else @@ -57,5 +53,5 @@ endif install-relc: relc (cd Release && ${SUDO} make install) -py: config +py: Makefile.conf python setup.py build_ext --inplace From 0abade96e33469540b6cdb42e875565d73950702 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 10 Feb 2017 16:58:36 -0500 Subject: [PATCH 180/597] Add a function to get the library version and use it to blacklist bad drivers. 
--- src/loaders/dyn_load.c | 78 +++++++++++++++++++++++++++++++++++++++++- src/loaders/dyn_load.h | 1 + src/loaders/libcuda.c | 12 ++++++- 3 files changed, 89 insertions(+), 2 deletions(-) diff --git a/src/loaders/dyn_load.c b/src/loaders/dyn_load.c index 3586b72725..871b972c10 100644 --- a/src/loaders/dyn_load.c +++ b/src/loaders/dyn_load.c @@ -3,8 +3,10 @@ #if defined(__unix__) || defined(__APPLE__) #include -#include #include +#include +#include +#include void *ga_load_library(const char *name) { void *res = dlopen(name, RTLD_LAZY|RTLD_LOCAL); @@ -24,6 +26,45 @@ void *ga_func_ptr(void *h, const char *name) { return res; } +float ga_lib_version(void *h, void *sym) { + Dl_info dli; + char *real_path; + char *dot1; + char *dot2; + char *end; + float res; + + if (!dladdr(sym, &dli)) + return -1; + + real_path = realpath(dli.dli_fname,NULL); + if (real_path == NULL) + return -1; + + dot1 = strrchr(real_path, '.'); + if (dot1 == real_path) { + free(real_path); + return -1; + } + dot1[0] = '\0'; + + dot2 = strrchr(real_path, '.'); + if (dot2 == real_path) { + free(real_path); + return -1; + } + dot1[0] = '.'; + + res = strtof(dot2+1, &end); + if (*end != '\0') { + free(real_path); + return -1; + } + + free(real_path); + return res; +} + #else /* Should be windows */ @@ -37,4 +78,39 @@ void *ga_func_ptr(void *h, const char *name) { return (void *)GetProcAddress(h, name); } +float ga_lib_version(void *h, void *sym) { + char fname[1024]; + char *vinfo; + size_t vsize; + VS_FIXEDFILEINFO *vp; + unsigned int ui; + float res; + + if (GetModuleFileName(h, fname, sizeof(fname)) == sizeof(fname)) + return -1; + + vsize = GetFileVersionInfoSize(fname, NULL); + if (vsize == 0) + return -1; + + vinfo = malloc(vsize); + if (vinfo == NULL) + return -1; + + if (!GetFileVersionInfo(fname, 0, vsize, vinfo)) { + free(vinfo); + return -1; + } + + if (!VerQueryValue(vinfo, "\\", &vp, &ui)) { + free(vinfo); + return -1; + } + + res = HIWORD(vp->dwFileVersionMS) + 
(LOWORD(vp->dwFileVersionMS) / 100.0); + + free(vinfo); + return res; +} + #endif diff --git a/src/loaders/dyn_load.h b/src/loaders/dyn_load.h index 73fea5d69f..37753629c0 100644 --- a/src/loaders/dyn_load.h +++ b/src/loaders/dyn_load.h @@ -3,5 +3,6 @@ void *ga_load_library(const char *name); void *ga_func_ptr(void *h, const char *name); +float ga_lib_version(void *h, void *sym); #endif diff --git a/src/loaders/libcuda.c b/src/loaders/libcuda.c index 448791a678..147513f2bd 100644 --- a/src/loaders/libcuda.c +++ b/src/loaders/libcuda.c @@ -1,4 +1,4 @@ -#include +#include #include "libcuda.h" #include "dyn_load.h" @@ -41,6 +41,7 @@ static int loaded = 0; int load_libcuda(void) { void *lib; + float v; if (loaded) return GA_NO_ERROR; @@ -51,6 +52,15 @@ int load_libcuda(void) { #include "libcuda.fn" + v = ga_lib_version(lib, cuInit); + if (v == -1) + fprintf(stderr, "WARNING: could not determine cuda driver version. Some versions return bad results, make sure your version is fine\n"); + + if (v > 373.06) { + fprintf(stderr, "ERROR: refusing to load cuda driver library because the version is blacklisted\n"); + return GA_LOAD_ERROR; + } + loaded = 1; return GA_NO_ERROR; } From b1d74198eb5a1447c265d1890dcb7ccc55e88470 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 13 Feb 2017 11:12:12 -0500 Subject: [PATCH 181/597] Fix the blas loading in OpenCL. 
--- src/gpuarray_buffer_opencl.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 756bfb3730..adf34a3825 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -1245,10 +1245,14 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, case GA_CTX_PROP_BLAS_OPS: { int e; - if ((e = load_libclblas()) == GA_NO_ERROR) + if ((e = load_libclblas()) == GA_NO_ERROR) { *((gpuarray_blas_ops **)res) = &clblas_ops; - if ((e = load_libclblast()) == GA_NO_ERROR) + return e; + } + if ((e = load_libclblast()) == GA_NO_ERROR) { *((gpuarray_blas_ops **)res) = &clblast_ops; + return e; + } return e; } From a0d103570dad2896d12cdd5a01467cb0c6424594 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 13 Feb 2017 11:47:04 -0500 Subject: [PATCH 182/597] Add a way to bypass the error and be explict about working versions. --- src/loaders/libcuda.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/loaders/libcuda.c b/src/loaders/libcuda.c index 147513f2bd..4e398fd362 100644 --- a/src/loaders/libcuda.c +++ b/src/loaders/libcuda.c @@ -56,9 +56,17 @@ int load_libcuda(void) { if (v == -1) fprintf(stderr, "WARNING: could not determine cuda driver version. Some versions return bad results, make sure your version is fine\n"); - if (v > 373.06) { - fprintf(stderr, "ERROR: refusing to load cuda driver library because the version is blacklisted\n"); - return GA_LOAD_ERROR; + if (v > 373.06) + if (getenv("GPUARRAY_FORCE_CUDA_DRIVER_LOAD") != NULL) { + fprintf(stderr, "WARNING: loading blacklisted driver because the load was forced.\n"); + } else { + fprintf(stderr, "ERROR: refusing to load cuda driver library " + "because the version is blacklisted. 
" + "Versions below 373.06 are known to be ok.\n" + "If you want to bypass this check and force the driver load " + "define GPUARRAY_FORCE_CUDA_DRIVER_LOAD in your environement.\n"); + return GA_LOAD_ERROR; + } } loaded = 1; From 12ceff2f8f9f6df096ebeb770b63b1f3f8564a35 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 13 Feb 2017 13:42:42 -0500 Subject: [PATCH 183/597] Fix code. --- src/loaders/libcuda.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/loaders/libcuda.c b/src/loaders/libcuda.c index 4e398fd362..efe6de951d 100644 --- a/src/loaders/libcuda.c +++ b/src/loaders/libcuda.c @@ -1,4 +1,5 @@ #include +#include #include "libcuda.h" #include "dyn_load.h" @@ -56,7 +57,7 @@ int load_libcuda(void) { if (v == -1) fprintf(stderr, "WARNING: could not determine cuda driver version. Some versions return bad results, make sure your version is fine\n"); - if (v > 373.06) + if (v > 373.06) { if (getenv("GPUARRAY_FORCE_CUDA_DRIVER_LOAD") != NULL) { fprintf(stderr, "WARNING: loading blacklisted driver because the load was forced.\n"); } else { From 64ad544f5fe933cae3f9487664eeec7cfb5fc6c6 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 13 Feb 2017 14:19:59 -0500 Subject: [PATCH 184/597] Fix the message. --- src/loaders/libcuda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/loaders/libcuda.c b/src/loaders/libcuda.c index efe6de951d..5109957624 100644 --- a/src/loaders/libcuda.c +++ b/src/loaders/libcuda.c @@ -63,7 +63,7 @@ int load_libcuda(void) { } else { fprintf(stderr, "ERROR: refusing to load cuda driver library " "because the version is blacklisted. 
" - "Versions below 373.06 are known to be ok.\n" + "Versions 373.06 and below are known to be ok.\n" "If you want to bypass this check and force the driver load " "define GPUARRAY_FORCE_CUDA_DRIVER_LOAD in your environement.\n"); return GA_LOAD_ERROR; From 85e2a503c4c357316656956fba0ef3dc154d78d6 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Mon, 13 Feb 2017 18:33:30 -0500 Subject: [PATCH 185/597] Detect driver version on Windows. (#3) --- src/loaders/dyn_load.c | 3 ++- src/loaders/libcuda.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/loaders/dyn_load.c b/src/loaders/dyn_load.c index 871b972c10..bd53ff5259 100644 --- a/src/loaders/dyn_load.c +++ b/src/loaders/dyn_load.c @@ -69,6 +69,7 @@ float ga_lib_version(void *h, void *sym) { /* Should be windows */ #include +#pragma comment(lib,"Version.lib") void *ga_load_library(const char *name) { return LoadLibrary(name); @@ -107,7 +108,7 @@ float ga_lib_version(void *h, void *sym) { return -1; } - res = HIWORD(vp->dwFileVersionMS) + (LOWORD(vp->dwFileVersionMS) / 100.0); + res = ( ((HIWORD(vp->dwFileVersionLS) - 10) * 10000) + LOWORD(vp->dwFileVersionLS) ) / 100.0; free(vinfo); return res; diff --git a/src/loaders/libcuda.c b/src/loaders/libcuda.c index 5109957624..c9e587e065 100644 --- a/src/loaders/libcuda.c +++ b/src/loaders/libcuda.c @@ -56,6 +56,9 @@ int load_libcuda(void) { v = ga_lib_version(lib, cuInit); if (v == -1) fprintf(stderr, "WARNING: could not determine cuda driver version. Some versions return bad results, make sure your version is fine\n"); + #ifdef DEBUG + fprintf(stderr, "CUDA driver version detected: %.2f\n", v); + #endif if (v > 373.06) { if (getenv("GPUARRAY_FORCE_CUDA_DRIVER_LOAD") != NULL) { From 309e87b184537726c3c08ddd1fd736a07895c1a2 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 13 Feb 2017 18:37:57 -0500 Subject: [PATCH 186/597] Fixes for macOS (we don't blacklist there). 
--- src/loaders/dyn_load.c | 6 +++--- src/loaders/libcuda.c | 9 ++++++++- src/loaders/libopencl.c | 2 +- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/loaders/dyn_load.c b/src/loaders/dyn_load.c index bd53ff5259..64bd28ff48 100644 --- a/src/loaders/dyn_load.c +++ b/src/loaders/dyn_load.c @@ -37,19 +37,19 @@ float ga_lib_version(void *h, void *sym) { if (!dladdr(sym, &dli)) return -1; - real_path = realpath(dli.dli_fname,NULL); + real_path = realpath(dli.dli_fname, NULL); if (real_path == NULL) return -1; dot1 = strrchr(real_path, '.'); - if (dot1 == real_path) { + if (dot1 == NULL) { free(real_path); return -1; } dot1[0] = '\0'; dot2 = strrchr(real_path, '.'); - if (dot2 == real_path) { + if (dot2 == NULL) { free(real_path); return -1; } diff --git a/src/loaders/libcuda.c b/src/loaders/libcuda.c index c9e587e065..d1532fe2b2 100644 --- a/src/loaders/libcuda.c +++ b/src/loaders/libcuda.c @@ -10,7 +10,7 @@ static char libname[] = "nvcuda.dll"; #else /* Unix */ #ifdef __APPLE__ -static char libname[] = "CUDA.framework/CUDA"; +static char libname[] = "/Library/Frameworks/CUDA.framework/CUDA"; #else static char libname[] = "libcuda.so"; #endif @@ -42,7 +42,9 @@ static int loaded = 0; int load_libcuda(void) { void *lib; +#ifndef __APPLE__ float v; +#endif if (loaded) return GA_NO_ERROR; @@ -53,6 +55,10 @@ int load_libcuda(void) { #include "libcuda.fn" +/* + * The blacklisted versions of cuda are not available on mac as far as I know. + */ +#ifndef __APPLE__ v = ga_lib_version(lib, cuInit); if (v == -1) fprintf(stderr, "WARNING: could not determine cuda driver version. 
Some versions return bad results, make sure your version is fine\n"); @@ -72,6 +78,7 @@ int load_libcuda(void) { return GA_LOAD_ERROR; } } +#endif loaded = 1; return GA_NO_ERROR; diff --git a/src/loaders/libopencl.c b/src/loaders/libopencl.c index c3e11d3c0c..1994fc38aa 100644 --- a/src/loaders/libopencl.c +++ b/src/loaders/libopencl.c @@ -8,7 +8,7 @@ static char libname[] = "OpenCL.dll"; #else /* Unix */ #ifdef __APPLE__ -static char libname[] = "OpenCL.framework/OpenCL"; +static char libname[] = "/System/Library/Frameworks/OpenCL.framework/OpenCL"; #else static char libname[] = "libOpenCL.so"; #endif From e68d4957c30c60591d74aaf2a71c82f58718f336 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 14 Feb 2017 13:37:05 -0500 Subject: [PATCH 187/597] Change the definition of the contiguous flags to ignore strides on dimensions of size 1. --- src/gpuarray_array.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index 434c641ae2..c18e5d42cc 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -1118,7 +1118,7 @@ int GpuArray_is_c_contiguous(const GpuArray *a) { int i; for (i = a->nd - 1; i >= 0; i--) { - if (a->strides[i] != (ssize_t)size) return 0; + if (a->strides[i] != (ssize_t)size && a->dimensions[i] != 1) return 0; // We suppose that overflow will not happen since data has to fit in memory size *= a->dimensions[i]; } @@ -1130,7 +1130,7 @@ int GpuArray_is_f_contiguous(const GpuArray *a) { unsigned int i; for (i = 0; i < a->nd; i++) { - if (a->strides[i] != (ssize_t)size) return 0; + if (a->strides[i] != (ssize_t)size && a->dimensions[i] != 1) return 0; // We suppose that overflow will not happen since data has to fit in memory size *= a->dimensions[i]; } From 80a7734d501a1d94b5bc085268f407e86870f6c3 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 14 Feb 2017 14:10:47 -0500 Subject: [PATCH 188/597] Version change for 0.6.1rc1 --- setup.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 7cd81a8983..ca79f381ac 100755 --- a/setup.py +++ b/setup.py @@ -5,8 +5,8 @@ MAJOR = 0 MINOR = 6 -PATCH = 0 -SUFFIX = '' +PATCH = 1 +SUFFIX = 'rc1' FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX) try: From f6649ca13a29e017e9aa1f54b086bca29de5c80c Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Mon, 20 Feb 2017 10:46:51 +0100 Subject: [PATCH 189/597] ENH: add INFINITY to CUDA kernel preamble --- src/gpuarray_buffer_cuda.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 120919c72a..447400f277 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -318,6 +318,10 @@ static const char CUDA_PREAMBLE[] = "#undef NAN\n" "#endif\n" "#define NAN __int_as_float(0x7fffffff)\n" + "#ifdef INFINITY\n" + "#undef INFINITY\n" + "#endif\n" + "#define INFINITY __int_as_float(0x7f800000)\n" "#define LID_0 threadIdx.x\n" "#define LID_1 threadIdx.y\n" "#define LID_2 threadIdx.z\n" From 483cc876aa9f8edadb3b651e64f10b8c2f2ff1ea Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Mon, 20 Feb 2017 12:04:15 +0100 Subject: [PATCH 190/597] TST: add test for elemwise with infinity --- pygpu/tests/test_elemwise.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/pygpu/tests/test_elemwise.py b/pygpu/tests/test_elemwise.py index 89b59b14f6..ecec66b2ef 100644 --- a/pygpu/tests/test_elemwise.py +++ b/pygpu/tests/test_elemwise.py @@ -1,5 +1,6 @@ import operator import numpy +from mako.template import Template from unittest import TestCase from pygpu import gpuarray, ndgpuarray as elemary @@ -297,3 +298,37 @@ def broadcast(shapea, shapeb): rg = ag + bg check_meta_content(rg, rc) + + +_inf_preamb_tpl = Template(''' +WITHIN_KERNEL ${flt} +infinity() {return INFINITY;} + +WITHIN_KERNEL ${flt} +neg_infinity() {return -INFINITY;} +''') + + +def test_infinity(): + for dtype in ['float32', 'float64']: + ac, 
ag = gen_gpuarray((2,), dtype, ctx=context, cls=elemary) + out_g = ag._empty_like_me() + flt = 'ga_float' if dtype == 'float32' else 'ga_double' + out_arg = arg('out', out_g.dtype, scalar=False, read=False, write=True) + preamble = _inf_preamb_tpl.render(flt=flt) + + # +infinity + ac[:] = numpy.inf + expr_inf = 'out = infinity()' + kernel = GpuElemwise(context, expr_inf, [out_arg], + preamble=preamble) + kernel(out_g) + assert numpy.array_equal(ac, numpy.asarray(out_g)) + + # -infinity + ac[:] = -numpy.inf + expr_neginf = 'out = neg_infinity()' + kernel = GpuElemwise(context, expr_neginf, [out_arg], + preamble=preamble) + kernel(out_g) + assert numpy.array_equal(ac, numpy.asarray(out_g)) From 78afeb8b2193a58922080a7b26b73e94e94e213b Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 20 Feb 2017 10:43:00 -0500 Subject: [PATCH 191/597] Version change for 0.6.1 --- doc/conf.py | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index 9b5da342ef..8fb2e2250f 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -49,9 +49,9 @@ # built documents. # # The short X.Y version. -version = '0.2' +version = '0.6' # The full version, including alpha/beta/rc tags. -release = '0.2' +release = '0.6.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/setup.py b/setup.py index ca79f381ac..ab0c373610 100755 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ MAJOR = 0 MINOR = 6 PATCH = 1 -SUFFIX = 'rc1' +SUFFIX = '' FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX) try: From 4d108bb0158d9eddac2e9e33d1bcd7d44163f5bf Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Wed, 1 Mar 2017 21:48:35 +0100 Subject: [PATCH 192/597] ENH: add indexing with None for new axes --- pygpu/gpuarray.pyx | 105 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 101 insertions(+), 4 deletions(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index a131953a49..8190016d12 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -1836,7 +1836,83 @@ cdef class GpuArray: raise TypeError, "len() of unsized object" def __getitem__(self, key): - return self.__cgetitem__(key) + cdef unsigned int i + + if key is Ellipsis: + return self.__cgetitem__(key) + + # A list or a sequence of list should trigger "fancy" indexing. + # This is not implemented yet. + # Conversely, if a list contains slice or Ellipsis objects, it behaves + # the same as a tuple. + if isinstance(key, list): + if any(isinstance(k, slice) or k is Ellipsis for k in key): + return self.__getitem__(tuple(key)) + else: + raise NotImplementedError, "fancy indexing not supported" + + try: + iter(key) + except TypeError: + key = (key,) + else: + if all(isinstance(k, list) for k in key): + raise NotImplementedError, "fancy indexing not supported" + + key = tuple(key) + + # Need to massage Ellipsis here, to avoid packing it into a tuple. + if key.count(Ellipsis) > 1: + raise IndexError, "cannot use more than one Ellipsis" + + # The following code replaces an Ellipsis found in the key by + # the corresponding number of slice(None) objects, depending on the + # number of dimensions. As example, this allows indexing on the last + # dimension with a[..., 1:] on any array (including 1-dim). This + # is also required for numpy compat. 
+ try: + ell_idx = key.index(Ellipsis) + except ValueError: + pass + else: + # Need number of axes minus missing dimensions extra slice(None) + # objects, not counting None entries and the Ellipsis itself + num_slcs = self.ga.nd - (len(key) - key.count(None) - 1) + fill_slices = (slice(None),) * num_slcs + key = key[:ell_idx] + fill_slices + key[ell_idx + 1:] + + # Remove the None entries for indexing + getitem_idcs = tuple(k for k in key if k is not None) + + # For less than 1 index, fill up with slice(None) to the right. + # This allows indexing a[1:] in multi-dimensional arrays, where the + # slice is applied along the first axis only. It also allows + # a[()], which simply is a view in Numpy. + if len(getitem_idcs) <= 1: + getitem_idcs = (getitem_idcs + + (slice(None),) * (self.ga.nd - len(getitem_idcs))) + + # Slice into array, then reshape, accommodating for None entries in key + sliced = self.__cgetitem__(getitem_idcs) + if key.count(None) == 0: + # Avoid unnecessary reshaping if there was no None + return sliced + else: + new_shape = [] + i = 0 + if sliced.shape: + for k in key: + if isinstance(k, int): + continue + elif k is None: + new_shape.append(1) + else: + new_shape.append(sliced.shape[i]) + i += 1 + # Add remaining entries from sliced.shape if existing (happens + # for 1 index or less if ndim >= 2). 
+ new_shape.extend(sliced.shape[i:]) + return sliced.reshape(new_shape) cdef __cgetitem__(self, key): cdef ssize_t *starts @@ -1896,16 +1972,37 @@ cdef class GpuArray: steps[i] = 1 return pygpu_index(self, starts, stops, steps) + finally: free(starts) free(stops) free(steps) def __setitem__(self, idx, v): - cdef GpuArray tmp = self.__cgetitem__(idx) - cdef GpuArray gv = carray(v, self.ga.typecode, False, 'A', 0, - self.context, GpuArray) + cdef GpuArray tmp, gv + + if isinstance(idx, list): + if any(isinstance(i, slice) or i is Ellipsis for i in idx): + self.__setitem__(tuple(idx), v) + else: + raise NotImplementedError, "fancy indexing not supported" + try: + iter(idx) + except TypeError: + idx = (idx,) + else: + if all(isinstance(i, list) for i in idx): + raise NotImplementedError, "fancy indexing not supported" + + idx = tuple(idx) + + if idx.count(Ellipsis) > 1: + raise IndexError, "cannot use more than one Ellipsis" + # Remove None entries, they should be ignored (as in Numpy) + idx = tuple(i for i in idx if i is not None) + tmp = self.__cgetitem__(idx) + gv = carray(v, self.ga.typecode, False, 'A', 0, self.context, GpuArray) array_setarray(tmp, gv) def take1(self, GpuArray idx): From 9f0fa41b73680ebeaa9e33b52ee3b1910c68cd22 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Thu, 2 Mar 2017 00:54:51 +0100 Subject: [PATCH 193/597] TST: add test for slicing with None for new axis --- pygpu/tests/test_gpu_ndarray.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/pygpu/tests/test_gpu_ndarray.py b/pygpu/tests/test_gpu_ndarray.py index d98436af58..d5c2c97bf6 100644 --- a/pygpu/tests/test_gpu_ndarray.py +++ b/pygpu/tests/test_gpu_ndarray.py @@ -273,7 +273,7 @@ def test_empty_no_params(): def test_mapping_getitem_ellipsis(): - for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: + for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: for offseted in [True, False]: yield mapping_getitem_ellipsis, shp, dtype, 
offseted @@ -289,6 +289,27 @@ def mapping_getitem_ellipsis(shp, dtype, offseted): assert numpy.allclose(a, b_cpu) +def test_getitem_none(): + for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: + yield getitem_none, shp + + +def getitem_none(shp): + a, a_gpu = gen_gpuarray(shp, ctx=ctx) + + assert numpy.allclose(a_gpu[..., None], a[..., None]) + + for _ in range(5): + # Choose something to slice with, always works + indcs = tuple(numpy.random.choice([0, slice(None), slice(1, None)], + size=len(shp))) + indcs = indcs[:1] + (None,) + indcs[1:] + assert numpy.allclose(a_gpu[indcs], a[indcs]) + + if shp: + assert numpy.allclose(a_gpu[1:, None], a[1:, None]) + + def test_mapping_setitem_ellipsis(): for shp in [(9,), (8, 9), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: From 215f3f39844c4bcd16494736de188356b8477aa3 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Thu, 2 Mar 2017 18:21:12 +0100 Subject: [PATCH 194/597] TST: add test for setitem in axis 0 --- pygpu/tests/test_gpu_ndarray.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/pygpu/tests/test_gpu_ndarray.py b/pygpu/tests/test_gpu_ndarray.py index d5c2c97bf6..e0f362403e 100644 --- a/pygpu/tests/test_gpu_ndarray.py +++ b/pygpu/tests/test_gpu_ndarray.py @@ -310,12 +310,14 @@ def getitem_none(shp): assert numpy.allclose(a_gpu[1:, None], a[1:, None]) -def test_mapping_setitem_ellipsis(): +def test_mapping_setitem(): for shp in [(9,), (8, 9), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: for offseted in [True, False]: yield mapping_setitem_ellipsis, shp, dtype, offseted yield mapping_setitem_ellipsis2, shp, dtype, offseted + yield mapping_setitem_firstaxis, shp, dtype, offseted + @guard_devsup def mapping_setitem_ellipsis(shp, dtype, offseted): @@ -324,13 +326,24 @@ def mapping_setitem_ellipsis(shp, dtype, offseted): a_gpu[...] 
= 2 assert numpy.allclose(a, numpy.asarray(a_gpu)) + @guard_devsup def mapping_setitem_ellipsis2(shp, dtype, offseted): a, a_gpu = gen_gpuarray(shp, dtype, offseted, ctx=ctx) b, b_gpu = gen_gpuarray(shp[1:], dtype, False, ctx=ctx) a[:] = b a_gpu[:] = b_gpu - assert numpy.allclose(a, numpy.asarray(b_gpu)) + assert numpy.allclose(a, numpy.asarray(a_gpu)) + + +@guard_devsup +def mapping_setitem_firstaxis(shp, dtype, offseted): + a, a_gpu = gen_gpuarray(shp, dtype, offseted, ctx=ctx) + b, b_gpu = gen_gpuarray(shp[1:], dtype, False, ctx=ctx) + a[0] = b + a_gpu[0] = b_gpu + assert numpy.allclose(a, numpy.asarray(a_gpu)) + class WriteReadTest(unittest.TestCase): def setUp(self): From e1bed341b76d54e05954c714f582def6bf7e6584 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 6 Mar 2017 11:00:14 -0500 Subject: [PATCH 195/597] Disable the blacklist of driver as Theano master now work with more recent driver. --- src/loaders/libcuda.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/loaders/libcuda.c b/src/loaders/libcuda.c index d1532fe2b2..293bc36bd4 100644 --- a/src/loaders/libcuda.c +++ b/src/loaders/libcuda.c @@ -56,9 +56,9 @@ int load_libcuda(void) { #include "libcuda.fn" /* - * The blacklisted versions of cuda are not available on mac as far as I know. + * We keep this in case we need again blacklist in the futur. */ -#ifndef __APPLE__ +#if 0 v = ga_lib_version(lib, cuInit); if (v == -1) fprintf(stderr, "WARNING: could not determine cuda driver version. Some versions return bad results, make sure your version is fine\n"); From 23c8f004968f2d7703612f2ebb0e2da1b112bc7d Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 6 Mar 2017 15:13:05 -0500 Subject: [PATCH 196/597] Version change for 0.6.2 --- doc/conf.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index 8fb2e2250f..0b658a26dc 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -51,7 +51,7 @@ # The short X.Y version. 
version = '0.6' # The full version, including alpha/beta/rc tags. -release = '0.6.1' +release = '0.6.2' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/setup.py b/setup.py index ab0c373610..0084e04b4a 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ MAJOR = 0 MINOR = 6 -PATCH = 1 +PATCH = 2 SUFFIX = '' FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX) From 4e9f6ae287ed163a9b313bccb35a5e8e61ab17e2 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 13 Mar 2017 17:59:22 -0400 Subject: [PATCH 197/597] Remove the blacklist code since it's horrible and I really hope we never need to do that again. We can always look at version history if it's really required. --- src/loaders/dyn_load.c | 74 ------------------------------------------ src/loaders/dyn_load.h | 1 - src/loaders/libcuda.c | 28 ---------------- 3 files changed, 103 deletions(-) diff --git a/src/loaders/dyn_load.c b/src/loaders/dyn_load.c index 64bd28ff48..a532a47422 100644 --- a/src/loaders/dyn_load.c +++ b/src/loaders/dyn_load.c @@ -26,45 +26,6 @@ void *ga_func_ptr(void *h, const char *name) { return res; } -float ga_lib_version(void *h, void *sym) { - Dl_info dli; - char *real_path; - char *dot1; - char *dot2; - char *end; - float res; - - if (!dladdr(sym, &dli)) - return -1; - - real_path = realpath(dli.dli_fname, NULL); - if (real_path == NULL) - return -1; - - dot1 = strrchr(real_path, '.'); - if (dot1 == NULL) { - free(real_path); - return -1; - } - dot1[0] = '\0'; - - dot2 = strrchr(real_path, '.'); - if (dot2 == NULL) { - free(real_path); - return -1; - } - dot1[0] = '.'; - - res = strtof(dot2+1, &end); - if (*end != '\0') { - free(real_path); - return -1; - } - - free(real_path); - return res; -} - #else /* Should be windows */ @@ -79,39 +40,4 @@ void *ga_func_ptr(void *h, const char *name) { return (void *)GetProcAddress(h, name); } -float ga_lib_version(void *h, void *sym) { - char fname[1024]; - char 
*vinfo; - size_t vsize; - VS_FIXEDFILEINFO *vp; - unsigned int ui; - float res; - - if (GetModuleFileName(h, fname, sizeof(fname)) == sizeof(fname)) - return -1; - - vsize = GetFileVersionInfoSize(fname, NULL); - if (vsize == 0) - return -1; - - vinfo = malloc(vsize); - if (vinfo == NULL) - return -1; - - if (!GetFileVersionInfo(fname, 0, vsize, vinfo)) { - free(vinfo); - return -1; - } - - if (!VerQueryValue(vinfo, "\\", &vp, &ui)) { - free(vinfo); - return -1; - } - - res = ( ((HIWORD(vp->dwFileVersionLS) - 10) * 10000) + LOWORD(vp->dwFileVersionLS) ) / 100.0; - - free(vinfo); - return res; -} - #endif diff --git a/src/loaders/dyn_load.h b/src/loaders/dyn_load.h index 37753629c0..73fea5d69f 100644 --- a/src/loaders/dyn_load.h +++ b/src/loaders/dyn_load.h @@ -3,6 +3,5 @@ void *ga_load_library(const char *name); void *ga_func_ptr(void *h, const char *name); -float ga_lib_version(void *h, void *sym); #endif diff --git a/src/loaders/libcuda.c b/src/loaders/libcuda.c index 293bc36bd4..47d0806e0a 100644 --- a/src/loaders/libcuda.c +++ b/src/loaders/libcuda.c @@ -42,9 +42,6 @@ static int loaded = 0; int load_libcuda(void) { void *lib; -#ifndef __APPLE__ - float v; -#endif if (loaded) return GA_NO_ERROR; @@ -55,31 +52,6 @@ int load_libcuda(void) { #include "libcuda.fn" -/* - * We keep this in case we need again blacklist in the futur. - */ -#if 0 - v = ga_lib_version(lib, cuInit); - if (v == -1) - fprintf(stderr, "WARNING: could not determine cuda driver version. Some versions return bad results, make sure your version is fine\n"); - #ifdef DEBUG - fprintf(stderr, "CUDA driver version detected: %.2f\n", v); - #endif - - if (v > 373.06) { - if (getenv("GPUARRAY_FORCE_CUDA_DRIVER_LOAD") != NULL) { - fprintf(stderr, "WARNING: loading blacklisted driver because the load was forced.\n"); - } else { - fprintf(stderr, "ERROR: refusing to load cuda driver library " - "because the version is blacklisted. 
" - "Versions 373.06 and below are known to be ok.\n" - "If you want to bypass this check and force the driver load " - "define GPUARRAY_FORCE_CUDA_DRIVER_LOAD in your environement.\n"); - return GA_LOAD_ERROR; - } - } -#endif - loaded = 1; return GA_NO_ERROR; } From 915c795947efd0810d63ec46925e7f6709adde6d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 15 Mar 2017 12:00:53 -0400 Subject: [PATCH 198/597] Add missing GLOBAL_MEM --- src/gpuarray_array.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index c18e5d42cc..91c043a1d6 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -379,8 +379,8 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str, " const %s numThreads1 = LDIM_1 * GDIM_1;\n" " %s i0, i1;\n", sz, sz, sz, sz, sz); strb_appends(&sb, " if (idx0 >= n0 || idx1 >= n1) return;\n"); - strb_appendf(&sb, " r = (GLOBAL_MEM %s *)(((char *)r) + r_off);\n" - " ind = (GLOBAL_MEM %s *)(((char *)ind) + i_off);\n", + strb_appendf(&sb, " r = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)r) + r_off);\n" + " ind = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)ind) + i_off);\n", gpuarray_get_type(a->typecode)->cluda_name, gpuarray_get_type(ind->typecode)->cluda_name); strb_appendf(&sb, " for (i0 = idx0; i0 < n0; i0 += numThreads0) {\n" From 4750937998827e9ad4e947fde28fa4d61f66989c Mon Sep 17 00:00:00 2001 From: Thomas George Date: Mon, 20 Mar 2017 17:13:01 -0400 Subject: [PATCH 199/597] triu/tril functions --- pygpu/array_tools.py | 47 ++++++++++++++++++++++++++++++++ pygpu/tests/tests_array_tools.py | 45 ++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 pygpu/array_tools.py create mode 100644 pygpu/tests/tests_array_tools.py diff --git a/pygpu/array_tools.py b/pygpu/array_tools.py new file mode 100644 index 0000000000..f7972cdfdc --- /dev/null +++ b/pygpu/array_tools.py @@ -0,0 +1,47 @@ +from string import Template +from .gpuarray import 
GpuArray, GpuKernel + + +def _generate_kernel(ctx, cols, upper=True): + tmpl = Template(""" + KERNEL void extract_tri(ga_float *a, ga_uint N) { + unsigned int idx = blockIdx.y*blockDim.x*gridDim.x+ + blockIdx.x*blockDim.x+threadIdx.x; + unsigned int ix = idx/${cols}; + unsigned int iy = idx%${cols}; + if (idx < N) { + if (ix ${le} iy) + a[idx] = 0.0; + } + } + """) + if upper: + le = '>' + else: + le = '<' + src = tmpl.substitute(cols=cols, le=le) + spec = [GpuArray, 'uint32'] + k = GpuKernel(src, "extract_tri", spec, context=ctx) + return k + + +def triu(A, ctx, inplace=True): + if not inplace: + A = A.copy() + upper = True + if A.flags['F_CONTIGUOUS']: + upper = False + k = _generate_kernel(ctx, A.shape[0], upper) + k(A, A.shape[0] * A.shape[1], n=A.shape[0] * A.shape[1]) + return A + + +def tril(A, ctx, inplace=True): + if not inplace: + A = A.copy() + upper = False + if A.flags['F_CONTIGUOUS']: + upper = True + k = _generate_kernel(ctx, A.shape[0], upper) + k(A, A.shape[0] * A.shape[1], n=A.shape[0] * A.shape[1]) + return A diff --git a/pygpu/tests/tests_array_tools.py b/pygpu/tests/tests_array_tools.py new file mode 100644 index 0000000000..d970b18442 --- /dev/null +++ b/pygpu/tests/tests_array_tools.py @@ -0,0 +1,45 @@ +from pygpu.array_tools import (tril, triu) +from .support import (gen_gpuarray, context) +import numpy + + +def test_triu_inplace(): + ac, ag = gen_gpuarray((10, 10), 'float32', ctx=context) + result = triu(ag, context, inplace=True) + assert numpy.all(numpy.triu(ac) == result) + assert numpy.all(numpy.triu(ac) == ag) + + +def test_triu_inplace_order_f(): + ac, ag = gen_gpuarray((10, 10), 'float32', order='f', ctx=context) + result = triu(ag, context, inplace=True) + assert numpy.all(numpy.triu(ac) == result) + assert numpy.all(numpy.triu(ac) == ag) + + +def test_triu_no_inplace(): + ac, ag = gen_gpuarray((10, 10), 'float32', ctx=context) + result = triu(ag, context, inplace=False) + assert numpy.all(numpy.triu(ac) == result) + assert 
numpy.all(ac == ag) + + +def test_tril_inplace(): + ac, ag = gen_gpuarray((10, 10), 'float32', ctx=context) + result = tril(ag, context, inplace=True) + assert numpy.all(numpy.tril(ac) == result) + assert numpy.all(numpy.tril(ac) == ag) + + +def test_tril_inplace_order_f(): + ac, ag = gen_gpuarray((10, 10), 'float32', order='f', ctx=context) + result = tril(ag, context, inplace=True) + assert numpy.all(numpy.tril(ac) == result) + assert numpy.all(numpy.tril(ac) == ag) + + +def test_tril_no_inplace(): + ac, ag = gen_gpuarray((10, 10), 'float32', ctx=context) + result = tril(ag, context, inplace=False) + assert numpy.all(numpy.tril(ac) == result) + assert numpy.all(ac == ag) From c439e25874f995c7890b0fbc70c99b5728a03ac2 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 24 Mar 2017 12:07:43 -0400 Subject: [PATCH 200/597] Stop requiring nose to import pygpu and add the dependency on six. --- pygpu/__init__.py | 12 +++++------- setup.py | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/pygpu/__init__.py b/pygpu/__init__.py index 74479d3c68..9184085d01 100644 --- a/pygpu/__init__.py +++ b/pygpu/__init__.py @@ -14,10 +14,8 @@ def get_include(): from .version import fullversion as __version__ -from .tests import main -if hasattr(main, "NoseTester"): - test = main.NoseTester().test -else: - def test(): - raise ImportError("The nose module is not installed." 
- " It is needed for pygpu tests.") + +def test(): + from .tests import main + if hasattr(main, "NoseTester"): + main.NoseTester().test() diff --git a/setup.py b/setup.py index 0084e04b4a..5e2d86c4b4 100755 --- a/setup.py +++ b/setup.py @@ -144,5 +144,5 @@ def __init__(self, *args, **kwargs): 'blas_api.h', 'numpy_compat.h', 'collectives.h', 'collectives_api.h']}, ext_modules=cythonize(exts), - install_requires=['mako>=0.7'], + requires=['mako>=0.7', 'six'], ) From a524a80345791a7d62b112e14a69e13e56e5b4bd Mon Sep 17 00:00:00 2001 From: Thomas George Date: Mon, 27 Mar 2017 10:27:51 -0400 Subject: [PATCH 201/597] triu/tril now work with non square matrices --- pygpu/{array_tools.py => basic.py} | 18 ++++++++---- pygpu/tests/test_basic.py | 31 ++++++++++++++++++++ pygpu/tests/tests_array_tools.py | 45 ------------------------------ 3 files changed, 43 insertions(+), 51 deletions(-) rename pygpu/{array_tools.py => basic.py} (76%) create mode 100644 pygpu/tests/test_basic.py delete mode 100644 pygpu/tests/tests_array_tools.py diff --git a/pygpu/array_tools.py b/pygpu/basic.py similarity index 76% rename from pygpu/array_tools.py rename to pygpu/basic.py index f7972cdfdc..5eaacd5287 100644 --- a/pygpu/array_tools.py +++ b/pygpu/basic.py @@ -25,23 +25,29 @@ def _generate_kernel(ctx, cols, upper=True): return k -def triu(A, ctx, inplace=True): +def triu(A, inplace=True): if not inplace: A = A.copy() - upper = True if A.flags['F_CONTIGUOUS']: upper = False - k = _generate_kernel(ctx, A.shape[0], upper) + cols = A.shape[0] + else: + upper = True + cols = A.shape[1] + k = _generate_kernel(A.context, cols, upper) k(A, A.shape[0] * A.shape[1], n=A.shape[0] * A.shape[1]) return A -def tril(A, ctx, inplace=True): +def tril(A, inplace=True): if not inplace: A = A.copy() - upper = False if A.flags['F_CONTIGUOUS']: upper = True - k = _generate_kernel(ctx, A.shape[0], upper) + cols = A.shape[0] + else: + upper = False + cols = A.shape[1] + k = _generate_kernel(A.context, cols, upper) 
k(A, A.shape[0] * A.shape[1], n=A.shape[0] * A.shape[1]) return A diff --git a/pygpu/tests/test_basic.py b/pygpu/tests/test_basic.py new file mode 100644 index 0000000000..1b12b30d9a --- /dev/null +++ b/pygpu/tests/test_basic.py @@ -0,0 +1,31 @@ +from pygpu.basic import (tril, triu) +from .support import (gen_gpuarray, context) +import numpy + + +def test_tril(): + for shape in [(10, 5), (5, 10), (10, 10)]: + for order in ['c', 'f']: + for inplace in [True, False]: + ac, ag = gen_gpuarray(shape, 'float32', + order=order, ctx=context) + result = tril(ag, inplace=inplace) + assert numpy.all(numpy.tril(ac) == result) + if inplace: + assert numpy.all(numpy.tril(ac) == ag) + else: + assert numpy.all(ac == ag) + + +def test_triu(): + for shape in [(10, 5), (5, 10), (10, 10)]: + for order in ['c', 'f']: + for inplace in [True, False]: + ac, ag = gen_gpuarray(shape, 'float32', + order=order, ctx=context) + result = triu(ag, inplace=inplace) + assert numpy.all(numpy.triu(ac) == result) + if inplace: + assert numpy.all(numpy.triu(ac) == ag) + else: + assert numpy.all(ac == ag) diff --git a/pygpu/tests/tests_array_tools.py b/pygpu/tests/tests_array_tools.py deleted file mode 100644 index d970b18442..0000000000 --- a/pygpu/tests/tests_array_tools.py +++ /dev/null @@ -1,45 +0,0 @@ -from pygpu.array_tools import (tril, triu) -from .support import (gen_gpuarray, context) -import numpy - - -def test_triu_inplace(): - ac, ag = gen_gpuarray((10, 10), 'float32', ctx=context) - result = triu(ag, context, inplace=True) - assert numpy.all(numpy.triu(ac) == result) - assert numpy.all(numpy.triu(ac) == ag) - - -def test_triu_inplace_order_f(): - ac, ag = gen_gpuarray((10, 10), 'float32', order='f', ctx=context) - result = triu(ag, context, inplace=True) - assert numpy.all(numpy.triu(ac) == result) - assert numpy.all(numpy.triu(ac) == ag) - - -def test_triu_no_inplace(): - ac, ag = gen_gpuarray((10, 10), 'float32', ctx=context) - result = triu(ag, context, inplace=False) - assert 
numpy.all(numpy.triu(ac) == result) - assert numpy.all(ac == ag) - - -def test_tril_inplace(): - ac, ag = gen_gpuarray((10, 10), 'float32', ctx=context) - result = tril(ag, context, inplace=True) - assert numpy.all(numpy.tril(ac) == result) - assert numpy.all(numpy.tril(ac) == ag) - - -def test_tril_inplace_order_f(): - ac, ag = gen_gpuarray((10, 10), 'float32', order='f', ctx=context) - result = tril(ag, context, inplace=True) - assert numpy.all(numpy.tril(ac) == result) - assert numpy.all(numpy.tril(ac) == ag) - - -def test_tril_no_inplace(): - ac, ag = gen_gpuarray((10, 10), 'float32', ctx=context) - result = tril(ag, context, inplace=False) - assert numpy.all(numpy.tril(ac) == result) - assert numpy.all(ac == ag) From 62b7546fc8fbfa662e909023774b4fb6b8fb4caf Mon Sep 17 00:00:00 2001 From: Thomas George Date: Mon, 27 Mar 2017 15:37:43 -0400 Subject: [PATCH 202/597] added assert len(shape) == 2 in triu/tril --- pygpu/basic.py | 10 +++++++--- pygpu/tests/test_basic.py | 27 +++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/pygpu/basic.py b/pygpu/basic.py index 5eaacd5287..d7fe5fad9a 100644 --- a/pygpu/basic.py +++ b/pygpu/basic.py @@ -4,9 +4,9 @@ def _generate_kernel(ctx, cols, upper=True): tmpl = Template(""" - KERNEL void extract_tri(ga_float *a, ga_uint N) { - unsigned int idx = blockIdx.y*blockDim.x*gridDim.x+ - blockIdx.x*blockDim.x+threadIdx.x; + KERNEL void extract_tri(GLOBAL_MEM ga_float *a, ga_uint N) { + unsigned int idx = GID_1 * LDIM_0 * GDIM_0 + + GID_0 * LDIM_0 + LID_0; unsigned int ix = idx/${cols}; unsigned int iy = idx%${cols}; if (idx < N) { @@ -26,6 +26,8 @@ def _generate_kernel(ctx, cols, upper=True): def triu(A, inplace=True): + if len(A.shape) != 2: + raise ValueError("triu only works for 2d arrays") if not inplace: A = A.copy() if A.flags['F_CONTIGUOUS']: @@ -40,6 +42,8 @@ def triu(A, inplace=True): def tril(A, inplace=True): + if len(A.shape) != 2: + raise ValueError("tril only works for 2d arrays") 
if not inplace: A = A.copy() if A.flags['F_CONTIGUOUS']: diff --git a/pygpu/tests/test_basic.py b/pygpu/tests/test_basic.py index 1b12b30d9a..3cfb7ce9a9 100644 --- a/pygpu/tests/test_basic.py +++ b/pygpu/tests/test_basic.py @@ -1,4 +1,5 @@ from pygpu.basic import (tril, triu) +from unittest import TestCase from .support import (gen_gpuarray, context) import numpy @@ -29,3 +30,29 @@ def test_triu(): assert numpy.all(numpy.triu(ac) == ag) else: assert numpy.all(ac == ag) + + +class test_shape(TestCase): + + def runTest(self): + self.assertRaises(ValueError, self.run_1d_triu) + self.assertRaises(ValueError, self.run_3d_triu) + self.assertRaises(ValueError, self.run_1d_tril) + self.assertRaises(ValueError, self.run_3d_tril) + + def run_1d_triu(self): + ac, ag = gen_gpuarray((10, ), 'float32', ctx=context) + triu(ag) + + def run_3d_triu(self): + ac, ag = gen_gpuarray((10, 10, 10), 'float32', ctx=context) + triu(ag) + + def run_1d_tril(self): + ac, ag = gen_gpuarray((10, ), 'float32', ctx=context) + tril(ag) + + def run_3d_tril(self): + ac, ag = gen_gpuarray((10, 10, 10), 'float32', ctx=context) + tril(ag) + From cc1b189583909299e965f8718f060bf1698c2a27 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 27 Mar 2017 23:12:49 -0400 Subject: [PATCH 203/597] Fix the NoseTester invocation so that it actually works now. --- pygpu/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pygpu/__init__.py b/pygpu/__init__.py index 9184085d01..566cd8d19b 100644 --- a/pygpu/__init__.py +++ b/pygpu/__init__.py @@ -16,6 +16,7 @@ def get_include(): def test(): + from . import tests from .tests import main if hasattr(main, "NoseTester"): - main.NoseTester().test() + main.NoseTester(package=tests).test() From 757dc608bdf5df0304bf7321a6480a9e1345576d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 28 Mar 2017 10:48:50 -0400 Subject: [PATCH 204/597] Also fix the setup script. 
--- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5e2d86c4b4..a743053903 100755 --- a/setup.py +++ b/setup.py @@ -144,5 +144,5 @@ def __init__(self, *args, **kwargs): 'blas_api.h', 'numpy_compat.h', 'collectives.h', 'collectives_api.h']}, ext_modules=cythonize(exts), - requires=['mako>=0.7', 'six'], + install_requires=['mako>=0.7', 'six'], ) From d0de2b146318ac943996432cf32ea48dd167b28f Mon Sep 17 00:00:00 2001 From: Thomas George Date: Tue, 28 Mar 2017 13:38:10 -0400 Subject: [PATCH 205/597] check that A is contiguous in tril/triu --- pygpu/basic.py | 6 ++++++ pygpu/tests/test_basic.py | 20 +++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/pygpu/basic.py b/pygpu/basic.py index d7fe5fad9a..01658da257 100644 --- a/pygpu/basic.py +++ b/pygpu/basic.py @@ -28,6 +28,9 @@ def _generate_kernel(ctx, cols, upper=True): def triu(A, inplace=True): if len(A.shape) != 2: raise ValueError("triu only works for 2d arrays") + if A.flags.c_contiguous is A.flags.f_contiguous is False: + raise ValueError("triu only works for contiguous arrays") + if not inplace: A = A.copy() if A.flags['F_CONTIGUOUS']: @@ -44,6 +47,9 @@ def triu(A, inplace=True): def tril(A, inplace=True): if len(A.shape) != 2: raise ValueError("tril only works for 2d arrays") + if A.flags.c_contiguous is A.flags.f_contiguous is False: + raise ValueError("tril only works for contiguous arrays") + if not inplace: A = A.copy() if A.flags['F_CONTIGUOUS']: diff --git a/pygpu/tests/test_basic.py b/pygpu/tests/test_basic.py index 3cfb7ce9a9..17361278b4 100644 --- a/pygpu/tests/test_basic.py +++ b/pygpu/tests/test_basic.py @@ -1,3 +1,5 @@ +import pygpu + from pygpu.basic import (tril, triu) from unittest import TestCase from .support import (gen_gpuarray, context) @@ -32,7 +34,7 @@ def test_triu(): assert numpy.all(ac == ag) -class test_shape(TestCase): +class test_errors(TestCase): def runTest(self): self.assertRaises(ValueError, 
self.run_1d_triu) @@ -40,6 +42,9 @@ def runTest(self): self.assertRaises(ValueError, self.run_1d_tril) self.assertRaises(ValueError, self.run_3d_tril) + self.assertRaises(ValueError, self.run_noncontiguous_tril) + self.assertRaises(ValueError, self.run_noncontiguous_triu) + def run_1d_triu(self): ac, ag = gen_gpuarray((10, ), 'float32', ctx=context) triu(ag) @@ -56,3 +61,16 @@ def run_3d_tril(self): ac, ag = gen_gpuarray((10, 10, 10), 'float32', ctx=context) tril(ag) + def run_noncontiguous_tril(self): + a = numpy.random.rand(5, 5) + a = a[::-1] + b = pygpu.array(a, context=context) + assert b.flags.c_contiguous is b.flags.f_contiguous is False + tril(b) + + def run_noncontiguous_triu(self): + a = numpy.random.rand(5, 5) + a = a[::-1] + b = pygpu.array(a, context=context) + assert b.flags.c_contiguous is b.flags.f_contiguous is False + triu(b) From d07f4a122e9e25ec89c10a945657a07298eeb544 Mon Sep 17 00:00:00 2001 From: Thomas George Date: Tue, 28 Mar 2017 15:31:30 -0400 Subject: [PATCH 206/597] Changed len(A.shape) to A.ndim in triu/tril --- pygpu/basic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pygpu/basic.py b/pygpu/basic.py index 01658da257..c66ea38f86 100644 --- a/pygpu/basic.py +++ b/pygpu/basic.py @@ -26,7 +26,7 @@ def _generate_kernel(ctx, cols, upper=True): def triu(A, inplace=True): - if len(A.shape) != 2: + if A.ndim != 2: raise ValueError("triu only works for 2d arrays") if A.flags.c_contiguous is A.flags.f_contiguous is False: raise ValueError("triu only works for contiguous arrays") @@ -45,7 +45,7 @@ def triu(A, inplace=True): def tril(A, inplace=True): - if len(A.shape) != 2: + if A.ndim != 2: raise ValueError("tril only works for 2d arrays") if A.flags.c_contiguous is A.flags.f_contiguous is False: raise ValueError("tril only works for contiguous arrays") From fb4eecc8a771ccef930eb613b9301d713f15a104 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Thu, 30 Mar 2017 14:45:16 -0400 Subject: [PATCH 207/597] Include "util.h" 
in "array.h". --- src/gpuarray/array.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gpuarray/array.h b/src/gpuarray/array.h index 966290be1d..271b8a1d7a 100644 --- a/src/gpuarray/array.h +++ b/src/gpuarray/array.h @@ -6,6 +6,7 @@ */ #include +#include #ifdef _MSC_VER #ifndef inline From 9e80f85dcac94bedaea6857772a26864aab207e3 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 6 Dec 2016 07:54:14 +0100 Subject: [PATCH 208/597] Add a strb method to read from a file. --- src/util/strb.c | 18 +++++++++++++++++- src/util/strb.h | 7 +++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/util/strb.c b/src/util/strb.c index b202b5065c..273aa8fa6e 100644 --- a/src/util/strb.c +++ b/src/util/strb.c @@ -1,5 +1,7 @@ - +#include #include +#include + #include "util/strb.h" strb *strb_alloc(size_t i) { @@ -55,3 +57,17 @@ void strb_appendf(strb *sb, const char *f, ...) { va_end(ap); sb->l += s; } + +void strb_read(strb *sb, int fd, size_t sz) { + ssize_t res; + char *b; + if (strb_ensure(sb, sz)) return; + b = sb->s + sb->l; + sb->l += sz; + while (sz) { + res = read(fd, b, sz); + if (res == -1 && !(errno == EAGAIN || errno == EINTR)) { strb_seterror(sb); return; } + sz -= (size_t)res; + b += (size_t)res; + } +} diff --git a/src/util/strb.h b/src/util/strb.h index b2f18449d7..490031969d 100644 --- a/src/util/strb.h +++ b/src/util/strb.h @@ -161,6 +161,13 @@ static inline void strb_appendb(strb *sb, strb *sb2) { */ GPUARRAY_LOCAL void strb_appendf(strb *, const char *f, ...); +/* + * Reads from the file specified by the given file descriptor. + * + * A read error will place the strb in error mode. + */ +GPUARRAY_LOCAL void strb_read(strb *, int fd, size_t sz); + /* * Returns a C string from the content of the strb. * From ea6dfb6b82f8c5e6a274658331f94567f76ce91f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 7 Dec 2016 06:49:00 +0100 Subject: [PATCH 209/597] Import skein, stripping away everything but skein512-512 and the portable goo. 
--- src/util/CMakeLists.txt | 1 + src/util/skein.c | 309 ++++++++++++++++++++++++++++++++++++++++ src/util/skein.h | 145 +++++++++++++++++++ 3 files changed, 455 insertions(+) create mode 100644 src/util/skein.c create mode 100644 src/util/skein.h diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 61a603b44a..5c21cc3e90 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -2,4 +2,5 @@ set_rel(UTIL_SRC strb.c xxhash.c integerfactoring.c +skein.c ) diff --git a/src/util/skein.c b/src/util/skein.c new file mode 100644 index 0000000000..50285ea2c9 --- /dev/null +++ b/src/util/skein.c @@ -0,0 +1,309 @@ +/*********************************************************************** +** +** Implementation of the Skein hash function. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. +** +************************************************************************/ + +#include /* get the memcpy/memset functions */ +#include "skein.h" /* get the Skein API definitions */ + +#define MK_64 SKEIN_MK_64 + +/* blkSize = 512 bits. 
hashSize = 512 bits */ +static const u64b_t SKEIN_512_IV_512[] = + { + MK_64(0x4903ADFF,0x749C51CE), + MK_64(0x0D95DE39,0x9746DF03), + MK_64(0x8FD19341,0x27C79BCE), + MK_64(0x9A255629,0xFF352CB1), + MK_64(0x5DB62599,0xDF6CA7B0), + MK_64(0xEABE394C,0xA9D5C3F4), + MK_64(0x991112C7,0x1A75B523), + MK_64(0xAE18A40B,0x660FCC33) + }; + +static void Skein_Put64_LSB_First(u08b_t *dst,const u64b_t *src,size_t bCnt) { + size_t n; + + for (n = 0; n < bCnt; n++) + dst[n] = (u08b_t)(src[n>>3] >> (8*(n&7))); +} + +static void Skein_Get64_LSB_First(u64b_t *dst, const u08b_t *src, + size_t wCnt) { + size_t n; + + for (n=0; n<8*wCnt; n+=8) + dst[n/8] = (((u64b_t) src[n ])) + + (((u64b_t) src[n+1]) << 8) + + (((u64b_t) src[n+2]) << 16) + + (((u64b_t) src[n+3]) << 24) + + (((u64b_t) src[n+4]) << 32) + + (((u64b_t) src[n+5]) << 40) + + (((u64b_t) src[n+6]) << 48) + + (((u64b_t) src[n+7]) << 56) ; +} + +static u64b_t Skein_Swap64(u64b_t in) { + u64b_t o; + u08b_t *out = (u08b_t *)&o; + out[7] = in >> 56; + out[6] = in >> 48; + out[5] = in >> 40; + out[4] = in >> 32; + out[3] = in >> 24; + out[2] = in >> 16; + out[1] = in >> 8; + out[0] = in; + return o; +} + +/*****************************************************************/ +/* Function to process blkCnt (nonzero) full block(s) of data. 
*/ +#define BLK_BITS (WCNT*64) /* some useful definitions for \ + code here */ +#define KW_TWK_BASE (0) +#define KW_KEY_BASE (3) +#define ks (kw + KW_KEY_BASE) +#define ts (kw + KW_TWK_BASE) + +#define RotL_64(x,N) (((x) << (N)) | ((x) >> (64-(N)))) + +static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const u08b_t *blkPtr, + size_t blkCnt, size_t byteCntAdd) { + enum { + WCNT = SKEIN_512_STATE_WORDS + }; +#define RCNT (SKEIN_512_ROUNDS_TOTAL/8) + + u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */ + u64b_t X0,X1,X2,X3,X4,X5,X6,X7; /* local copy of vars, for speed */ + u64b_t w [WCNT]; /* local copy of input block */ + + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */ + ts[0] = ctx->h.T[0]; + ts[1] = ctx->h.T[1]; + do { + /* this implementation only supports 2**64 input bytes (no carry out here) */ + ts[0] += byteCntAdd; /* update processed length */ + + /* precompute the key schedule for this block */ + ks[0] = ctx->X[0]; + ks[1] = ctx->X[1]; + ks[2] = ctx->X[2]; + ks[3] = ctx->X[3]; + ks[4] = ctx->X[4]; + ks[5] = ctx->X[5]; + ks[6] = ctx->X[6]; + ks[7] = ctx->X[7]; + ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ + ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY; + + ts[2] = ts[0] ^ ts[1]; + + Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */ + + X0 = w[0] + ks[0]; /* do the first full key injection */ + X1 = w[1] + ks[1]; + X2 = w[2] + ks[2]; + X3 = w[3] + ks[3]; + X4 = w[4] + ks[4]; + X5 = w[5] + ks[5] + ts[0]; + X6 = w[6] + ks[6] + ts[1]; + X7 = w[7] + ks[7]; + + blkPtr += SKEIN_512_BLOCK_BYTES; + + /* run the rounds */ +#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ + X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \ + X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \ + X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \ + X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \ + +#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) /* 
unrolled */ \ + Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) + +#define I512(R) \ + X0 += ks[((R)+1) % 9]; /* inject the key schedule value */ \ + X1 += ks[((R)+2) % 9]; \ + X2 += ks[((R)+3) % 9]; \ + X3 += ks[((R)+4) % 9]; \ + X4 += ks[((R)+5) % 9]; \ + X5 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \ + X6 += ks[((R)+7) % 9] + ts[((R)+2) % 3]; \ + X7 += ks[((R)+8) % 9] + (R)+1; + + { + +#define R512_8_rounds(R) /* do 8 full rounds */ \ + R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \ + R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \ + R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \ + R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \ + I512(2*(R)); \ + R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \ + R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \ + R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \ + R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \ + I512(2*(R)+1); /* and key injection */ + + R512_8_rounds( 0); + +#define R512_Unroll_R(NN) (SKEIN_512_ROUNDS_TOTAL/8 > (NN)) + + #if R512_Unroll_R( 1) + R512_8_rounds( 1); + #endif + #if R512_Unroll_R( 2) + R512_8_rounds( 2); + #endif + #if R512_Unroll_R( 3) + R512_8_rounds( 3); + #endif + #if R512_Unroll_R( 4) + R512_8_rounds( 4); + #endif + #if R512_Unroll_R( 5) + R512_8_rounds( 5); + #endif + #if R512_Unroll_R( 6) + R512_8_rounds( 6); + #endif + #if R512_Unroll_R( 7) + R512_8_rounds( 7); + #endif + #if R512_Unroll_R( 8) + R512_8_rounds( 8); + #endif + #if R512_Unroll_R( 9) + R512_8_rounds( 9); + #endif + #if R512_Unroll_R(10) + R512_8_rounds(10); + #endif + #if R512_Unroll_R(11) + R512_8_rounds(11); + #endif + #if R512_Unroll_R(12) + R512_8_rounds(12); + #endif + #if R512_Unroll_R(13) + R512_8_rounds(13); + #endif + #if R512_Unroll_R(14) + R512_8_rounds(14); + #endif + } + + /* do the final "feedforward" xor, update context chaining vars */ + ctx->X[0] = X0 ^ w[0]; + ctx->X[1] = X1 ^ w[1]; + ctx->X[2] = X2 ^ w[2]; + ctx->X[3] = X3 ^ w[3]; + ctx->X[4] = X4 ^ w[4]; + ctx->X[5] = X5 ^ w[5]; + ctx->X[6] = X6 ^ w[6]; + ctx->X[7] = X7 ^ w[7]; + + ts[1] &= ~SKEIN_T1_FLAG_FIRST; + 
} + while (--blkCnt); + ctx->h.T[0] = ts[0]; + ctx->h.T[1] = ts[1]; +} + +/*****************************************************************/ +/* 512-bit Skein */ +/*****************************************************************/ + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* init the context for a straight hashing operation */ +int Skein_512_Init(Skein_512_Ctxt_t *ctx) { + ctx->h.hashBitLen = 512; /* output hash bit count */ + memcpy(ctx->X,SKEIN_512_IV_512,sizeof(ctx->X)); + + /* Set up to process the data message portion of the hash (default) */ + Skein_Start_New_Type(ctx,MSG); /* T0=0, T1= MSG type */ + + return SKEIN_SUCCESS; +} + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* process the input bytes */ +int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, + size_t msgByteCnt) { + size_t n; + + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + /* process full blocks, if any */ + if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) { + if (ctx->h.bCnt) { /* finish up any buffered message data */ + n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ + if (n) { + Skein_assert(n < msgByteCnt); /* check on our logic here */ + memcpy(&ctx->b[ctx->h.bCnt],msg,n); + msgByteCnt -= n; + msg += n; + ctx->h.bCnt += n; + } + Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES); + Skein_512_Process_Block(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES); + ctx->h.bCnt = 0; + } + /* now process any remaining full blocks, directly from input message data */ + if (msgByteCnt > SKEIN_512_BLOCK_BYTES) { + n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES; /* number of full blocks to process */ + Skein_512_Process_Block(ctx,msg,n,SKEIN_512_BLOCK_BYTES); + msgByteCnt -= n * SKEIN_512_BLOCK_BYTES; + msg += n * SKEIN_512_BLOCK_BYTES; + } + Skein_assert(ctx->h.bCnt == 0); + } + + /* copy any remaining source message data bytes into b[] */ + if (msgByteCnt) { + 
Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES); + memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + ctx->h.bCnt += msgByteCnt; + } + + return SKEIN_SUCCESS; +} + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* finalize the hash computation and output the result */ +int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal) { + size_t i,n,byteCnt; + u64b_t X[SKEIN_512_STATE_WORDS]; + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) /* zero pad b[] if necessary */ + memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); + + Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + + /* now output the result */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ + + /* run Threefish in "counter mode" to generate output */ + memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ + memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ + for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) { + ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ + Skein_Start_New_Type(ctx,OUT_FINAL); + Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ + n = byteCnt - i*SKEIN_512_BLOCK_BYTES; /* number of output bytes left to go */ + if (n >= SKEIN_512_BLOCK_BYTES) + n = SKEIN_512_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */ + memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ + } + return SKEIN_SUCCESS; +} diff --git a/src/util/skein.h b/src/util/skein.h new file mode 100644 index 0000000000..f21e64409a --- /dev/null +++ b/src/util/skein.h @@ -0,0 +1,145 @@ +#ifndef _SKEIN_H_ +#define _SKEIN_H_ 1 
+/************************************************************************** +** +** Interface declarations and internal definitions for Skein hashing. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. +** +*************************************************************************** +** +** The following compile-time switches may be defined to control some +** tradeoffs between speed, code size, error checking, and security. +** +** The "default" note explains what happens when the switch is not defined. +** +** SKEIN_ERR_CHECK -- how error checking is handled inside Skein +** code. If not defined, most error checking +** is disabled (for performance). Otherwise, +** the switch value is interpreted as: +** 0: use assert() to flag errors +** 1: return SKEIN_FAIL to flag errors +** +***************************************************************************/ +#ifdef __cplusplus +extern "C" +{ +#endif + +#include /* get size_t definition */ +#include +typedef unsigned int uint_t; +typedef uint8_t u08b_t; +typedef uint64_t u64b_t; + +enum { + SKEIN_SUCCESS = 0, /* return codes from Skein calls */ + SKEIN_FAIL = 1 +}; + +#define SKEIN_MODIFIER_WORDS ( 2) /* number of modifier (tweak) words */ + +#define SKEIN_512_STATE_WORDS ( 8) + +#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS) +#define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS) +#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS) + +typedef struct { + size_t hashBitLen; /* size of hash result, in bits */ + size_t bCnt; /* current byte count in buffer b[] */ + u64b_t T[SKEIN_MODIFIER_WORDS]; /* tweak words: T[0]=byte cnt, T[1]=flags */ +} Skein_Ctxt_Hdr_t; + +typedef struct { /* 512-bit Skein hash context structure */ + Skein_Ctxt_Hdr_t h; /* common header context variables */ + u64b_t X[SKEIN_512_STATE_WORDS]; /* chaining variables */ + u08b_t b[SKEIN_512_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ +} 
Skein_512_Ctxt_t; + +/* Skein APIs for (incremental) "straight hashing" */ +int Skein_512_Init (Skein_512_Ctxt_t *ctx); +int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt); +int Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal); + +/***************************************************************** +** "Internal" Skein definitions +** -- not needed for sequential hashing API, but will be +** helpful for other uses of Skein (e.g., tree hash mode). +** -- included here so that they can be shared between +** reference and optimized code. +******************************************************************/ + +/* tweak word T[1]: bit field starting positions */ +#define SKEIN_T1_BIT(BIT) ((BIT) - 64) /* offset 64 because it's the second word */ + +#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */ +#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */ +#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */ + +/* tweak word T[1]: flag bit definition(s) */ +#define SKEIN_T1_FLAG_FIRST (((u64b_t) 1 ) << SKEIN_T1_POS_FIRST) +#define SKEIN_T1_FLAG_FINAL (((u64b_t) 1 ) << SKEIN_T1_POS_FINAL) + +/* tweak word T[1]: block type field */ +#define SKEIN_BLK_TYPE_MSG (48) /* message processing */ +#define SKEIN_BLK_TYPE_OUT (63) /* output stage */ + +#define SKEIN_T1_BLK_TYPE(T) (((u64b_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE) +#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */ +#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */ + +#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL) + +#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((u64b_t) (hi32)) << 32)) +#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) + +/* +** Skein macros for setting tweak words, etc. 
+**/ +#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal) {(ctxPtr)->h.T[TWK_NUM] = (tVal);} + +#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0) +#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1) + +/* set both tweak words at once */ +#define Skein_Set_T0_T1(ctxPtr,T0,T1) \ + { \ + Skein_Set_T0(ctxPtr,(T0)); \ + Skein_Set_T1(ctxPtr,(T1)); \ + } + +/* set up for starting with a new type: h.T[0]=0; h.T[1] = NEW_TYPE; h.bCnt=0; */ +#define Skein_Start_New_Type(ctxPtr,BLK_TYPE) \ + { Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; } + +/************************************************** +** "Internal" Skein definitions for error checking +***************************************************/ + +#include +#define Skein_Assert(x,retCode) { if (!(x)) return retCode; } /* caller error */ +#define Skein_assert(x) assert(x) /* internal error */ + +/***************************************************************** +** Skein block function constants (shared across Ref and Opt code) +******************************************************************/ +enum { + /* Skein_512 round rotation constants */ + R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37, + R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42, + R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39, + R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56, + R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24, + R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17, + R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43, + R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22, +}; + +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _SKEIN_H_ */ From 0cb043f2fcf72396c85088dfb36897553353d777 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 7 Dec 2016 12:21:55 +0100 Subject: [PATCH 210/597] Add a function to do all the steps. 
--- src/util/skein.c | 8 ++++++++ src/util/skein.h | 1 + 2 files changed, 9 insertions(+) diff --git a/src/util/skein.c b/src/util/skein.c index 50285ea2c9..51362e5efb 100644 --- a/src/util/skein.c +++ b/src/util/skein.c @@ -307,3 +307,11 @@ int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal) { } return SKEIN_SUCCESS; } + +int Skein_512(const u08b_t *msg, size_t msgByteCnt, u08b_t *hashVal) { + Skein_512_Ctxt_t ctx; + if (Skein_512_Init(&ctx)) return SKEIN_FAIL; + if (Skein_512_Update(&ctx, msg, msgByteCnt)) return SKEIN_FAIL; + if (Skein_512_Final(&ctx, hashVal)) return SKEIN_FAIL; + return SKEIN_SUCCESS; +} diff --git a/src/util/skein.h b/src/util/skein.h index f21e64409a..89d7ebf209 100644 --- a/src/util/skein.h +++ b/src/util/skein.h @@ -63,6 +63,7 @@ typedef struct { /* 512-bit Skein hash context structure */ int Skein_512_Init (Skein_512_Ctxt_t *ctx); int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt); int Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal); +int Skein_512(const u08b_t *msg, size_t msgByteCnt, u08b_t *hashVal); /***************************************************************** ** "Internal" Skein definitions From dd06f091bbc518883a5e117a931ef9b9008aaecc Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 8 Dec 2016 21:06:31 +0100 Subject: [PATCH 211/597] Add strb_write(). 
--- src/util/strb.c | 24 +++++++++++++++++++++++- src/util/strb.h | 7 +++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/util/strb.c b/src/util/strb.c index 273aa8fa6e..15cd496c4f 100644 --- a/src/util/strb.c +++ b/src/util/strb.c @@ -66,8 +66,30 @@ void strb_read(strb *sb, int fd, size_t sz) { sb->l += sz; while (sz) { res = read(fd, b, sz); - if (res == -1 && !(errno == EAGAIN || errno == EINTR)) { strb_seterror(sb); return; } + if (res == -1) { + if (errno == EAGAIN || errno == EINTR) + continue; + strb_seterror(sb); + return; + } sz -= (size_t)res; b += (size_t)res; } } + +int strb_write(int fd, strb *sb) { + ssize_t res; + size_t l = sb->l; + char *b = sb->s; + while (l) { + res = write(fd, b, l); + if (res == -1) { + if (errno == EAGAIN || errno == EINTR) + continue; + return -1; + } + l -= (size_t)res; + b += (size_t)res; + } + return 0; +} diff --git a/src/util/strb.h b/src/util/strb.h index 490031969d..3fc1071ea4 100644 --- a/src/util/strb.h +++ b/src/util/strb.h @@ -168,6 +168,13 @@ GPUARRAY_LOCAL void strb_appendf(strb *, const char *f, ...); */ GPUARRAY_LOCAL void strb_read(strb *, int fd, size_t sz); +/* + * Write the content of an strb to the specified file descriptor. + * + * Write errors will be signaled by a nonzero return value. + */ +GPUARRAY_LOCAL int strb_write(int fd, strb *sb); + /* * Returns a C string from the content of the strb. * From c59ddae81487b2d71196f0627d205cbb7dc6714b Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 8 Dec 2016 21:55:58 +0100 Subject: [PATCH 212/597] Fix wrong export type. --- src/gpuarray/buffer.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index d6d3dd8a09..800756a072 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -328,9 +328,9 @@ GPUARRAY_PUBLIC int gpudata_move(gpudata *dst, size_t dstoff, * \returns the new buffer in dst_ctx or NULL if no efficient way to * transfer could be found. 
*/ -GPUARRAY_LOCAL int gpudata_transfer(gpudata *dst, size_t dstoff, - gpudata *src, size_t srcoff, - size_t sz); +GPUARRAY_PUBLIC int gpudata_transfer(gpudata *dst, size_t dstoff, + gpudata *src, size_t srcoff, + size_t sz); /** * Transfer data from a buffer to memory. From b12013bf410ea1d7fb6b1a4665a6dc353b530b4f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 8 Dec 2016 22:02:39 +0100 Subject: [PATCH 213/597] Remove GPUARRAY_LOCAL, it is not needed since we default to visibility=hidden --- src/gpuarray/config.h | 6 +----- src/gpuarray_blas_cuda_cublas.c | 2 +- src/gpuarray_blas_opencl_clblas.c | 2 +- src/gpuarray_blas_opencl_clblast.c | 2 +- src/gpuarray_buffer_cuda.c | 3 +-- src/gpuarray_buffer_opencl.c | 3 +-- src/gpuarray_collectives_cuda_nccl.c | 2 +- src/private.h | 28 ++++++++++++++-------------- src/private_config.h.in | 6 +++--- src/private_cuda.h | 17 ++++++++--------- src/private_opencl.h | 8 ++++---- src/util/strb.h | 12 ++++++------ src/util/xxhash.h | 8 ++++---- 13 files changed, 46 insertions(+), 53 deletions(-) diff --git a/src/gpuarray/config.h b/src/gpuarray/config.h index f8fc86a01d..571f81cfe6 100644 --- a/src/gpuarray/config.h +++ b/src/gpuarray/config.h @@ -12,19 +12,15 @@ #else #define GPUARRAY_PUBLIC __declspec(dllimport) #endif - #define GPUARRAY_LOCAL #else #if __GNUC__ >= 4 #define GPUARRAY_PUBLIC __attribute__((visibility ("default"))) - #define GPUARRAY_LOCAL __attribute__((visibility ("hidden"))) #else - #define GPUARRAY_PUBLIC - #define GPUARRAY_LOCAL + #error "Don't know how to export symbols on this platform" #endif #endif #else #define GPUARRAY_PUBLIC - #define GPUARRAY_LOCAL #endif #ifdef _MSC_VER diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 6d4648e232..a7b91ed87e 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -1640,7 +1640,7 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, return GA_NO_ERROR; } -GPUARRAY_LOCAL 
gpuarray_blas_ops cublas_ops = { +gpuarray_blas_ops cublas_ops = { setup, teardown, error, diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 2041710735..8ee019afb7 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -491,7 +491,7 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, return GA_NO_ERROR; } -GPUARRAY_LOCAL gpuarray_blas_ops clblas_ops = { +gpuarray_blas_ops clblas_ops = { setup, teardown, error, diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c index 78cca10f20..4a5369e56e 100644 --- a/src/gpuarray_blas_opencl_clblast.c +++ b/src/gpuarray_blas_opencl_clblast.c @@ -525,7 +525,7 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, return GA_NO_ERROR; } -GPUARRAY_LOCAL gpuarray_blas_ops clblast_ops = { +gpuarray_blas_ops clblast_ops = { setup, teardown, error, diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 447400f277..1c883deaa7 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -39,7 +39,7 @@ STATIC_ASSERT(sizeof(GpuArrayIpcMemHandle) == sizeof(CUipcMemHandle), cuda_ipcme static CUresult err; -GPUARRAY_LOCAL const gpuarray_buffer_ops cuda_ops; +const gpuarray_buffer_ops cuda_ops; static void cuda_freekernel(gpukernel *); static int cuda_property(gpucontext *, gpudata *, gpukernel *, int, void *); @@ -1689,7 +1689,6 @@ static const char *cuda_error(gpucontext *c) { return errstr; } -GPUARRAY_LOCAL const gpuarray_buffer_ops cuda_ops = {cuda_get_platform_count, cuda_get_device_count, cuda_init, diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index adf34a3825..84bcb6584d 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -28,7 +28,7 @@ static cl_int err; #define CHKFAIL(v) if (err != CL_SUCCESS) FAIL(v, GA_IMPL_ERROR) -GPUARRAY_LOCAL const gpuarray_buffer_ops opencl_ops; +const gpuarray_buffer_ops opencl_ops; 
static int cl_property(gpucontext *c, gpudata *b, gpukernel *k, int p, void *r); static gpudata *cl_alloc(gpucontext *c, size_t size, void *data, int flags, @@ -1448,7 +1448,6 @@ static const char *cl_error(gpucontext *c) { } } -GPUARRAY_LOCAL const gpuarray_buffer_ops opencl_ops = {cl_get_platform_count, cl_get_device_count, cl_init, diff --git a/src/gpuarray_collectives_cuda_nccl.c b/src/gpuarray_collectives_cuda_nccl.c index a0f6d12060..e382cfa066 100644 --- a/src/gpuarray_collectives_cuda_nccl.c +++ b/src/gpuarray_collectives_cuda_nccl.c @@ -455,6 +455,6 @@ static int all_gather(gpudata* src, size_t offsrc, gpudata* dest, * linked in \ref gpuarray_buffer_cuda.c, in order to fill a /ref gpucontext's * comm_ops. */ -GPUARRAY_LOCAL gpuarray_comm_ops nccl_ops = { +gpuarray_comm_ops nccl_ops = { comm_new, comm_free, generate_clique_id, get_count, get_rank, reduce, all_reduce, reduce_scatter, broadcast, all_gather}; diff --git a/src/private.h b/src/private.h index 0513df8605..7405cf8995 100644 --- a/src/private.h +++ b/src/private.h @@ -256,26 +256,26 @@ static inline void *memdup(const void *p, size_t s) { return res; } -GPUARRAY_LOCAL int GpuArray_is_c_contiguous(const GpuArray *a); -GPUARRAY_LOCAL int GpuArray_is_f_contiguous(const GpuArray *a); -GPUARRAY_LOCAL int GpuArray_is_aligned(const GpuArray *a); +int GpuArray_is_c_contiguous(const GpuArray *a); +int GpuArray_is_f_contiguous(const GpuArray *a); +int GpuArray_is_aligned(const GpuArray *a); -GPUARRAY_LOCAL extern const gpuarray_type scalar_types[]; -GPUARRAY_LOCAL extern const gpuarray_type vector_types[]; +extern const gpuarray_type scalar_types[]; +extern const gpuarray_type vector_types[]; /* * This function generates the kernel code to perform indexing on var id * from planar index 'i' using the dimensions and strides provided. 
*/ -GPUARRAY_LOCAL void gpuarray_elem_perdim(strb *sb, unsigned int nd, - const size_t *dims, - const ssize_t *str, - const char *id); - -GPUARRAY_LOCAL void gpukernel_source_with_line_numbers(unsigned int count, - const char **news, - size_t *newl, - strb *src); +void gpuarray_elem_perdim(strb *sb, unsigned int nd, + const size_t *dims, + const ssize_t *str, + const char *id); + +void gpukernel_source_with_line_numbers(unsigned int count, + const char **news, + size_t *newl, + strb *src); static inline uint16_t float_to_half(float value) { #define ga__shift 13 diff --git a/src/private_config.h.in b/src/private_config.h.in index 23db862c4f..c3cd3a0195 100644 --- a/src/private_config.h.in +++ b/src/private_config.h.in @@ -39,12 +39,12 @@ extern "C" { #define nelems(a) (sizeof(a)/sizeof(a[0])) #ifndef HAVE_MKSTEMP -GPUARRAY_LOCAL int mkstemp(char *path); +int mkstemp(char *path); #endif #ifndef HAVE_STRL -GPUARRAY_LOCAL size_t strlcpy(char *dst, const char *src, size_t size); -GPUARRAY_LOCAL size_t strlcat(char *dst, const char *src, size_t size); +size_t strlcpy(char *dst, const char *src, size_t size); +size_t strlcat(char *dst, const char *src, size_t size); #endif #ifdef __cplusplus diff --git a/src/private_cuda.h b/src/private_cuda.h index 6fab1597ac..da6f60ad7a 100644 --- a/src/private_cuda.h +++ b/src/private_cuda.h @@ -93,10 +93,10 @@ STATIC_ASSERT(sizeof(cuda_context) <= sizeof(gpucontext), #define ARCH_PREFIX "compute_" -GPUARRAY_LOCAL cuda_context *cuda_make_ctx(CUcontext ctx, int flags); -GPUARRAY_LOCAL CUstream cuda_get_stream(cuda_context *ctx); -GPUARRAY_LOCAL void cuda_enter(cuda_context *ctx); -GPUARRAY_LOCAL void cuda_exit(cuda_context *ctx); +cuda_context *cuda_make_ctx(CUcontext ctx, int flags); +CUstream cuda_get_stream(cuda_context *ctx); +void cuda_enter(cuda_context *ctx); +void cuda_exit(cuda_context *ctx); struct _gpudata { CUdeviceptr ptr; @@ -115,11 +115,10 @@ struct _gpudata { #endif }; -GPUARRAY_LOCAL gpudata *cuda_make_buf(cuda_context 
*c, CUdeviceptr p, - size_t sz); -GPUARRAY_LOCAL size_t cuda_get_sz(gpudata *g); -GPUARRAY_LOCAL int cuda_wait(gpudata *, int); -GPUARRAY_LOCAL int cuda_record(gpudata *, int); +gpudata *cuda_make_buf(cuda_context *c, CUdeviceptr p, size_t sz); +size_t cuda_get_sz(gpudata *g); +int cuda_wait(gpudata *, int); +int cuda_record(gpudata *, int); /* private flags are in the upper 16 bits */ #define CUDA_WAIT_READ 0x10000 diff --git a/src/private_opencl.h b/src/private_opencl.h index e40242d57e..2a523f5bda 100644 --- a/src/private_opencl.h +++ b/src/private_opencl.h @@ -67,9 +67,9 @@ struct _gpukernel { #endif }; -GPUARRAY_LOCAL cl_ctx *cl_make_ctx(cl_context ctx, int flags); -GPUARRAY_LOCAL cl_command_queue cl_get_stream(gpucontext *ctx); -GPUARRAY_LOCAL gpudata *cl_make_buf(gpucontext *c, cl_mem buf); -GPUARRAY_LOCAL cl_mem cl_get_buf(gpudata *g); +cl_ctx *cl_make_ctx(cl_context ctx, int flags); +cl_command_queue cl_get_stream(gpucontext *ctx); +gpudata *cl_make_buf(gpucontext *c, cl_mem buf); +cl_mem cl_get_buf(gpudata *g); #endif diff --git a/src/util/strb.h b/src/util/strb.h index 3fc1071ea4..267941417e 100644 --- a/src/util/strb.h +++ b/src/util/strb.h @@ -39,14 +39,14 @@ typedef struct _strb { * * Returns NULL on error. */ -GPUARRAY_LOCAL strb *strb_alloc(size_t s); +strb *strb_alloc(size_t s); /* * Frees an strb that was dynamically allocated. * * Don't call this for stack of global declarations, see strb_clear() instead. */ -GPUARRAY_LOCAL void strb_free(strb *); +void strb_free(strb *); /* * Return a pointer to a dynamically allocated strb with a default @@ -96,7 +96,7 @@ static inline void strb_clear(strb *sb) { * This should almost never be called directly. Use strb_ensure() * instead. 
*/ -GPUARRAY_LOCAL int strb_grow(strb *, size_t s); +int strb_grow(strb *, size_t s); /* * Make sure there is space to store at least `s` bytes of data after @@ -159,21 +159,21 @@ static inline void strb_appendb(strb *sb, strb *sb2) { * * A format error will place the strb in error mode. */ -GPUARRAY_LOCAL void strb_appendf(strb *, const char *f, ...); +void strb_appendf(strb *, const char *f, ...); /* * Reads from the file specified by the given file descriptor. * * A read error will place the strb in error mode. */ -GPUARRAY_LOCAL void strb_read(strb *, int fd, size_t sz); +void strb_read(strb *, int fd, size_t sz); /* * Write the content of an strb to the specified file descriptor. * * Write errors will be signaled by a nonzero return value. */ -GPUARRAY_LOCAL int strb_write(int fd, strb *sb); +int strb_write(int fd, strb *sb); /* * Returns a C string from the content of the strb. diff --git a/src/util/xxhash.h b/src/util/xxhash.h index c33938234d..6403b9e6f9 100644 --- a/src/util/xxhash.h +++ b/src/util/xxhash.h @@ -106,7 +106,7 @@ They will be automatically translated by this header. * Simple Hash Functions *****************************/ -GPUARRAY_LOCAL unsigned int XXH32 (const void* input, size_t length, unsigned seed); +unsigned int XXH32 (const void* input, size_t length, unsigned seed); /* XXH32() : @@ -129,9 +129,9 @@ These structures allow static allocation of XXH states. States must then be initialized using XXH32_reset() before first use. 
*/ -GPUARRAY_LOCAL XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, unsigned seed); -GPUARRAY_LOCAL XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); -GPUARRAY_LOCAL unsigned int XXH32_digest (const XXH32_state_t* statePtr); +XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, unsigned seed); +XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); +unsigned int XXH32_digest (const XXH32_state_t* statePtr); /* These functions calculate the xxHash of an input provided in multiple smaller packets, From 339ac160824a3c916614f8133b1526f39a5aead6 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 8 Dec 2016 22:04:47 +0100 Subject: [PATCH 214/597] Remove some useless stuff in xxhash. --- src/util/xxhash.c | 64 +---------------------------------------------- src/util/xxhash.h | 28 --------------------- 2 files changed, 1 insertion(+), 91 deletions(-) diff --git a/src/util/xxhash.c b/src/util/xxhash.c index 58101b0902..bd2447ca16 100644 --- a/src/util/xxhash.c +++ b/src/util/xxhash.c @@ -31,39 +31,6 @@ You can contact the author at : - xxHash source repository : https://github.com/Cyan4973/xxHash */ - -/************************************** -* Tuning parameters -**************************************/ -/* XXH_FORCE_MEMORY_ACCESS - * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. - * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. - * The below switch allow to select different access method for improved performance. - * Method 0 (default) : use `memcpy()`. Safe and portable. - * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). - * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. - * Method 2 : direct access. This method is portable but violate C standard. 
- * It can generate buggy code on targets which generate assembly depending on alignment. - * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) - * See http://stackoverflow.com/a/32095106/646947 for details. - * Prefer these methods in priority order (0 > 1 > 2) - */ -#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ -# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) -# define XXH_FORCE_MEMORY_ACCESS 2 -# elif defined(__INTEL_COMPILER) || \ - (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) -# define XXH_FORCE_MEMORY_ACCESS 1 -# endif -#endif - -/* XXH_ACCEPT_NULL_INPUT_POINTER : - * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer. - * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input. - * By default, this option is disabled. To enable it, uncomment below define : - */ -/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */ - /* XXH_FORCE_NATIVE_FORMAT : * By default, xxHash library provides endian-independant Hash values, based on little-endian convention. * Results are therefore identical for little-endian and big-endian CPU. @@ -72,7 +39,7 @@ You can contact the author at : * to improve speed for Big-endian CPU. * This option has no impact on Little_Endian CPU. */ -#define XXH_FORCE_NATIVE_FORMAT 0 +#define XXH_FORCE_NATIVE_FORMAT 1 /* XXH_USELESS_ALIGN_BRANCH : * This is a minor performance trick, only useful with lots of very small keys. 
@@ -132,25 +99,6 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcp #endif -#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) - -/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ -static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; } - -#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) - -/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ -/* currently only defined for gcc and icc */ -typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign; - -static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } - -#else - -/* portable and safe solution. Generally efficient. - * see : http://stackoverflow.com/a/32095106/646947 - */ - static U32 XXH_read32(const void* memPtr) { U32 val; @@ -158,8 +106,6 @@ static U32 XXH_read32(const void* memPtr) return val; } -#endif // XXH_FORCE_DIRECT_MEMORY_ACCESS - /****************************************** * Compiler-specific Functions and Macros @@ -243,14 +189,6 @@ FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH U32 h32; #define XXH_get32bits(p) XXH_readLE32_align(p, endian, align) -#ifdef XXH_ACCEPT_NULL_INPUT_POINTER - if (p==NULL) - { - len=0; - bEnd=p=(const BYTE*)(size_t)16; - } -#endif - if (len>=16) { const BYTE* const limit = bEnd - 16; diff --git a/src/util/xxhash.h b/src/util/xxhash.h index 6403b9e6f9..1d11a095fb 100644 --- a/src/util/xxhash.h +++ b/src/util/xxhash.h @@ -74,34 +74,6 @@ extern "C" { typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; -/***************************** -* Namespace Emulation -*****************************/ -/* Motivations : - -If you need to include xxHash into your library, -but wish to avoid xxHash symbols to be present on your library interface -in an effort to avoid potential name collision if another library also includes 
xxHash, - -you can use XXH_NAMESPACE, which will automatically prefix any symbol from xxHash -with the value of XXH_NAMESPACE (so avoid to keep it NULL, and avoid numeric values). - -Note that no change is required within the calling program : -it can still call xxHash functions using their regular name. -They will be automatically translated by this header. -*/ -#ifdef XXH_NAMESPACE -# define XXH_CAT(A,B) A##B -# define XXH_NAME2(A,B) XXH_CAT(A,B) -# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) -# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) -# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) -# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) -# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) -# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) -#endif - - /***************************** * Simple Hash Functions *****************************/ From 7ca34ed68908a348d3c40d80205556d369f321e0 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 8 Dec 2016 22:46:22 +0100 Subject: [PATCH 215/597] Disk cache implementation. 
--- src/CMakeLists.txt | 1 + src/cache/disk.c | 301 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 302 insertions(+) create mode 100644 src/cache/disk.c diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 02e32eccd4..0ed776533d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -35,6 +35,7 @@ endmacro() set(_GPUARRAY_SRC cache/lru.c cache/twoq.c +cache/disk.c gpuarray_types.c gpuarray_error.c gpuarray_util.c diff --git a/src/cache/disk.c b/src/cache/disk.c new file mode 100644 index 0000000000..59e6f1ba42 --- /dev/null +++ b/src/cache/disk.c @@ -0,0 +1,301 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "cache.h" +#include "private_config.h" +#include "util/strb.h" +#include "util/skein.h" + +#define HEXP_LEN (128 + 2) + +typedef int (*kwrite_fn)(strb *res, cache_key_t key); +typedef int (*vwrite_fn)(strb *res, cache_value_t val); +typedef cache_key_t (*kread_fn)(const strb *b); +typedef cache_value_t (*vread_fn)(const strb *b); + +typedef struct _disk_cache { + cache c; + cache * mem; + kwrite_fn kwrite; + vwrite_fn vwrite; + kread_fn kread; + vread_fn vread; + int dirfd; +} disk_cache; + + +static unsigned long long ntohull(const char *in) { + return ((unsigned long long)in[0] << 56 | (unsigned long long)in[1] << 48 | + (unsigned long long)in[2] << 40 | (unsigned long long)in[3] << 32 | + (unsigned long long)in[4] << 24 | (unsigned long long)in[5] << 16 | + (unsigned long long)in[6] << 8 | (unsigned long long)in[7]); +} + +static void htonull(unsigned long long in, char *out) { + out[0] = in >> 56; + out[1] = in >> 48; + out[2] = in >> 40; + out[3] = in >> 32; + out[4] = in >> 24; + out[5] = in >> 16; + out[6] = in >> 8; + out[7] = in; +} + +static int mkstempat(int dfd, char *template) { + static const char letters[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; + size_t length; + char *XXXXXX; + struct timeval tv; + unsigned long long randnum, working; + int i, tries, 
fd; + + length = strlen(template); + if (length < 6) { + errno = EINVAL; + return -1; + } + XXXXXX = template + length - 6; + if (strcmp(XXXXXX, "XXXXXX") != 0) { + errno = EINVAL; + return -1; + } + + /* This is kind of crappy, but the point is to not step on each + other's feet */ + gettimeofday(&tv, NULL); + randnum = ((unsigned long long) tv.tv_usec << 16) ^ tv.tv_sec ^ getpid(); + + for (tries = 0; tries < TMP_MAX; tries++) { + for (working = randnum, i = 0; i < 6; i++) { + XXXXXX[i] = letters[working % 62]; + working /= 62; + } + fd = openat(dfd, template, O_RDWR | O_CREAT | O_EXCL, 0600); + if (fd >= 0 || (errno != EEXIST && errno != EISDIR)) + return fd; + + randnum += (tv.tv_usec >> 10) & 0xfff; + } + errno = EEXIST; + return -1; +} + +static int key_path(disk_cache *c, const cache_key_t key, char *out) { + strb kb = STRB_STATIC_INIT; + unsigned char hash[64]; + int i; + + if (c->kwrite(&kb, key)) return -1; + if (Skein_512((unsigned char *)kb.s, kb.l, hash)) return -1; + if (snprintf(out, 6, "%02x%02x/%02x%02x", + hash[0], hash[1], hash[2], hash[3]) != 5) + return -1; + for (i = 4; i < 64; i += 4) { + if (snprintf(out+(i * 2 + 1), 9, "%02x%02x%02x%02x", + hash[i], hash[i+1], hash[i+2], hash[i+3]) != 8) + return -1; + } + return 0; +} + +static int write_entry(disk_cache *c, const cache_key_t k, + const cache_value_t v) { + char hexp[HEXP_LEN]; + char tmp_path[] = "tmp.XXXXXXXX"; + strb b = STRB_STATIC_INIT; + size_t kl, vl; + int fd, err; + + if (key_path(c, k, hexp)) return -1; + + if (!strb_ensure(&b, 16)) return -1; + b.l = 16; + c->kwrite(&b, k); + kl = b.l - 16; + c->vwrite(&b, v); + vl = b.l - kl - 16; + htonull(kl, b.s); + htonull(vl, b.s + 8); + if (strb_error(&b)) { + strb_clear(&b); + return -1; + } + + fd = mkstempat(c->dirfd, tmp_path); + if (fd == -1) { + strb_clear(&b); + return -1; + } + + err = strb_write(fd, &b); + strb_clear(&b); + close(fd); + if (err) { + unlinkat(c->dirfd, tmp_path, 0); + return -1; + } + + if (renameat(c->dirfd, 
tmp_path, c->dirfd, hexp)) { + unlinkat(c->dirfd, tmp_path, 0); + return -1; + } + + return 0; +} + +static int find_entry(disk_cache *c, const cache_key_t key, + cache_key_t *_k, cache_value_t *_v) { + struct stat st; + strb b = STRB_STATIC_INIT; + char *ts; + size_t kl, vl; + cache_key_t k; + char hexp[HEXP_LEN]; + int fd; + + if (key_path(c, key, hexp)) return 0; + + fd = openat(c->dirfd, hexp, O_RDONLY); + + if (fd == -1) return 0; + + if (fstat(fd, &st)) { + close(fd); + return 0; + } + + if (!(st.st_mode & S_IFREG)) { + close(fd); + return 0; + } + + strb_read(&b, fd, st.st_size); + close(fd); + + if (strb_error(&b) || b.l < 16) { + strb_clear(&b); + return 0; + } + + kl = ntohull(b.s); + vl = ntohull(b.s + 8); + + if (b.l < 16 + kl + vl) { + strb_clear(&b); + return 0; + } + + ts = b.s; + + b.s += 16; + b.l = kl; + + k = c->kread(&b); + if (k && c->c.keq(key, k)) { + if (_v) { + b.s += kl; + b.l = vl; + *_v = c->vread(&b); + if (*_v == NULL) + goto error; + } + if (_k) + *_k = k; + else + c->c.kfree(k); + b.s = ts; + strb_clear(&b); + return 1; + } + error: + c->c.kfree(k); + b.s = ts; + strb_clear(&b); + return 0; +} + +static int disk_add(cache *_c, cache_key_t k, cache_value_t v) { + disk_cache *c = (disk_cache *)_c; + + /* Ignore write errors */ + write_entry(c, k, v); + + return cache_add(c->mem, k, v); +} + +static int disk_del(cache *_c, const cache_key_t key) { + disk_cache *c = (disk_cache *)_c; + char hexp[HEXP_LEN] = {0}; + + cache_del(c->mem, key); + + key_path(c, key, hexp); + + return (unlinkat(c->dirfd, hexp, 0) == 0); +} + +static cache_value_t disk_get(cache *_c, const cache_key_t key) { + disk_cache *c = (disk_cache *)_c; + cache_key_t k; + cache_value_t v; + + v = cache_get(c->mem, key); + if (v != NULL) + return v; + + if (find_entry(c, key, &k, &v)) { + if (cache_add(c->mem, k, v)) return NULL; + return v; + } + return NULL; +} + +static void disk_destroy(cache *_c) { + disk_cache *c = (disk_cache *)_c; + cache_destroy(c->mem); + 
close(c->dirfd); +} + +cache *cache_disk(const char *dirpath, cache *mem, + kwrite_fn kwrite, vwrite_fn vwrite, + kread_fn kread, vread_fn vread) { + struct stat st; + disk_cache *res; + + mkdir(dirpath, 0777); /* This may fail, but we don't care */ + if (lstat(dirpath, &st) != 0) + return NULL; + if (!(st.st_mode & S_IFDIR)) + return NULL; + + res = calloc(sizeof(*res), 1); + if (res == NULL) return NULL; + + res->dirfd = open(dirpath, O_RDWR|O_CLOEXEC); + if (res->dirfd == -1) { + free(res); + return NULL; + } + + res->mem = mem; + res->kwrite = kwrite; + res->vwrite = vwrite; + res->kread = kread; + res->vread = vread; + res->c.add = disk_add; + res->c.del = disk_del; + res->c.get = disk_get; + res->c.destroy = disk_destroy; + res->c.keq = mem->keq; + res->c.khash = mem->khash; + res->c.kfree = mem->kfree; + res->c.vfree = mem->vfree; + return (cache *)res; +} From 02b97e7469d6dfd47765414c062ca507e9af6005 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 9 Dec 2016 22:07:18 +0100 Subject: [PATCH 216/597] Cleanup tool. 
---
 bin/gpuarray-cache | 66 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 bin/gpuarray-cache

diff --git a/bin/gpuarray-cache b/bin/gpuarray-cache
new file mode 100644
index 0000000000..528e3eb9a5
--- /dev/null
+++ b/bin/gpuarray-cache
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+"""Prune the libgpuarray on-disk cache down to a maximum total size."""
+
+import os
+
+
+def clean(max_size):
+    """Delete the least recently accessed cache files until the rest fit.
+
+    Walks the directory named by the GPUARRAY_CACHE environment variable
+    (default: ~/.gpuarray/cache/), keeps the most recently accessed files
+    whose cumulative size stays within `max_size` bytes and removes the rest.
+    """
+    cache_dir = os.path.expanduser(os.environ.get('GPUARRAY_CACHE',
+                                                  '~/.gpuarray/cache/'))
+    content = []
+    for root, dirs, files in os.walk(cache_dir):
+        for file in files:
+            fpath = os.path.join(root, file)
+            st = os.stat(fpath)
+            content.append((st.st_atime, st.st_size, fpath))
+
+    # Newest first: the files that push the running total over the limit
+    # are then the oldest ones, which are the ones we want to evict.
+    content.sort(reverse=True)
+    cur_size = 0
+    for _, size, path in content:
+        cur_size += size
+        if cur_size > max_size:
+            os.remove(path)
+
+
+# Multipliers for the binary size suffixes accepted by get_size().
+SUFFIXES = {'B': 1, 'K': 1 << 10, 'M': 1 << 20, 'G': 1 << 30, 'T': 1 << 40,
+            'P': 1 << 50, 'E': 1 << 60, 'Z': 1 << 70, 'Y': 1 << 80}
+
+
+def get_size(s):
+    """Parse a size such as "512", "1.5G" or "200 M" into a byte count.
+
+    A missing suffix means bytes.  Raises ValueError for an unknown suffix
+    or for a number that float() cannot parse.
+    """
+    i = 0
+    while i < len(s) and (s[i].isdigit() or s[i] == '.'):
+        i += 1
+    num = float(s[:i])
+    suf = s[i:].strip().upper()
+    if suf != "":
+        if suf not in SUFFIXES:
+            raise ValueError("can't interpret %r" % s)
+        mult = SUFFIXES[suf]
+    else:
+        mult = 1
+    return int(num * mult)
+
+
+if __name__ == '__main__':
+    import argparse
+
+    parser = argparse.ArgumentParser(description='libgpuarray cache maintenance utility')
+    parser.add_argument('-s', '--max_size', required=True,
+                        help='Set the maximum size for pruning')
+    args = parser.parse_args()
+
+    clean(get_size(args.max_size))
From d58675a195b4e752cb47a91b331cdaf13f4131dd Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron
Date: Sat, 10 Dec 2016 09:06:20 +0100
Subject: [PATCH 217/597] Expose the definition of cache_disk().
--- src/cache.h | 10 ++++++++++ src/cache/disk.c | 6 ------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/cache.h b/src/cache.h index f2059e73cc..f2e610f3dd 100644 --- a/src/cache.h +++ b/src/cache.h @@ -4,6 +4,7 @@ #include #include #include "private_config.h" +#include "util/strb.h" typedef void *cache_key_t; typedef void *cache_value_t; @@ -13,6 +14,11 @@ typedef uint32_t (*cache_hash_fn)(cache_key_t); typedef void (*cache_freek_fn)(cache_key_t); typedef void (*cache_freev_fn)(cache_value_t); +typedef int (*kwrite_fn)(strb *res, cache_key_t key); +typedef int (*vwrite_fn)(strb *res, cache_value_t val); +typedef cache_key_t (*kread_fn)(const strb *b); +typedef cache_value_t (*vread_fn)(const strb *b); + typedef struct _cache cache; struct _cache { @@ -78,6 +84,10 @@ cache *cache_twoq(size_t hot_size, size_t warm_size, cache_eq_fn keq, cache_hash_fn khash, cache_freek_fn kfree, cache_freev_fn vfree); +cache *cache_disk(const char *dirpath, cache *mem, + kwrite_fn kwrite, vwrite_fn vwrite, + kread_fn kread, vread_fn vread); + /* API functions */ static inline int cache_add(cache *c, cache_key_t k, cache_value_t v) { return c->add(c, k, v); diff --git a/src/cache/disk.c b/src/cache/disk.c index 59e6f1ba42..2cf90d7c77 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -8,16 +8,10 @@ #include "cache.h" #include "private_config.h" -#include "util/strb.h" #include "util/skein.h" #define HEXP_LEN (128 + 2) -typedef int (*kwrite_fn)(strb *res, cache_key_t key); -typedef int (*vwrite_fn)(strb *res, cache_value_t val); -typedef cache_key_t (*kread_fn)(const strb *b); -typedef cache_value_t (*vread_fn)(const strb *b); - typedef struct _disk_cache { cache c; cache * mem; From 605d58c8d9eb740bba6c80653c09abbd842e2ba7 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 17 Jan 2017 19:27:17 -0500 Subject: [PATCH 218/597] Add the functions and setup for disk cache (nothing uses it yet). 
--- src/gpuarray_buffer_cuda.c | 99 +++++++++++++++++++++++++++++++++++--- src/private_cuda.h | 1 + src/util/strb.h | 2 +- 3 files changed, 95 insertions(+), 7 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 1c883deaa7..cf34d0288d 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -49,18 +49,72 @@ static int cuda_records(gpudata *, int, CUstream); static int detect_arch(const char *prefix, char *ret, CUresult *err); static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size); -static int strb_eq(void *_k1, void *_k2) { - strb *k1 = (strb *)_k1; - strb *k2 = (strb *)_k2; +typedef struct _kernel_key { + char bin_id[64]; + strb *src; +} kernel_key; + +static void key_free(cache_key_t _k) { + kernel_key *k = (kernel_key *)_k; + strb_free(k->src); + free(k); +} + +static int strb_eq(strb *k1, strb *k2) { return (k1->l == k2->l && memcmp(k1->s, k2->s, k1->l) == 0); } -static uint32_t strb_hash(void *_k) { - strb *k = (strb *)_k; +static uint32_t strb_hash(strb *k) { return XXH32(k->s, k->l, 42); } +static int key_eq(kernel_key *k1, kernel_key *k2) { + return (memcmp(k1->bin_id, k2->bin_id, 64) == 0 && + strb_eq(k1->src, k2->src)); +} + +static int key_hash(kernel_key *k) { + XXH32_state_t state; + XXH32_reset(&state, 42); + XXH32_update(&state, k->bin_id, 64); + XXH32_update(&state, k->src->s, k->src->l); + return XXH32_digest(&state); +} + +static int key_write(strb *res, kernel_key *k) { + strb_appendn(res, k->bin_id, 64); + strb_appendb(res, k->src); + return strb_error(res); +} + +static kernel_key *key_read(const strb *b) { + kernel_key *k; + if (b->l < 64) return NULL; + k = malloc(sizeof(*k)); + if (k == NULL) return NULL; + k->src = strb_alloc(b->l - 64); + if (k->src == NULL) { + free(k); + return NULL; + } + memcpy(k->bin_id, b->s, 64); + strb_appendn(k->src, b->s+64, b->l-64); + return k; +} + +static int kernel_write(strb *res, strb *bin) { + strb_appendb(res, bin); + return 
strb_error(res); +} + +static strb *kernel_read(const strb *b) { + strb *res = strb_alloc(b->l); + if (res != NULL) + strb_appendb(res, b); + return res; +} + static int setup_done = 0; static int major = -1; static int minor = -1; @@ -114,6 +168,8 @@ static int cuda_get_device_count(unsigned int platform, cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { cuda_context *res; + cache *mem_cache; + char *cache_path; void *p; int e; @@ -152,11 +208,38 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { goto fail_mem_stream; } } - res->kernel_cache = cache_twoq(64, 128, 64, 8, strb_eq, strb_hash, + + res->kernel_cache = cache_twoq(64, 128, 64, 8, + (cache_eq_fn)strb_eq, + (cache_hash_fn)strb_hash, (cache_freek_fn)strb_free, (cache_freev_fn)cuda_freekernel); if (res->kernel_cache == NULL) goto fail_cache; + + cache_path = getenv("GPUARRAY_CACHE_PATH"); + if (cache_path != NULL) { + mem_cache = cache_lru(64, 8, + (cache_eq_fn)key_eq, + (cache_hash_fn)key_hash, + (cache_freek_fn)key_free, + (cache_freev_fn)strb_free); + if (mem_cache == NULL) + goto fail_disk_cache; + res->disk_cache = cache_disk(cache_path, mem_cache, + (kwrite_fn)key_write, + (vwrite_fn)kernel_write, + (kread_fn)key_read, + (vread_fn)kernel_read); + if (res->disk_cache == NULL) { + cache_destroy(mem_cache); + goto fail_disk_cache; + } + } else { + fail_disk_cache: + res->disk_cache = NULL; + } + err = cuMemAllocHost(&p, 16); if (err != CUDA_SUCCESS) { goto fail_errbuf; @@ -174,6 +257,8 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { fail_end: cuMemFreeHost(p); fail_errbuf: + if (res->disk_cache) + cache_destroy(res->disk_cache); cache_destroy(res->kernel_cache); fail_cache: if (ISCLR(res->flags, GA_CTX_SINGLE_STREAM)) @@ -215,6 +300,8 @@ static void cuda_free_ctx(cuda_context *ctx) { deallocate(curr); } cache_destroy(ctx->kernel_cache); + if (ctx->disk_cache) + cache_destroy(ctx->disk_cache); if (!(ctx->flags & DONTFREE)) { cuCtxPushCurrent(ctx->ctx); diff --git 
a/src/private_cuda.h b/src/private_cuda.h index da6f60ad7a..ad9ff7f8ae 100644 --- a/src/private_cuda.h +++ b/src/private_cuda.h @@ -68,6 +68,7 @@ typedef struct _cuda_context { CUstream mem_s; gpudata *freeblocks; cache *kernel_cache; + cache *disk_cache; unsigned int enter; unsigned char major; unsigned char minor; diff --git a/src/util/strb.h b/src/util/strb.h index 267941417e..01ea7a2495 100644 --- a/src/util/strb.h +++ b/src/util/strb.h @@ -146,7 +146,7 @@ static inline void strb_appends(strb *sb, const char *s) { /* * Appends the content of another strb. */ -static inline void strb_appendb(strb *sb, strb *sb2) { +static inline void strb_appendb(strb *sb, const strb *sb2) { strb_appendn(sb, sb2->s, sb2->l); } From 6f1fd645a3c1197c3b5cfc3849ed0b5a695596b4 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 18 Jan 2017 19:49:00 -0500 Subject: [PATCH 219/597] Rework the compile logic and integrate the disk cache into the mix. This removes the gpukernel_binary support also, because it became burdensome and redundant. If the kernels are already cached at the library level, there is no need for applications to do the same. --- src/gpuarray/buffer.h | 17 +- src/gpuarray/error.h | 1 + src/gpuarray/kernel.h | 1 + src/gpuarray_buffer.c | 2 +- src/gpuarray_buffer_cuda.c | 375 +++++++++++++++++------------------ src/gpuarray_buffer_opencl.c | 29 --- src/gpuarray_error.c | 1 + src/loaders/libcuda.fn | 4 + src/loaders/libcuda.h | 11 + src/private.h | 1 - src/private_cuda.h | 2 - 11 files changed, 205 insertions(+), 239 deletions(-) diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index 800756a072..34878d503a 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -494,22 +494,9 @@ GPUARRAY_PUBLIC int gpukernel_call(gpukernel *k, unsigned int n, size_t shared, void **args); /** - * (Deprecated) Get the kernel binary. + * Get the kernel binary (REMOVED). * - * This function is deprecated and will be removed in the next release. 
- * - * This can be use to cache kernel binaries after compilation of a - * specific device. The kernel can be recreated by calling - * kernel_alloc with the binary and size and passing `GA_USE_BINARY` - * as the use flags. - * - * The returned pointer is allocated and must be freed by the caller. - * - * \param k kernel - * \param sz size of the returned binary - * \param obj pointer to the binary for the kernel. - * - * \returns GA_NO_ERROR or an error code if an error occurred. + * Always returns GA_DEPRECATED_ERROR. */ GPUARRAY_PUBLIC int gpukernel_binary(gpukernel *k, size_t *sz, void **obj); diff --git a/src/gpuarray/error.h b/src/gpuarray/error.h index af963c1531..84c852a257 100644 --- a/src/gpuarray/error.h +++ b/src/gpuarray/error.h @@ -36,6 +36,7 @@ enum ga_error { GA_COMM_ERROR, GA_XLARGE_ERROR, GA_LOAD_ERROR, + GA_DEPRECATED_ERROR, /* Add more error types if needed, but at the end */ /* Don't forget to sync with Gpu_error() */ }; diff --git a/src/gpuarray/kernel.h b/src/gpuarray/kernel.h index f88d74ffc6..da779123b9 100644 --- a/src/gpuarray/kernel.h +++ b/src/gpuarray/kernel.h @@ -107,6 +107,7 @@ GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, unsigned int n, const size_t *gs, const size_t *ls, size_t shared, void **args); +/* Deprecated and to be removed */ GPUARRAY_PUBLIC int GpuKernel_binary(const GpuKernel *k, size_t *sz, void **obj); diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index a4dfd3329b..d3226ea94e 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -187,7 +187,7 @@ int gpukernel_call(gpukernel *k, unsigned int n, const size_t *gs, } int gpukernel_binary(gpukernel *k, size_t *sz, void **obj) { - return ((partial_gpukernel *)k)->ctx->ops->kernel_binary(k, sz, obj); + return GA_DEPRECATED_ERROR; } int gpukernel_property(gpukernel *k, int prop_id, void *res) { diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index cf34d0288d..3820dff023 100644 --- a/src/gpuarray_buffer_cuda.c +++ 
b/src/gpuarray_buffer_cuda.c @@ -51,12 +51,12 @@ static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size); typedef struct _kernel_key { char bin_id[64]; - strb *src; + strb src; } kernel_key; static void key_free(cache_key_t _k) { kernel_key *k = (kernel_key *)_k; - strb_free(k->src); + strb_clear(&k->src); free(k); } @@ -71,35 +71,35 @@ static uint32_t strb_hash(strb *k) { static int key_eq(kernel_key *k1, kernel_key *k2) { return (memcmp(k1->bin_id, k2->bin_id, 64) == 0 && - strb_eq(k1->src, k2->src)); + strb_eq(&k1->src, &k2->src)); } static int key_hash(kernel_key *k) { XXH32_state_t state; XXH32_reset(&state, 42); XXH32_update(&state, k->bin_id, 64); - XXH32_update(&state, k->src->s, k->src->l); + XXH32_update(&state, k->src.s, k->src.l); return XXH32_digest(&state); } static int key_write(strb *res, kernel_key *k) { strb_appendn(res, k->bin_id, 64); - strb_appendb(res, k->src); + strb_appendb(res, &k->src); return strb_error(res); } static kernel_key *key_read(const strb *b) { kernel_key *k; if (b->l < 64) return NULL; - k = malloc(sizeof(*k)); + k = calloc(1, sizeof(*k)); if (k == NULL) return NULL; - k->src = strb_alloc(b->l - 64); - if (k->src == NULL) { + if (strb_ensure(&k->src, b->l - 64) != 0) { + strb_clear(&k->src); free(k); return NULL; } memcpy(k->bin_id, b->s, 64); - strb_appendn(k->src, b->s+64, b->l-64); + strb_appendn(&k->src, b->s+64, b->l-64); return k; } @@ -1000,22 +1000,22 @@ static int detect_arch(const char *prefix, char *ret, CUresult *err) { return GA_NO_ERROR; } -static void *call_compiler(const char *src, size_t len, const char *arch_arg, - size_t *bin_len, char **log, size_t *log_len, - int *ret) { +static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { nvrtcProgram prog; - void *buf = NULL; size_t buflen; const char *opts[4] = { "-arch", "" , "-G", "-lineinfo" }; - nvrtcResult err, err2; + nvrtcResult err; - opts[1] = arch_arg; + opts[1] = ctx->bin_id; - err = nvrtcCreateProgram(&prog, 
src, NULL, 0, NULL, NULL); - if (err != NVRTC_SUCCESS) FAIL(NULL, GA_SYS_ERROR); + strb_append0(src); + if (strb_error(src)) + return GA_MEMORY_ERROR; + err = nvrtcCreateProgram(&prog, src->s, NULL, 0, NULL, NULL); + if (err != NVRTC_SUCCESS) return GA_SYS_ERROR; err = nvrtcCompileProgram(prog, #ifdef DEBUG @@ -1024,41 +1024,115 @@ static void *call_compiler(const char *src, size_t len, const char *arch_arg, 2, #endif opts); - if (log != NULL) { - err2 = nvrtcGetProgramLogSize(prog, &buflen); - if (err2 != NVRTC_SUCCESS) goto end2; - buf = malloc(buflen); - if (buf == NULL) goto end2; - err2 = nvrtcGetProgramLog(prog, (char *)buf); - if (err2 != NVRTC_SUCCESS) goto end2; - if (log_len != NULL) *log_len = buflen; - *log = (char *)buf; - buf = NULL; + if (nvrtcGetProgramLogSize(prog, &buflen) == NVRTC_SUCCESS) { + strb_appends(log, "NVRTC compile log::\n"); + if (strb_ensure(log, buflen) == 0) + if (nvrtcGetProgramLog(prog, log->s+log->l) == NVRTC_SUCCESS) + log->l += buflen - 1; + strb_appendc(log, '\n'); } -end2: - if (err != NVRTC_SUCCESS) goto end; err = nvrtcGetPTXSize(prog, &buflen); if (err != NVRTC_SUCCESS) goto end; - buf = malloc(buflen); - if (buf == NULL) { - nvrtcDestroyProgram(&prog); - FAIL(NULL, GA_MEMORY_ERROR); + if (strb_ensure(ptx, buflen) == 0) + err = nvrtcGetPTX(prog, ptx->s+ptx->l); + +end: + nvrtcDestroyProgram(&prog); + if (err != NVRTC_SUCCESS) + return GA_SYS_ERROR; + return GA_NO_ERROR; +} + +static int make_bin(cuda_context *ctx, const strb *ptx, strb *bin, strb *log) { + char info_log[2048]; + char error_log[2048]; + void *out; + size_t out_size; + CUlinkState st; + CUjit_option cujit_opts[] = { + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + CU_JIT_INFO_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_LOG_VERBOSE, + CU_JIT_GENERATE_DEBUG_INFO, + CU_JIT_GENERATE_LINE_INFO, + }; + void *cujit_opt_vals[] = { + (void *)sizeof(info_log), info_log, + (void *)sizeof(error_log), error_log, +#ifdef DEBUG + (void *)1, 
(void *)1, (void *)1 +#else + (void *)0, (void *)0, (void *)0 +#endif + }; + + ctx->err = cuLinkCreate(sizeof(cujit_opts)/sizeof(cujit_opts[0]), + cujit_opts, cujit_opt_vals, &st); + if (ctx->err != CUDA_SUCCESS) + return GA_IMPL_ERROR; + ctx->err = cuLinkAddData(st, CU_JIT_INPUT_PTX, ptx->s, ptx->l, + "kernel code", 0, NULL, NULL); + if (ctx->err != CUDA_SUCCESS) { + cuLinkDestroy(st); + return GA_IMPL_ERROR; + } + ctx->err = cuLinkComplete(st, &out, &out_size); + if (ctx->err != CUDA_SUCCESS) { + cuLinkDestroy(st); + return GA_IMPL_ERROR; } + strb_appendn(bin, out, out_size); + cuLinkDestroy(st); + strb_appends(log, "Link info log::\n"); + strb_appends(log, info_log); + strb_appends(log, "\nLink error log::\n"); + strb_appends(log, error_log); + strb_appendc(log, '\n'); + return GA_NO_ERROR; +} - err = nvrtcGetPTX(prog, (char *)buf); - if (err != NVRTC_SUCCESS) goto end; +static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { + strb ptx = STRB_STATIC_INIT; + strb *cbin; + kernel_key k; + kernel_key *pk; + int err; - *bin_len = buflen; + memcpy(k.bin_id, ctx->bin_id, 64); + memcpy(&k.src, src, sizeof(strb)); -end: - nvrtcDestroyProgram(&prog); - if (err != NVRTC_SUCCESS) { - free(buf); - FAIL(NULL, GA_SYS_ERROR); + // Look up the binary in the disk cache + cbin = cache_get(ctx->disk_cache, &k); + if (cbin != NULL) { + strb_appendb(bin, cbin); + return GA_NO_ERROR; + } + + err = call_compiler(ctx, src, &ptx, log); + if (err != GA_NO_ERROR) return err; + err = make_bin(ctx, &ptx, bin, log); + if (err != GA_NO_ERROR) return err; + pk = memdup(&k, sizeof(k)); + if (pk == NULL) + return err; + cbin = strb_alloc(bin->l); + if (cbin == NULL) { + free(pk); + return err; } - return buf; + strb_appendb(cbin, bin); + if (strb_error(cbin)) { + free(pk); + strb_free(cbin); + return err; + } + cache_add(ctx->disk_cache, pk, cbin); + + return err; } static void _cuda_freekernel(gpukernel *k) { @@ -1072,7 +1146,6 @@ static void _cuda_freekernel(gpukernel *k) { } 
CLEAR(k); free(k->args); - free(k->bin); free(k->types); free(k); } @@ -1084,45 +1157,21 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, const int *types, int flags, int *ret, char **err_str) { cuda_context *ctx = (cuda_context *)c; - strb sb = STRB_STATIC_INIT; - strb *psb; - char *bin, *log = NULL; + strb src = STRB_STATIC_INIT; + strb bin = STRB_STATIC_INIT; + strb log = STRB_STATIC_INIT; + strb *psrc; gpukernel *res; - size_t bin_len = 0, log_len = 0; CUdevice dev; unsigned int i; int major, minor; - strb debug_msg = STRB_STATIC_INIT; - - // options for cuModuleLoadDataEx - const size_t cujit_log_size = 4096; - char *cujit_info_log = NULL; - unsigned int num_cujit_opts = 4; - CUjit_option cujit_opts[] = { - CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, - CU_JIT_INFO_LOG_BUFFER, - CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, - CU_JIT_ERROR_LOG_BUFFER - }; - void *cujit_opt_vals[] = { - (void*)(size_t)cujit_log_size, NULL, - (void*)(size_t)cujit_log_size, NULL, - }; + int err; if (count == 0) FAIL(NULL, GA_VALUE_ERROR); if (flags & GA_USE_OPENCL) FAIL(NULL, GA_DEVSUP_ERROR); - if (flags & GA_USE_BINARY) { - // GA_USE_BINARY is exclusive - if (flags & ~GA_USE_BINARY) - FAIL(NULL, GA_INVALID_ERROR); - // We need the length for binary data and there is only one blob. 
- if (count != 1 || lengths == NULL || lengths[0] == 0) - FAIL(NULL, GA_VALUE_ERROR); - } - cuda_enter(ctx); ctx->err = cuCtxGetDevice(&dev); @@ -1138,6 +1187,7 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, // GA_USE_CLUDA is done later // GA_USE_SMALL will always work + // GA_USE_HALF should always work if (flags & GA_USE_DOUBLE) { if (major < 1 || (major == 1 && minor < 3)) { cuda_exit(ctx); @@ -1149,90 +1199,73 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, cuda_exit(ctx); FAIL(NULL, GA_DEVSUP_ERROR); } - // GA_USE_HALF should always work - if (flags & GA_USE_BINARY) { - bin = memdup(strings[0], lengths[0]); - bin_len = lengths[0]; - if (bin == NULL) { - cuda_exit(ctx); - FAIL(NULL, GA_MEMORY_ERROR); - } + if (flags & GA_USE_CLUDA) { + strb_appends(&src, CUDA_PREAMBLE); + } + + if (lengths == NULL) { + for (i = 0; i < count; i++) + strb_appends(&src, strings[i]); } else { - if (flags & GA_USE_CLUDA) { - strb_appends(&sb, CUDA_PREAMBLE); + for (i = 0; i < count; i++) { + if (lengths[i] == 0) + strb_appends(&src, strings[i]); + else + strb_appendn(&src, strings[i], lengths[i]); } + } - if (lengths == NULL) { - for (i = 0; i < count; i++) - strb_appends(&sb, strings[i]); - } else { - for (i = 0; i < count; i++) { - if (lengths[i] == 0) - strb_appends(&sb, strings[i]); - else - strb_appendn(&sb, strings[i], lengths[i]); - } - } + strb_append0(&src); - strb_append0(&sb); + if (strb_error(&src)) { + strb_clear(&src); + cuda_exit(ctx); + FAIL(NULL, GA_MEMORY_ERROR); + } - if (strb_error(&sb)) { - strb_clear(&sb); - cuda_exit(ctx); - FAIL(NULL, GA_MEMORY_ERROR); - } + res = (gpukernel *)cache_get(ctx->kernel_cache, &src); + if (res != NULL) { + res->refcnt++; + strb_clear(&src); + return res; + } - res = (gpukernel *)cache_get(ctx->kernel_cache, &sb); - if (res != NULL) { - res->refcnt++; - strb_clear(&sb); - return res; - } - bin = call_compiler(sb.s, sb.l, ctx->bin_id, &bin_len, - &log, &log_len, ret); - if (bin == 
NULL) { - if (err_str != NULL) { - - // We're substituting debug_msg for a string with this first line: - strb_appends(&debug_msg, "CUDA kernel compile failure ::\n"); - - /* Delete the final NUL */ - sb.l--; - gpukernel_source_with_line_numbers(1, (const char **)&sb.s, - &sb.l, &debug_msg); - - if (log != NULL) { - strb_appends(&debug_msg, "\nCompiler log:\n"); - strb_appendn(&debug_msg, log, log_len); - free(log); - } - *err_str = strb_cstr(&debug_msg); - // *err_str will be free()d by the caller (see docs in kernel.h) - } - strb_clear(&sb); - cuda_exit(ctx); - FAIL(NULL, GA_IMPL_ERROR); + err = compile(ctx, &src, &bin, &log); + if (err != GA_NO_ERROR || strb_error(&bin)) { + if (err_str != NULL) { + strb debug_msg = STRB_STATIC_INIT; + strb_appends(&debug_msg, "CUDA kernel compile failure ::\n"); + src.l--; + gpukernel_source_with_line_numbers(1, (const char **)&src.s, + &src.l, &debug_msg); + strb_appends(&debug_msg, "\nCompile log:\n"); + strb_appendb(&debug_msg, &log); + *err_str = strb_cstr(&debug_msg); } + strb_clear(&src); + strb_clear(&bin); + strb_clear(&log); + cuda_exit(ctx); + FAIL(NULL, err); } + strb_clear(&log); res = calloc(1, sizeof(*res)); if (res == NULL) { - free(bin); - strb_clear(&sb); + strb_clear(&src); + strb_clear(&bin); cuda_exit(ctx); FAIL(NULL, GA_SYS_ERROR); } - res->bin_sz = bin_len; - res->bin = bin; - res->refcnt = 1; res->argcount = argcount; res->types = calloc(argcount, sizeof(int)); if (res->types == NULL) { _cuda_freekernel(res); - strb_clear(&sb); + strb_clear(&src); + strb_clear(&bin); cuda_exit(ctx); FAIL(NULL, GA_MEMORY_ERROR); } @@ -1240,55 +1273,26 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, res->args = calloc(argcount, sizeof(void *)); if (res->args == NULL) { _cuda_freekernel(res); - strb_clear(&sb); + strb_clear(&src); + strb_clear(&bin); cuda_exit(ctx); FAIL(NULL, GA_MEMORY_ERROR); } - // for both info/err log - cujit_info_log = (char*)malloc(2*cujit_log_size*sizeof(char)); - 
if(cujit_info_log == NULL) { - _cuda_freekernel(res); - strb_clear(&sb); - cuda_exit(ctx); - FAIL(NULL, GA_MEMORY_ERROR); - } - cujit_info_log[0] = 0; - cujit_info_log[cujit_log_size] = 0; - cujit_opt_vals[1] = (void*)cujit_info_log; - cujit_opt_vals[3] = (void*)(cujit_info_log+cujit_log_size); - - ctx->err = cuModuleLoadDataEx( - &res->m, bin, - num_cujit_opts, cujit_opts, (void**)cujit_opt_vals); - + ctx->err = cuModuleLoadData(&res->m, bin.s); if (ctx->err != CUDA_SUCCESS) { - if (err_str != NULL) { - strb_appends(&debug_msg, "CUDA kernel link failure::\n"); - if (cujit_info_log[0]) { - strb_appends(&debug_msg, "\nLinker msg:\n"); - strb_appends(&debug_msg, cujit_info_log); - } - if (cujit_info_log[cujit_log_size]) { - strb_appends(&debug_msg, "\nLinker error log:\n"); - strb_appends(&debug_msg, cujit_info_log+cujit_log_size); - } - strb_append0(&debug_msg); - *err_str = strb_cstr(&debug_msg); - } - free(cujit_info_log); _cuda_freekernel(res); - strb_clear(&sb); + strb_clear(&src); + strb_clear(&bin); cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } - - free(cujit_info_log); + strb_clear(&bin); ctx->err = cuModuleGetFunction(&res->k, res->m, fname); if (ctx->err != CUDA_SUCCESS) { _cuda_freekernel(res); - strb_clear(&sb); + strb_clear(&src); cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } @@ -1297,16 +1301,16 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, ctx->refcnt++; cuda_exit(ctx); TAG_KER(res); - psb = memdup(&sb, sizeof(strb)); - if (psb == NULL) { - cuda_freekernel(res); - strb_clear(&sb); - FAIL(NULL, GA_MEMORY_ERROR); + psrc = memdup(&src, sizeof(strb)); + if (psrc != NULL) { + /* One of the refs is for the cache */ + res->refcnt++; + /* If this fails, it will free the key and remove a ref from the + kernel. */ + cache_add(ctx->kernel_cache, psrc, res); + } else { + strb_clear(&src); } - /* One of the refs is for the cache */ - res->refcnt++; - /* If this fails, it will free the key and remove a ref from the kernel. 
*/ - cache_add(ctx->kernel_cache, psb, res); return res; } @@ -1381,16 +1385,6 @@ static int cuda_callkernel(gpukernel *k, unsigned int n, return GA_NO_ERROR; } -static int cuda_kernelbin(gpukernel *k, size_t *sz, void **obj) { - void *res = malloc(k->bin_sz); - if (res == NULL) - return GA_MEMORY_ERROR; - memcpy(res, k->bin, k->bin_sz); - *sz = k->bin_sz; - *obj = res; - return GA_NO_ERROR; -} - static int cuda_sync(gpudata *b) { cuda_context *ctx = (cuda_context *)b->ctx; int err = GA_NO_ERROR; @@ -1793,7 +1787,6 @@ const gpuarray_buffer_ops cuda_ops = {cuda_get_platform_count, cuda_freekernel, cuda_kernelsetarg, cuda_callkernel, - cuda_kernelbin, cuda_sync, cuda_transfer, cuda_property, diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 84bcb6584d..89b56f80a7 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -1076,34 +1076,6 @@ static int cl_callkernel(gpukernel *k, unsigned int n, return GA_NO_ERROR; } -static int cl_kernelbin(gpukernel *k, size_t *sz, void **obj) { - cl_ctx *ctx = k->ctx; - cl_program p; - size_t rsz; - void *res; - - ASSERT_KER(k); - ASSERT_CTX(ctx); - - ctx->err = clGetKernelInfo(k->k, CL_KERNEL_PROGRAM, sizeof(p), &p, NULL); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; - ctx->err = clGetProgramInfo(p, CL_PROGRAM_BINARY_SIZES, sizeof(rsz), &rsz, NULL); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; - res = malloc(rsz); - if (res == NULL) - return GA_MEMORY_ERROR; - ctx->err = clGetProgramInfo(p, CL_PROGRAM_BINARIES, sizeof(res), &res, NULL); - if (ctx->err != CL_SUCCESS) { - free(res); - return GA_IMPL_ERROR; - } - *sz = rsz; - *obj = res; - return GA_NO_ERROR; -} - static int cl_sync(gpudata *b) { cl_ctx *ctx = (cl_ctx *)b->ctx; @@ -1465,7 +1437,6 @@ const gpuarray_buffer_ops opencl_ops = {cl_get_platform_count, cl_releasekernel, cl_setkernelarg, cl_callkernel, - cl_kernelbin, cl_sync, cl_transfer, cl_property, diff --git a/src/gpuarray_error.c b/src/gpuarray_error.c 
index b7d5011f5b..ddebd3e9dc 100644 --- a/src/gpuarray_error.c +++ b/src/gpuarray_error.c @@ -25,6 +25,7 @@ const char *gpuarray_error_str(int err) { case GA_COMM_ERROR: return "Error in collectives call"; case GA_XLARGE_ERROR: return "Input size too large for operation"; case GA_LOAD_ERROR: return "Error loading library"; + case GA_DEPRECATED_ERROR: return "Deprecated (removed) functionality"; default: return "Unknown GA error"; } } diff --git a/src/loaders/libcuda.fn b/src/loaders/libcuda.fn index 487706f4f3..5bfc890fba 100644 --- a/src/loaders/libcuda.fn +++ b/src/loaders/libcuda.fn @@ -17,6 +17,10 @@ DEF_PROC(cuCtxGetDevice, (CUdevice *device)); DEF_PROC_V2(cuCtxPushCurrent, (CUcontext ctx)); DEF_PROC_V2(cuCtxPopCurrent, (CUcontext *pctx)); +DEF_PROC(cuLinkCreate, (unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut)); +DEF_PROC(cuLinkAddData, (CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues)); +DEF_PROC(cuLinkComplete, (CUlinkState state, void **cubinOut, size_t *sizeOut)); +DEF_PROC(cuLinkDestroy, (CUlinkState state)); DEF_PROC(cuModuleLoadData, (CUmodule *module, const void *image)); DEF_PROC(cuModuleLoadDataEx, (CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues)); DEF_PROC(cuModuleUnload, (CUmodule hmod)); diff --git a/src/loaders/libcuda.h b/src/loaders/libcuda.h index e62f8b85df..3a6bf35a8a 100644 --- a/src/loaders/libcuda.h +++ b/src/loaders/libcuda.h @@ -23,6 +23,7 @@ typedef struct CUmod_st *CUmodule; typedef struct CUfunc_st *CUfunction; typedef struct CUevent_st *CUevent; typedef struct CUstream_st *CUstream; +typedef struct CUlinkState_st *CUlinkState; typedef enum CUdevice_attribute_enum CUdevice_attribute; typedef enum CUfunction_attribute_enum CUfunction_attribute; @@ -30,6 +31,7 @@ typedef enum CUevent_flags_enum CUevent_flags; typedef enum 
CUctx_flags_enum CUctx_flags; typedef enum CUipcMem_flags_enum CUipcMem_flags; typedef enum CUjit_option_enum CUjit_option; +typedef enum CUjitInputType_enum CUjitInputType; #define CU_IPC_HANDLE_SIZE 64 @@ -206,4 +208,13 @@ enum CUjit_option_enum { CU_JIT_NUM_OPTIONS }; +enum CUjitInputType_enum { + CU_JIT_INPUT_CUBIN = 0, + CU_JIT_INPUT_PTX, + CU_JIT_INPUT_FATBINARY, + CU_JIT_INPUT_OBJECT, + CU_JIT_INPUT_LIBRARY, + CU_JIT_NUM_INPUT_TYPES +}; + #endif diff --git a/src/private.h b/src/private.h index 7405cf8995..abe9783de7 100644 --- a/src/private.h +++ b/src/private.h @@ -100,7 +100,6 @@ struct _gpuarray_buffer_ops { const size_t *gs, const size_t *ls, size_t shared, void **args); - int (*kernel_binary)(gpukernel *k, size_t *sz, void **obj); int (*buffer_sync)(gpudata *b); int (*buffer_transfer)(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, size_t sz); diff --git a/src/private_cuda.h b/src/private_cuda.h index ad9ff7f8ae..a0b4557977 100644 --- a/src/private_cuda.h +++ b/src/private_cuda.h @@ -137,8 +137,6 @@ struct _gpukernel { CUmodule m; CUfunction k; void **args; - size_t bin_sz; - void *bin; int *types; unsigned int argcount; unsigned int refcnt; From 0cd407243475e4a13821c07ea216136fce270256 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 25 Jan 2017 15:54:24 -0500 Subject: [PATCH 220/597] Fix some problems with the disk cache. 
--- src/gpuarray_buffer_cuda.c | 61 +++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 3820dff023..7b1f540c26 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1011,9 +1011,6 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { opts[1] = ctx->bin_id; - strb_append0(src); - if (strb_error(src)) - return GA_MEMORY_ERROR; err = nvrtcCreateProgram(&prog, src->s, NULL, 0, NULL, NULL); if (err != NVRTC_SUCCESS) return GA_SYS_ERROR; @@ -1035,8 +1032,10 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { err = nvrtcGetPTXSize(prog, &buflen); if (err != NVRTC_SUCCESS) goto end; - if (strb_ensure(ptx, buflen) == 0) + if (strb_ensure(ptx, buflen) == 0) { err = nvrtcGetPTX(prog, ptx->s+ptx->l); + if (err == NVRTC_SUCCESS) ptx->l = buflen; + } end: nvrtcDestroyProgram(&prog); @@ -1069,6 +1068,7 @@ static int make_bin(cuda_context *ctx, const strb *ptx, strb *bin, strb *log) { (void *)0, (void *)0, (void *)0 #endif }; + int err = GA_NO_ERROR; ctx->err = cuLinkCreate(sizeof(cujit_opts)/sizeof(cujit_opts[0]), cujit_opts, cujit_opt_vals, &st); @@ -1077,22 +1077,23 @@ static int make_bin(cuda_context *ctx, const strb *ptx, strb *bin, strb *log) { ctx->err = cuLinkAddData(st, CU_JIT_INPUT_PTX, ptx->s, ptx->l, "kernel code", 0, NULL, NULL); if (ctx->err != CUDA_SUCCESS) { - cuLinkDestroy(st); - return GA_IMPL_ERROR; + err = GA_IMPL_ERROR; + goto out; } ctx->err = cuLinkComplete(st, &out, &out_size); if (ctx->err != CUDA_SUCCESS) { - cuLinkDestroy(st); - return GA_IMPL_ERROR; + err = GA_IMPL_ERROR; + goto out; } strb_appendn(bin, out, out_size); +out: cuLinkDestroy(st); strb_appends(log, "Link info log::\n"); strb_appends(log, info_log); strb_appends(log, "\nLink error log::\n"); strb_appends(log, error_log); strb_appendc(log, '\n'); - return GA_NO_ERROR; + return err; } static int 
compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { @@ -1106,33 +1107,37 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { memcpy(&k.src, src, sizeof(strb)); // Look up the binary in the disk cache - cbin = cache_get(ctx->disk_cache, &k); - if (cbin != NULL) { - strb_appendb(bin, cbin); - return GA_NO_ERROR; + if (ctx->disk_cache) { + cbin = cache_get(ctx->disk_cache, &k); + if (cbin != NULL) { + strb_appendb(bin, cbin); + return GA_NO_ERROR; + } } err = call_compiler(ctx, src, &ptx, log); if (err != GA_NO_ERROR) return err; err = make_bin(ctx, &ptx, bin, log); if (err != GA_NO_ERROR) return err; - pk = memdup(&k, sizeof(k)); - if (pk == NULL) - return err; - cbin = strb_alloc(bin->l); - if (cbin == NULL) { - free(pk); - return err; - } - strb_appendb(cbin, bin); - if (strb_error(cbin)) { - free(pk); - strb_free(cbin); - return err; + if (ctx->disk_cache) { + pk = memdup(&k, sizeof(k)); + if (pk == NULL) + return GA_NO_ERROR; + cbin = strb_alloc(bin->l); + if (cbin == NULL) { + free(pk); + return GA_NO_ERROR; + } + strb_appendb(cbin, bin); + if (strb_error(cbin)) { + free(pk); + strb_free(cbin); + return GA_NO_ERROR; + } + cache_add(ctx->disk_cache, pk, cbin); } - cache_add(ctx->disk_cache, pk, cbin); - return err; + return GA_NO_ERROR; } static void _cuda_freekernel(gpukernel *k) { From f1e3c2701727221488be47bbce06e0569e8c1e1d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 25 Jan 2017 16:49:25 -0500 Subject: [PATCH 221/597] Fix some directory creation problems in the disk_cache code. 
--- src/cache/disk.c | 51 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 8 deletions(-) diff --git a/src/cache/disk.c b/src/cache/disk.c index 2cf90d7c77..0a7a5ab18d 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -81,6 +81,27 @@ static int mkstempat(int dfd, char *template) { return -1; } +/* Ensure that a path exists by creating all intermediate directories */ +static int ensureat(int dfd, char *path) { + char *curp; + char *pos; + + curp = path; + + while ((pos = strchr(curp, '/')) != NULL) { + *pos = '\0'; + if (mkdirat(dfd, path, 0777)) { + if (errno != EEXIST) return -1; + /* For now we suppose that EEXIST means that the directory is + * already there.*/ + } + curp = pos + 1; + *pos = '/'; + } + + return 0; +} + static int key_path(disk_cache *c, const cache_key_t key, char *out) { strb kb = STRB_STATIC_INIT; unsigned char hash[64]; @@ -88,8 +109,8 @@ static int key_path(disk_cache *c, const cache_key_t key, char *out) { if (c->kwrite(&kb, key)) return -1; if (Skein_512((unsigned char *)kb.s, kb.l, hash)) return -1; - if (snprintf(out, 6, "%02x%02x/%02x%02x", - hash[0], hash[1], hash[2], hash[3]) != 5) + if (snprintf(out, 10, "%02x%02x/%02x%02x", + hash[0], hash[1], hash[2], hash[3]) != 9) return -1; for (i = 4; i < 64; i += 4) { if (snprintf(out+(i * 2 + 1), 9, "%02x%02x%02x%02x", @@ -109,7 +130,9 @@ static int write_entry(disk_cache *c, const cache_key_t k, if (key_path(c, k, hexp)) return -1; - if (!strb_ensure(&b, 16)) return -1; + if (ensureat(c->dirfd, hexp)) return -1; + + if (strb_ensure(&b, 16)) return -1; b.l = 16; c->kwrite(&b, k); kl = b.l - 16; @@ -135,7 +158,7 @@ static int write_entry(disk_cache *c, const cache_key_t k, unlinkat(c->dirfd, tmp_path, 0); return -1; } - + if (renameat(c->dirfd, tmp_path, c->dirfd, hexp)) { unlinkat(c->dirfd, tmp_path, 0); return -1; @@ -227,7 +250,7 @@ static int disk_add(cache *_c, cache_key_t k, cache_value_t v) { static int disk_del(cache *_c, const cache_key_t key) { 
disk_cache *c = (disk_cache *)_c; char hexp[HEXP_LEN] = {0}; - + cache_del(c->mem, key); key_path(c, key, hexp); @@ -262,17 +285,29 @@ cache *cache_disk(const char *dirpath, cache *mem, kread_fn kread, vread_fn vread) { struct stat st; disk_cache *res; + char *dirp = strdup(dirpath); + + if (dirp == NULL) return NULL; + + if (ensureat(AT_FDCWD, dirp) != 0) { + free(dirp); + return NULL; + } + free(dirp); + + mkdir(dirpath, 0777); /* This may fail, but it's ok */ - mkdir(dirpath, 0777); /* This may fail, but we don't care */ if (lstat(dirpath, &st) != 0) return NULL; + if (!(st.st_mode & S_IFDIR)) return NULL; res = calloc(sizeof(*res), 1); - if (res == NULL) return NULL; + if (res == NULL) + return NULL; - res->dirfd = open(dirpath, O_RDWR|O_CLOEXEC); + res->dirfd = open(dirpath, O_RDONLY|O_CLOEXEC); if (res->dirfd == -1) { free(res); return NULL; From 24dd80d621be81a6dbe2be976b38bf4f0b6f20eb Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 30 Jan 2017 13:38:24 -0500 Subject: [PATCH 222/597] Fix the cache cleanup script. 
--- bin/gpuarray-cache | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/gpuarray-cache b/bin/gpuarray-cache index 528e3eb9a5..04b7e8e68a 100644 --- a/bin/gpuarray-cache +++ b/bin/gpuarray-cache @@ -19,7 +19,7 @@ def clean(max_size): os.remove(path) -SUFFIXES = {'B': 1, 'K': 1 << 10, 'M': 1 < 20, 'G': 1 << 30, 'T': 1 << 40, +SUFFIXES = {'B': 1, 'K': 1 << 10, 'M': 1 << 20, 'G': 1 << 30, 'T': 1 << 40, 'P': 1 << 50, 'E': 1 << 60, 'Z': 1 << 70, 'Y': 1 << 80} @@ -44,7 +44,7 @@ if __name__ == '__main__': import argparse parser = argparse.ArgumentParser(description='libgpuarray cache maintenance utility') - parser.add_argument('-s', '--max_size', help='Set the maximum size for pruning') + parser.add_argument('-s', '--max_size', help='Set the maximum size for pruning (in bytes with suffixes: K, M, G, ...)') args = parser.parse_args() clean(get_size(args.max_size)) From 5f020fc1f8aa16bd9e1b084c8a397e7013847296 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 10 Feb 2017 12:46:24 -0500 Subject: [PATCH 223/597] Fix a type punning issue in the Skein code. 
--- src/util/skein.c | 16 ++++++++-------- src/util/skein.h | 5 ++++- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/util/skein.c b/src/util/skein.c index 51362e5efb..38912e8320 100644 --- a/src/util/skein.c +++ b/src/util/skein.c @@ -247,13 +247,13 @@ int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ if (n) { Skein_assert(n < msgByteCnt); /* check on our logic here */ - memcpy(&ctx->b[ctx->h.bCnt],msg,n); + memcpy(&ctx->bb.b[ctx->h.bCnt],msg,n); msgByteCnt -= n; msg += n; ctx->h.bCnt += n; } Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES); - Skein_512_Process_Block(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES); + Skein_512_Process_Block(ctx,ctx->bb.b,1,SKEIN_512_BLOCK_BYTES); ctx->h.bCnt = 0; } /* now process any remaining full blocks, directly from input message data */ @@ -269,7 +269,7 @@ int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, /* copy any remaining source message data bytes into b[] */ if (msgByteCnt) { Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES); - memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + memcpy(&ctx->bb.b[ctx->h.bCnt],msg,msgByteCnt); ctx->h.bCnt += msgByteCnt; } @@ -285,20 +285,20 @@ int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal) { ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) /* zero pad b[] if necessary */ - memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); + memset(&ctx->bb.b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); - Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + Skein_512_Process_Block(ctx,ctx->bb.b,1,ctx->h.bCnt); /* process the final block */ /* now output the result */ byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ /* run Threefish in "counter mode" to generate output */ - memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the 
counter */ + memset(ctx->bb.b,0,sizeof(ctx->bb.b)); /* zero out b[], so it can hold the counter */ memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) { - ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ + ctx->bb.l[0] = Skein_Swap64((u64b_t) i); /* build the counter block */ Skein_Start_New_Type(ctx,OUT_FINAL); - Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ + Skein_512_Process_Block(ctx,ctx->bb.b,1,sizeof(u64b_t)); /* run "counter mode" */ n = byteCnt - i*SKEIN_512_BLOCK_BYTES; /* number of output bytes left to go */ if (n >= SKEIN_512_BLOCK_BYTES) n = SKEIN_512_BLOCK_BYTES; diff --git a/src/util/skein.h b/src/util/skein.h index 89d7ebf209..b505a51801 100644 --- a/src/util/skein.h +++ b/src/util/skein.h @@ -56,7 +56,10 @@ typedef struct { typedef struct { /* 512-bit Skein hash context structure */ Skein_Ctxt_Hdr_t h; /* common header context variables */ u64b_t X[SKEIN_512_STATE_WORDS]; /* chaining variables */ - u08b_t b[SKEIN_512_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ + union Skein_512_Ctxt_b_u { + u08b_t b[SKEIN_512_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ + u64b_t l[SKEIN_512_BLOCK_BYTES/8]; + } bb; } Skein_512_Ctxt_t; /* Skein APIs for (incremental) "straight hashing" */ From 4d014a8b255cc85324211bb8354fc44436db1aa0 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 1 Mar 2017 15:24:06 -0500 Subject: [PATCH 224/597] Switch away from ...at() functions since those don't exist on windows. Also try to make the code work for windows. 
--- CMakeLists.txt | 2 +- make.bat | 6 ++ src/cache/disk.c | 223 ++++++++++++++++++++++++++++++++--------------- 3 files changed, 160 insertions(+), 71 deletions(-) create mode 100755 make.bat diff --git a/CMakeLists.txt b/CMakeLists.txt index 382c064e9c..09f7f1fd4c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/CMakeModules/") # -Wall is unbelieveably noisy with Visual Studio: # http://stackoverflow.com/q/4001736/3257826 if(MSVC) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W4") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W4 -D_CRT_SECURE_NO_WARNINGS") else() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") endif() diff --git a/make.bat b/make.bat new file mode 100755 index 0000000000..1ea7aa55cd --- /dev/null +++ b/make.bat @@ -0,0 +1,6 @@ +del bld +mkdir bld +cd bld +cmake .. -G "NMake Makefiles" +cmake --build . --config Release +cd .. diff --git a/src/cache/disk.c b/src/cache/disk.c index 0a7a5ab18d..2f4fc77efe 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -1,13 +1,57 @@ -#include #include #include -#include #include + +#include "private_config.h" + +#ifdef _WIN32 +#define PATH_MAX 255 + +#define WIN32_LEAN_AND_MEAN +#include + +#include +#include + +struct timezone; + +struct timeval { + long tv_sec; + long tv_usec; +} timeval; + +static int gettimeofday(struct timeval *tp, struct timezone *tzp) { + /* + * Note: some broken versions only have 8 trailing zero's, the + * correct epoch has 9 trailing zero's This magic number is the + * number of 100 nanosecond intervals since January 1, 1601 (UTC) + * until 00:00:00 January 1, 1970 + */ + static const uint64_t EPOCH = ((uint64_t)116444736000000000ULL); + + SYSTEMTIME system_time; + FILETIME file_time; + uint64_t time; + + GetSystemTime(&system_time); + SystemTimeToFileTime(&system_time, &file_time); + time = ((uint64_t)file_time.dwLowDateTime); + time += ((uint64_t)file_time.dwHighDateTime) << 32; + + tp->tv_sec = (long)((time - EPOCH) / 
10000000L); + tp->tv_usec = (long)(system_time.wMilliseconds * 1000); + return 0; +} + +#else +#define PATH_MAX 1024 +#include #include +#endif + #include #include "cache.h" -#include "private_config.h" #include "util/skein.h" #define HEXP_LEN (128 + 2) @@ -19,7 +63,7 @@ typedef struct _disk_cache { vwrite_fn vwrite; kread_fn kread; vread_fn vread; - int dirfd; + const char *dirp; } disk_cache; @@ -31,72 +75,112 @@ static unsigned long long ntohull(const char *in) { } static void htonull(unsigned long long in, char *out) { - out[0] = in >> 56; - out[1] = in >> 48; - out[2] = in >> 40; - out[3] = in >> 32; - out[4] = in >> 24; - out[5] = in >> 16; - out[6] = in >> 8; - out[7] = in; + out[0] = (char)(in >> 56); + out[1] = (char)(in >> 48); + out[2] = (char)(in >> 40); + out[3] = (char)(in >> 32); + out[4] = (char)(in >> 24); + out[5] = (char)(in >> 16); + out[6] = (char)(in >> 8); + out[7] = (char)(in); } -static int mkstempat(int dfd, char *template) { - static const char letters[] = - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; - size_t length; - char *XXXXXX; - struct timeval tv; - unsigned long long randnum, working; - int i, tries, fd; - - length = strlen(template); - if (length < 6) { - errno = EINVAL; +static int catp(char *path, const char *dirp, const char *rpath) { + if (strlcpy(path, dirp, PATH_MAX) >= PATH_MAX) { + errno = ENAMETOOLONG; return -1; } - XXXXXX = template + length - 6; - if (strcmp(XXXXXX, "XXXXXX") != 0) { - errno = EINVAL; + if (strlcat(path, rpath, PATH_MAX) >= PATH_MAX) { + errno = ENAMETOOLONG; return -1; } + return 0; +} - /* This is kind of crappy, but the point is to not step on each - other's feet */ - gettimeofday(&tv, NULL); - randnum = ((unsigned long long) tv.tv_usec << 16) ^ tv.tv_sec ^ getpid(); +static int openp(const char *dirp, const char *rpath, int flags, int mode) { + char path[PATH_MAX]; - for (tries = 0; tries < TMP_MAX; tries++) { - for (working = randnum, i = 0; i < 6; i++) { - XXXXXX[i] = 
letters[working % 62]; - working /= 62; - } - fd = openat(dfd, template, O_RDWR | O_CREAT | O_EXCL, 0600); - if (fd >= 0 || (errno != EEXIST && errno != EISDIR)) - return fd; + if (catp(path, dirp, rpath)) + return -1; - randnum += (tv.tv_usec >> 10) & 0xfff; - } - errno = EEXIST; - return -1; + return open(path, flags, mode); } -/* Ensure that a path exists by creating all intermediate directories */ -static int ensureat(int dfd, char *path) { - char *curp; - char *pos; +static int mkstempp(const char *dirp, char *template) { + char path[PATH_MAX]; + int res; + + if (catp(path, dirp, template)) + return -1; + + res = mkstemp(path); - curp = path; + /* We need to copy the result path back */ + if (res == 0) + memcpy(template, &path[strlen(dirp)], strlen(template)); + + return res; +} + +static int unlinkp(const char *dirp, const char *rpath) { + char path[PATH_MAX]; + + if (catp(path, dirp, rpath)) + return -1; + + return unlink(path); +} + +static int renamep(const char *dirp, const char *ropath, const char *rnpath) { + char opath[PATH_MAX]; + char npath[PATH_MAX]; + + if (catp(opath, dirp, ropath)) + return -1; + if (catp(npath, dirp, rnpath)) + return -1; - while ((pos = strchr(curp, '/')) != NULL) { - *pos = '\0'; - if (mkdirat(dfd, path, 0777)) { + return rename(opath, npath); +} + +/* Ensure that a path exists by creating all intermediate directories */ +int ensurep(const char *dirp, const char *rpath) { + char path[PATH_MAX]; + char *pp; + char sep; + + if (dirp == NULL) { + if (strlcpy(path, rpath, PATH_MAX) >= PATH_MAX) { + errno = ENAMETOOLONG; + return -1; + } +#ifdef _WIN32 + /* Skip root dir (windows) */ + pp = strchr(path, '\\'); + if (pp) + while (*pp == '\\') pp++; + else + pp = path; +#else + pp = path; + /* Skip root dir (unix) */ + while (*pp == '/') pp++; +#endif + } else { + if (catp(path, dirp, rpath)) + return -1; + + pp = path + strlen(dirp); + } + while ((pp = strpbrk(pp + 1, "\\/")) != NULL) { + sep = *pp; + *pp = '\0'; + if (mkdir(path, 
0777)) { if (errno != EEXIST) return -1; /* For now we suppose that EEXIST means that the directory is - * already there.*/ + * already there. */ } - curp = pos + 1; - *pos = '/'; + *pp = sep; } return 0; @@ -130,7 +214,7 @@ static int write_entry(disk_cache *c, const cache_key_t k, if (key_path(c, k, hexp)) return -1; - if (ensureat(c->dirfd, hexp)) return -1; + if (ensurep(c->dirp, hexp)) return -1; if (strb_ensure(&b, 16)) return -1; b.l = 16; @@ -145,7 +229,7 @@ static int write_entry(disk_cache *c, const cache_key_t k, return -1; } - fd = mkstempat(c->dirfd, tmp_path); + fd = mkstempp(c->dirp, tmp_path); if (fd == -1) { strb_clear(&b); return -1; @@ -155,13 +239,18 @@ static int write_entry(disk_cache *c, const cache_key_t k, strb_clear(&b); close(fd); if (err) { - unlinkat(c->dirfd, tmp_path, 0); + unlinkp(c->dirp, tmp_path); return -1; } - if (renameat(c->dirfd, tmp_path, c->dirfd, hexp)) { - unlinkat(c->dirfd, tmp_path, 0); + if (renamep(c->dirp, tmp_path, hexp)) { + unlinkp(c->dirp, tmp_path); +#ifdef _WIN32 + /* On windows we can't rename over an existing file */ + return (errno != EACCES) ? 
-1 : 0; +#else return -1; +#endif } return 0; @@ -179,7 +268,7 @@ static int find_entry(disk_cache *c, const cache_key_t key, if (key_path(c, key, hexp)) return 0; - fd = openat(c->dirfd, hexp, O_RDONLY); + fd = openp(c->dirp, hexp, O_RDONLY, 0); if (fd == -1) return 0; @@ -255,7 +344,7 @@ static int disk_del(cache *_c, const cache_key_t key) { key_path(c, key, hexp); - return (unlinkat(c->dirfd, hexp, 0) == 0); + return (unlinkp(c->dirp, hexp) == 0); } static cache_value_t disk_get(cache *_c, const cache_key_t key) { @@ -277,7 +366,7 @@ static cache_value_t disk_get(cache *_c, const cache_key_t key) { static void disk_destroy(cache *_c) { disk_cache *c = (disk_cache *)_c; cache_destroy(c->mem); - close(c->dirfd); + free((void *)c->dirp); } cache *cache_disk(const char *dirpath, cache *mem, @@ -289,11 +378,10 @@ cache *cache_disk(const char *dirpath, cache *mem, if (dirp == NULL) return NULL; - if (ensureat(AT_FDCWD, dirp) != 0) { + if (ensurep(NULL, dirp) != 0) { free(dirp); return NULL; } - free(dirp); mkdir(dirpath, 0777); /* This may fail, but it's ok */ @@ -307,12 +395,7 @@ cache *cache_disk(const char *dirpath, cache *mem, if (res == NULL) return NULL; - res->dirfd = open(dirpath, O_RDONLY|O_CLOEXEC); - if (res->dirfd == -1) { - free(res); - return NULL; - } - + res->dirp = dirp; res->mem = mem; res->kwrite = kwrite; res->vwrite = vwrite; From a64059bd69e4b705d94b55ff3d8ed164319d2bde Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 1 Mar 2017 17:53:19 -0500 Subject: [PATCH 225/597] Fix bug in opencl gemmBatch bindings. 
--- src/gpuarray_blas_opencl_clblas.c | 4 ++-- src/gpuarray_blas_opencl_clblast.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 8ee019afb7..f6e51429b1 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -100,7 +100,7 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, ARRAY_INIT(C[i]); err = clblasSgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, - beta, C[i]->buf, offB[i], ldc, 1, &ctx->q, + beta, C[i]->buf, offC[i], ldc, 1, &ctx->q, num_ev, num_ev == 0 ? NULL : evl, &ev); if (err != clblasSuccess) return GA_BLAS_ERROR; @@ -132,7 +132,7 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, ARRAY_INIT(C[i]); err = clblasDgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, - beta, C[i]->buf, offB[i], ldc, 1, &ctx->q, + beta, C[i]->buf, offC[i], ldc, 1, &ctx->q, num_ev, num_ev == 0 ? 
NULL : evl, &ev); if (err != clblasSuccess) return GA_BLAS_ERROR; diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c index 4a5369e56e..c6fd010a3b 100644 --- a/src/gpuarray_blas_opencl_clblast.c +++ b/src/gpuarray_blas_opencl_clblast.c @@ -68,7 +68,7 @@ static int hgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, ARRAY_INIT(C[i]); err = CLBlastHgemm(convO(order), convT(transA), convT(transB), M, N, K, float_to_half(alpha), A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, - float_to_half(beta), C[i]->buf, offB[i], ldc, &ctx->q, &ev); + float_to_half(beta), C[i]->buf, offC[i], ldc, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; ARRAY_FINI(A[i]); @@ -97,7 +97,7 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, ARRAY_INIT(C[i]); err = CLBlastSgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, - beta, C[i]->buf, offB[i], ldc, &ctx->q, &ev); + beta, C[i]->buf, offC[i], ldc, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; ARRAY_FINI(A[i]); @@ -126,7 +126,7 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, ARRAY_INIT(C[i]); err = CLBlastDgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, - beta, C[i]->buf, offB[i], ldc, &ctx->q, &ev); + beta, C[i]->buf, offC[i], ldc, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; ARRAY_FINI(A[i]); From e280a3ddf33caa61846de2c4b6ada6c9487ab548 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 1 Mar 2017 17:58:02 -0500 Subject: [PATCH 226/597] Make MSVC slightly less verbose in its warnings. 
--- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 09f7f1fd4c..5d0761bc73 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/CMakeModules/") # -Wall is unbelieveably noisy with Visual Studio: # http://stackoverflow.com/q/4001736/3257826 if(MSVC) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W4 -D_CRT_SECURE_NO_WARNINGS") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W3 -D_CRT_SECURE_NO_WARNINGS") else() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") endif() From 4f25bc73ed46115769374cac859ba6cc6820a80d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 1 Mar 2017 18:03:35 -0500 Subject: [PATCH 227/597] Windows changes. --- CMakeLists.txt | 2 +- src/cache/disk.c | 15 ++++++++++++++- src/gpuarray_array.c | 6 +++--- src/gpuarray_buffer.c | 4 ++-- src/gpuarray_buffer_blas.c | 2 +- src/gpuarray_elemwise.c | 2 +- src/gpuarray_reduction.c | 2 +- src/private.h | 6 +++--- src/private_config.h.in | 4 +--- src/util/strb.c | 7 +++++++ 10 files changed, 34 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5d0761bc73..ddfefab53c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/CMakeModules/") # -Wall is unbelieveably noisy with Visual Studio: # http://stackoverflow.com/q/4001736/3257826 if(MSVC) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W3 -D_CRT_SECURE_NO_WARNINGS") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W3") else() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") endif() diff --git a/src/cache/disk.c b/src/cache/disk.c index 2f4fc77efe..0b827aaefa 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -1,3 +1,4 @@ +#define _CRT_SECURE_NO_WARNINGS #include #include #include @@ -11,7 +12,10 @@ #include #include +#include #include +#include +#include struct timezone; @@ -43,13 +47,22 @@ static int gettimeofday(struct timeval *tp, struct 
timezone *tzp) { return 0; } +#define open _open +#define unlink _unlink +#define mkdir(p, f) _mkdir(p) +#define close _close +#define strdup _strdup +#define lstat _stat64 +#define fstat _fstat64 +#define stat __stat64 + #else #define PATH_MAX 1024 #include #include +#include #endif -#include #include "cache.h" #include "util/skein.h" diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index 91c043a1d6..267eb5badc 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -83,7 +83,7 @@ static int ga_extcopy(GpuArray *dst, const GpuArray *src) { } /* Value below which a size_t multiplication will never overflow. */ -#define MUL_NO_OVERFLOW (1UL << (sizeof(size_t) * 4)) +#define MUL_NO_OVERFLOW (1ULL << (sizeof(size_t) * 4)) void GpuArray_fix_flags(GpuArray *a) { /* Only keep the writable flag */ @@ -330,9 +330,9 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str, const GpuArray *ind, int addr32) { strb sb = STRB_STATIC_INIT; int *atypes; - size_t nargs, apos; char *sz, *ssz; unsigned int i, i2; + unsigned int nargs, apos; int flags = GA_USE_CLUDA; int res; @@ -432,9 +432,9 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i, #if DEBUG char *errstr = NULL; #endif - size_t argp; GpuKernel k; unsigned int j; + unsigned int argp; int err, kerr = 0; int addr32 = 0; diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index d3226ea94e..a1e840c939 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -45,10 +45,10 @@ gpucontext *gpucontext_init(const char *name, int dev, int flags, int *ret) { if (res == NULL) return NULL; res->ops = ops; - if (gpucontext_property(res, GA_CTX_PROP_BLAS_OPS, &res->blas_ops) != GA_NO_ERROR) + if (gpucontext_property(res, GA_CTX_PROP_BLAS_OPS, (void *)&res->blas_ops) != GA_NO_ERROR) res->blas_ops = NULL; res->blas_handle = NULL; - if (gpucontext_property(res, GA_CTX_PROP_COMM_OPS, &res->comm_ops) != GA_NO_ERROR) + if (gpucontext_property(res, GA_CTX_PROP_COMM_OPS, 
(void *)&res->comm_ops) != GA_NO_ERROR) res->comm_ops = NULL; res->extcopy_cache = NULL; return res; diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c index c73f3c2f19..3fdc525e78 100644 --- a/src/gpuarray_buffer_blas.c +++ b/src/gpuarray_buffer_blas.c @@ -10,7 +10,7 @@ int gpublas_setup(gpucontext *ctx) { void gpublas_teardown(gpucontext *ctx) { if (ctx->blas_ops != NULL) - return ctx->blas_ops->teardown(ctx); + ctx->blas_ops->teardown(ctx); } const char *gpublas_error(gpucontext *ctx) { diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index f3ce7ee261..1d93e5a155 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -131,8 +131,8 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx, strb sb = STRB_STATIC_INIT; unsigned int i, _i, j; int *ktypes; - size_t p; char *size = "ga_size", *ssize = "ga_ssize"; + unsigned int p; int flags = GA_USE_CLUDA; int res; diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 12eedb24a9..b1a185e3b7 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -644,7 +644,7 @@ static int maxandargmaxCompile (maxandargmax_ctx* ctx){ GA_SIZE, /* dstArgmaxOff */ GA_BUFFER /* dstArgmaxSteps */ }; - const size_t ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES); + const unsigned int ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES); const char* SRCS[1]; SRCS[0] = ctx->sourceCode; diff --git a/src/private.h b/src/private.h index abe9783de7..2de8742674 100644 --- a/src/private.h +++ b/src/private.h @@ -26,9 +26,9 @@ extern "C" { } #endif -#define ADDR32_MAX 4294967295 -#define SADDR32_MIN -2147483648 -#define SADDR32_MAX 2147483647 +#define ADDR32_MAX 4294967295L +#define SADDR32_MIN -2147483648L +#define SADDR32_MAX 2147483647L struct _gpuarray_buffer_ops; typedef struct _gpuarray_buffer_ops gpuarray_buffer_ops; diff --git a/src/private_config.h.in b/src/private_config.h.in index c3cd3a0195..f58a03edae 100644 --- 
a/src/private_config.h.in +++ b/src/private_config.h.in @@ -22,9 +22,7 @@ extern "C" { #ifdef _MSC_VER /* God damn Microsoft ... */ #define snprintf _snprintf -#endif - -#ifdef _MSC_VER +#define strdup _strdup /* MS VC++ 2008 does not support inline */ #define inline __inline #define alloca _alloca diff --git a/src/util/strb.c b/src/util/strb.c index 15cd496c4f..22da8bf637 100644 --- a/src/util/strb.c +++ b/src/util/strb.c @@ -1,6 +1,13 @@ +#define _CRT_SECURE_NO_WARNINGS #include #include +#ifdef _MSC_VER +#include +#define read _read +#define write _write +#else #include +#endif #include "util/strb.h" From 0678d76e02a8bc21f06a586bd4e719d633db49b9 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 14 Mar 2017 15:47:21 -0400 Subject: [PATCH 228/597] Initialized the blas_ops pointer so that compilers stop freaking out. --- src/gpuarray_buffer_opencl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 89b56f80a7..e1e8fdd82a 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -212,7 +212,7 @@ cl_command_queue cl_get_stream(gpucontext *ctx) { } static void cl_free_ctx(cl_ctx *ctx) { - gpuarray_blas_ops *blas_ops; + gpuarray_blas_ops *blas_ops = NULL; ASSERT_CTX(ctx); assert(ctx->refcnt != 0); From f761cfaeb0705d4505caa382ce24acc0ccbd86a6 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 14 Mar 2017 15:47:45 -0400 Subject: [PATCH 229/597] Add support for floats in GpuArray_dump(). 
--- src/gpuarray_array.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index 267eb5badc..45a2a1186e 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -1096,6 +1096,9 @@ int GpuArray_fdump(FILE *fd, const GpuArray *a) { case GA_LONG: fprintf(fd, "%lld", (long long)*(int64_t *)p); break; + case GA_FLOAT: + fprintf(fd, "%f", *(float *)p); + break; case GA_SSIZE: fprintf(fd, "%" SPREFIX "d", *(ssize_t *)p); break; From eca4ffd3cf1e3056f70b4e6263db54c7427934fe Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 14 Mar 2017 16:20:39 -0400 Subject: [PATCH 230/597] Fix mkstempp to actually return the filename. --- src/cache/disk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cache/disk.c b/src/cache/disk.c index 0b827aaefa..3ded869829 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -129,7 +129,7 @@ static int mkstempp(const char *dirp, char *template) { res = mkstemp(path); /* We need to copy the result path back */ - if (res == 0) + if (res != -1) memcpy(template, &path[strlen(dirp)], strlen(template)); return res; From b78e9d42805174bdf2e26f0da1076492eb71c14e Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 14 Mar 2017 18:14:35 -0400 Subject: [PATCH 231/597] Fix windows annoyances. --- make.bat | 2 +- setup.py | 2 +- src/gpuarray/blas.h | 4 ++-- src/gpuarray_array_blas.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/make.bat b/make.bat index 1ea7aa55cd..16bc79f441 100755 --- a/make.bat +++ b/make.bat @@ -1,6 +1,6 @@ del bld mkdir bld cd bld -cmake .. -G "NMake Makefiles" +cmake .. -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release cmake --build . --config Release cd .. 
diff --git a/setup.py b/setup.py index 0084e04b4a..10f5996b54 100755 --- a/setup.py +++ b/setup.py @@ -82,7 +82,7 @@ def __init__(self, *args, **kwargs): current_dir = os.path.abspath(os.path.dirname(__file__)) include_dirs += [os.path.join(current_dir, 'src')] - default_bin_dir = os.path.join(current_dir, 'lib', 'Release') + default_bin_dir = os.path.join(current_dir, 'lib') if not os.path.isdir(default_bin_dir): raise RuntimeError('default binary dir {} does not exist, you may need to build the C library in release mode'.format(default_bin_dir)) library_dirs += [default_bin_dir] diff --git a/src/gpuarray/blas.h b/src/gpuarray/blas.h index a8dd8096bc..a59d3bb885 100644 --- a/src/gpuarray/blas.h +++ b/src/gpuarray/blas.h @@ -9,8 +9,8 @@ extern "C" { #endif // only for vector-vector dot -GPUARRAY_PUBLIC int GpuArray_rdot( GpuArray *X, GpuArray *Y, - GpuArray *Z, int nocopy); +GPUARRAY_PUBLIC int GpuArray_rdot(GpuArray *X, GpuArray *Y, + GpuArray *Z, int nocopy); #define GpuArray_hdot GpuArray_rdot #define GpuArray_sdot GpuArray_rdot #define GpuArray_ddot GpuArray_rdot diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index 8f9fb5919b..2e9a398e2d 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -5,8 +5,8 @@ #include "gpuarray/util.h" #include "gpuarray/error.h" -int GpuArray_rdot( GpuArray *X, GpuArray *Y, - GpuArray *Z, int nocopy) { +int GpuArray_rdot(GpuArray *X, GpuArray *Y, + GpuArray *Z, int nocopy) { GpuArray *Xp = X; GpuArray copyX; GpuArray *Yp = Y; From 88601038668cc607c954ceee9af1a4983ec213de Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 14 Mar 2017 19:07:29 -0400 Subject: [PATCH 232/597] Make sure to properly terminate the cache path with a separator and work around lstat choking on terminating separators on windows. 
--- src/cache/disk.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/src/cache/disk.c b/src/cache/disk.c index 3ded869829..6d260ac559 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -387,20 +387,45 @@ cache *cache_disk(const char *dirpath, cache *mem, kread_fn kread, vread_fn vread) { struct stat st; disk_cache *res; - char *dirp = strdup(dirpath); + char *dirp; + size_t dirl = strlen(dirpath); + char sep = '/'; + + /* This trickery is to make sure the path ends with a separator */ +#ifdef _WIN32 + if (dirpath[dirl - 1] == '\\') + sep = '\\'; +#endif + + if (dirpath[dirl - 1] != sep) dirl++; + + dirp = malloc(dirl + 1); /* With the NUL */ if (dirp == NULL) return NULL; + strlcpy(dirp, dirpath, dirl + 1); + + if (dirp[dirl - 1] != sep) { + dirp[dirl - 1] = sep; + dirp[dirl] = '\0'; + } + if (ensurep(NULL, dirp) != 0) { free(dirp); return NULL; } - mkdir(dirpath, 0777); /* This may fail, but it's ok */ + /* For Windows mkdir and lstat which can't handle trailing separator */ + dirp[dirl - 1] = '\0'; - if (lstat(dirpath, &st) != 0) + mkdir(dirp, 0777); /* This may fail, but it's ok */ + + if (lstat(dirp, &st) != 0) return NULL; + /* Restore the good path at the end */ + dirp[dirl - 1] = sep; + if (!(st.st_mode & S_IFDIR)) return NULL; From 97d855339c78cfc30fc4566d9828d1dbfa95657f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 14 Mar 2017 19:46:41 -0400 Subject: [PATCH 233/597] Make sure to open cache files in binary mode for windows. 
--- src/cache/disk.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/cache/disk.c b/src/cache/disk.c index 6d260ac559..db0917b1f4 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -61,6 +61,9 @@ static int gettimeofday(struct timeval *tp, struct timezone *tzp) { #include #include #include + +#define O_BINARY 0 + #endif @@ -281,7 +284,7 @@ static int find_entry(disk_cache *c, const cache_key_t key, if (key_path(c, key, hexp)) return 0; - fd = openp(c->dirp, hexp, O_RDONLY, 0); + fd = openp(c->dirp, hexp, O_RDONLY|O_BINARY, 0); if (fd == -1) return 0; From 46c2f08fca5bcd5e9bc54351b68a4ef4b886abe5 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 16 Mar 2017 18:32:06 -0400 Subject: [PATCH 234/597] Don't crash on key read failure. --- src/cache/disk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cache/disk.c b/src/cache/disk.c index db0917b1f4..f77128906f 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -337,7 +337,8 @@ static int find_entry(disk_cache *c, const cache_key_t key, return 1; } error: - c->c.kfree(k); + if (k) + c->c.kfree(k); b.s = ts; strb_clear(&b); return 0; From 01e5fa22706d48b8fde4e3aa39fa809c00d3f59d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 16 Mar 2017 18:33:26 -0400 Subject: [PATCH 235/597] Error out if we reach EOF before the passed-in length. 
--- src/util/strb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/util/strb.c b/src/util/strb.c index 22da8bf637..c8ae4da25d 100644 --- a/src/util/strb.c +++ b/src/util/strb.c @@ -73,8 +73,8 @@ void strb_read(strb *sb, int fd, size_t sz) { sb->l += sz; while (sz) { res = read(fd, b, sz); - if (res == -1) { - if (errno == EAGAIN || errno == EINTR) + if (res == -1 || res == 0) { + if (res == -1 && errno == EAGAIN || errno == EINTR) continue; strb_seterror(sb); return; From f03754961edddc597c3c44e78ec5c453159bd84c Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 16 Mar 2017 19:14:47 -0400 Subject: [PATCH 236/597] Make sure to open files in binary mode. --- src/gpuarray_mkstemp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gpuarray_mkstemp.c b/src/gpuarray_mkstemp.c index ac5ea10940..5e2e8ca520 100644 --- a/src/gpuarray_mkstemp.c +++ b/src/gpuarray_mkstemp.c @@ -8,6 +8,8 @@ #include #define open _open #define mktemp _mktemp +#else +#define O_BINARY 0 #endif int mkstemp(char *path) { @@ -18,7 +20,7 @@ int mkstemp(char *path) { do { tmp = mktemp(path); if (tmp == NULL) return -1; - res = open(path, O_CREAT|O_EXCL|O_RDWR, S_IREAD|S_IWRITE); + res = open(path, O_CREAT|O_EXCL|O_RDWR|O_BINARY, S_IREAD|S_IWRITE); if (res != -1 || errno != EEXIST) return res; } while (--tries); From a9ebffa0d0056b3a4d6e8fac12dc27f8e8a7fbfd Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 16 Mar 2017 19:17:28 -0400 Subject: [PATCH 237/597] Make sure to open files in binary mode for writing too. 
--- src/cache/disk.c | 7 +++++-- src/util/strb.c | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/cache/disk.c b/src/cache/disk.c index f77128906f..a7f5dbbcd7 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -63,6 +63,7 @@ static int gettimeofday(struct timeval *tp, struct timezone *tzp) { #include #define O_BINARY 0 +#define setmode(a, b) #endif @@ -131,9 +132,11 @@ static int mkstempp(const char *dirp, char *template) { res = mkstemp(path); - /* We need to copy the result path back */ - if (res != -1) + /* We need to copy the result path back and set binary mode (for windows) */ + if (res != -1) { + setmode(res, O_BINARY); memcpy(template, &path[strlen(dirp)], strlen(template)); + } return res; } diff --git a/src/util/strb.c b/src/util/strb.c index c8ae4da25d..dda9dcdfc2 100644 --- a/src/util/strb.c +++ b/src/util/strb.c @@ -74,7 +74,7 @@ void strb_read(strb *sb, int fd, size_t sz) { while (sz) { res = read(fd, b, sz); if (res == -1 || res == 0) { - if (res == -1 && errno == EAGAIN || errno == EINTR) + if (res == -1 && (errno == EAGAIN || errno == EINTR)) continue; strb_seterror(sb); return; From 3c6fb5dc645c0c8364f4586634b9b5e2e48deeb0 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 16 Mar 2017 20:15:10 -0400 Subject: [PATCH 238/597] Fix ntohull for platforms that have signed chars. 
--- src/cache/disk.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/cache/disk.c b/src/cache/disk.c index a7f5dbbcd7..6a5e086a10 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -63,7 +63,7 @@ static int gettimeofday(struct timeval *tp, struct timezone *tzp) { #include #define O_BINARY 0 -#define setmode(a, b) +#define _setmode(a, b) #endif @@ -84,7 +84,8 @@ typedef struct _disk_cache { } disk_cache; -static unsigned long long ntohull(const char *in) { +static unsigned long long ntohull(const char *_in) { + const unsigned char *in = (const unsigned char *)_in; return ((unsigned long long)in[0] << 56 | (unsigned long long)in[1] << 48 | (unsigned long long)in[2] << 40 | (unsigned long long)in[3] << 32 | (unsigned long long)in[4] << 24 | (unsigned long long)in[5] << 16 | @@ -92,14 +93,14 @@ static unsigned long long ntohull(const char *in) { } static void htonull(unsigned long long in, char *out) { - out[0] = (char)(in >> 56); - out[1] = (char)(in >> 48); - out[2] = (char)(in >> 40); - out[3] = (char)(in >> 32); - out[4] = (char)(in >> 24); - out[5] = (char)(in >> 16); - out[6] = (char)(in >> 8); - out[7] = (char)(in); + out[0] = (unsigned char)(in >> 56); + out[1] = (unsigned char)(in >> 48); + out[2] = (unsigned char)(in >> 40); + out[3] = (unsigned char)(in >> 32); + out[4] = (unsigned char)(in >> 24); + out[5] = (unsigned char)(in >> 16); + out[6] = (unsigned char)(in >> 8); + out[7] = (unsigned char)(in); } static int catp(char *path, const char *dirp, const char *rpath) { @@ -134,7 +135,7 @@ static int mkstempp(const char *dirp, char *template) { /* We need to copy the result path back and set binary mode (for windows) */ if (res != -1) { - setmode(res, O_BINARY); + _setmode(res, O_BINARY); memcpy(template, &path[strlen(dirp)], strlen(template)); } From 26b85ee432bcbc261f5760ac55cdc3be2b15eeb6 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 20 Mar 2017 17:33:42 -0400 Subject: [PATCH 239/597] Make 
GpuKernel_binary work again. --- src/gpuarray/buffer.h | 17 +++++++++++++++-- src/gpuarray/error.h | 1 - src/gpuarray/kernel.h | 1 - src/gpuarray_buffer.c | 2 +- src/gpuarray_buffer_cuda.c | 19 +++++++++++++++---- src/gpuarray_buffer_opencl.c | 29 +++++++++++++++++++++++++++++ src/gpuarray_error.c | 1 - src/private.h | 1 + src/private_cuda.h | 2 ++ 9 files changed, 63 insertions(+), 10 deletions(-) diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index 34878d503a..800756a072 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -494,9 +494,22 @@ GPUARRAY_PUBLIC int gpukernel_call(gpukernel *k, unsigned int n, size_t shared, void **args); /** - * Get the kernel binary (REMOVED). + * (Deprecated) Get the kernel binary. * - * Always returns GA_DEPRECATED_ERROR. + * This function is deprecated and will be removed in the next release. + * + * This can be use to cache kernel binaries after compilation of a + * specific device. The kernel can be recreated by calling + * kernel_alloc with the binary and size and passing `GA_USE_BINARY` + * as the use flags. + * + * The returned pointer is allocated and must be freed by the caller. + * + * \param k kernel + * \param sz size of the returned binary + * \param obj pointer to the binary for the kernel. + * + * \returns GA_NO_ERROR or an error code if an error occurred. 
*/ GPUARRAY_PUBLIC int gpukernel_binary(gpukernel *k, size_t *sz, void **obj); diff --git a/src/gpuarray/error.h b/src/gpuarray/error.h index 84c852a257..af963c1531 100644 --- a/src/gpuarray/error.h +++ b/src/gpuarray/error.h @@ -36,7 +36,6 @@ enum ga_error { GA_COMM_ERROR, GA_XLARGE_ERROR, GA_LOAD_ERROR, - GA_DEPRECATED_ERROR, /* Add more error types if needed, but at the end */ /* Don't forget to sync with Gpu_error() */ }; diff --git a/src/gpuarray/kernel.h b/src/gpuarray/kernel.h index da779123b9..f88d74ffc6 100644 --- a/src/gpuarray/kernel.h +++ b/src/gpuarray/kernel.h @@ -107,7 +107,6 @@ GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, unsigned int n, const size_t *gs, const size_t *ls, size_t shared, void **args); -/* Deprecated and to be removed */ GPUARRAY_PUBLIC int GpuKernel_binary(const GpuKernel *k, size_t *sz, void **obj); diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index a1e840c939..dee65f130c 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -187,7 +187,7 @@ int gpukernel_call(gpukernel *k, unsigned int n, const size_t *gs, } int gpukernel_binary(gpukernel *k, size_t *sz, void **obj) { - return GA_DEPRECATED_ERROR; + return ((partial_gpukernel *)k)->ctx->ops->kernel_binary(k, sz, obj); } int gpukernel_property(gpukernel *k, int prop_id, void *res) { diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 7b1f540c26..fe0c100508 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1151,6 +1151,7 @@ static void _cuda_freekernel(gpukernel *k) { } CLEAR(k); free(k->args); + free(k->bin); free(k->types); free(k); } @@ -1264,13 +1265,15 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, FAIL(NULL, GA_SYS_ERROR); } + /* Don't clear bin after this */ + res->bin_sz = bin.l; + res->bin = bin.s; res->refcnt = 1; res->argcount = argcount; res->types = calloc(argcount, sizeof(int)); if (res->types == NULL) { _cuda_freekernel(res); strb_clear(&src); - strb_clear(&bin); 
cuda_exit(ctx); FAIL(NULL, GA_MEMORY_ERROR); } @@ -1279,7 +1282,6 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, if (res->args == NULL) { _cuda_freekernel(res); strb_clear(&src); - strb_clear(&bin); cuda_exit(ctx); FAIL(NULL, GA_MEMORY_ERROR); } @@ -1288,11 +1290,9 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, if (ctx->err != CUDA_SUCCESS) { _cuda_freekernel(res); strb_clear(&src); - strb_clear(&bin); cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } - strb_clear(&bin); ctx->err = cuModuleGetFunction(&res->k, res->m, fname); if (ctx->err != CUDA_SUCCESS) { @@ -1390,6 +1390,16 @@ static int cuda_callkernel(gpukernel *k, unsigned int n, return GA_NO_ERROR; } +static int cuda_kernelbin(gpukernel *k, size_t *sz, void **obj) { + void *res = malloc(k->bin_sz); + if (res == NULL) + return GA_MEMORY_ERROR; + memcpy(res, k->bin, k->bin_sz); + *sz = k->bin_sz; + *obj = res; + return GA_NO_ERROR; +} + static int cuda_sync(gpudata *b) { cuda_context *ctx = (cuda_context *)b->ctx; int err = GA_NO_ERROR; @@ -1792,6 +1802,7 @@ const gpuarray_buffer_ops cuda_ops = {cuda_get_platform_count, cuda_freekernel, cuda_kernelsetarg, cuda_callkernel, + cuda_kernelbin, cuda_sync, cuda_transfer, cuda_property, diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index e1e8fdd82a..3e87f23c41 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -1076,6 +1076,34 @@ static int cl_callkernel(gpukernel *k, unsigned int n, return GA_NO_ERROR; } +static int cl_kernelbin(gpukernel *k, size_t *sz, void **obj) { + cl_ctx *ctx = k->ctx; + cl_program p; + size_t rsz; + void *res; + + ASSERT_KER(k); + ASSERT_CTX(ctx); + + ctx->err = clGetKernelInfo(k->k, CL_KERNEL_PROGRAM, sizeof(p), &p, NULL); + if (ctx->err != CL_SUCCESS) + return GA_IMPL_ERROR; + ctx->err = clGetProgramInfo(p, CL_PROGRAM_BINARY_SIZES, sizeof(rsz), &rsz, NULL); + if (ctx->err != CL_SUCCESS) + return GA_IMPL_ERROR; + res = malloc(rsz); + if (res == 
NULL) + return GA_MEMORY_ERROR; + ctx->err = clGetProgramInfo(p, CL_PROGRAM_BINARIES, sizeof(res), &res, NULL); + if (ctx->err != CL_SUCCESS) { + free(res); + return GA_IMPL_ERROR; + } + *sz = rsz; + *obj = res; + return GA_NO_ERROR; +} + static int cl_sync(gpudata *b) { cl_ctx *ctx = (cl_ctx *)b->ctx; @@ -1437,6 +1465,7 @@ const gpuarray_buffer_ops opencl_ops = {cl_get_platform_count, cl_releasekernel, cl_setkernelarg, cl_callkernel, + cl_kernelbin, cl_sync, cl_transfer, cl_property, diff --git a/src/gpuarray_error.c b/src/gpuarray_error.c index ddebd3e9dc..b7d5011f5b 100644 --- a/src/gpuarray_error.c +++ b/src/gpuarray_error.c @@ -25,7 +25,6 @@ const char *gpuarray_error_str(int err) { case GA_COMM_ERROR: return "Error in collectives call"; case GA_XLARGE_ERROR: return "Input size too large for operation"; case GA_LOAD_ERROR: return "Error loading library"; - case GA_DEPRECATED_ERROR: return "Deprecated (removed) functionality"; default: return "Unknown GA error"; } } diff --git a/src/private.h b/src/private.h index 2de8742674..820ebb6287 100644 --- a/src/private.h +++ b/src/private.h @@ -100,6 +100,7 @@ struct _gpuarray_buffer_ops { const size_t *gs, const size_t *ls, size_t shared, void **args); + int (*kernel_binary)(gpukernel *k, size_t *sz, void **obj); int (*buffer_sync)(gpudata *b); int (*buffer_transfer)(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, size_t sz); diff --git a/src/private_cuda.h b/src/private_cuda.h index a0b4557977..ad9ff7f8ae 100644 --- a/src/private_cuda.h +++ b/src/private_cuda.h @@ -137,6 +137,8 @@ struct _gpukernel { CUmodule m; CUfunction k; void **args; + size_t bin_sz; + void *bin; int *types; unsigned int argcount; unsigned int refcnt; From 330703150aaaa7614faa8a524319c9e8977e8ec1 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 30 Mar 2017 11:54:09 -0400 Subject: [PATCH 240/597] Add a description of the purpose of make.bat. 
--- make.bat | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/make.bat b/make.bat index 16bc79f441..3402206e00 100755 --- a/make.bat +++ b/make.bat @@ -1,3 +1,7 @@ +REM This helps repetitive builds on windows +REM It needs the compiler you want to use to be available in the shell +REM and it will build a release version + del bld mkdir bld cd bld From e62616b4d6a04c2ba3fdcba4f3d000df3ff7d0a6 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 30 Mar 2017 11:54:34 -0400 Subject: [PATCH 241/597] Clean up the cache management script. --- bin/gpuarray-cache | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/bin/gpuarray-cache b/bin/gpuarray-cache index 04b7e8e68a..3cfb429a76 100644 --- a/bin/gpuarray-cache +++ b/bin/gpuarray-cache @@ -1,11 +1,11 @@ #!/usr/bin/env python import os +import sys -def clean(max_size): +def clean(max_size, path): content = [] - for root, dirs, files in os.walk(os.environ.get('GPUARRAY_CACHE', - '~/.gpuarray/cache/')): + for root, dirs, files in os.walk(path): for file in files: fpath = os.path.join(root, file) st = os.stat(fpath) @@ -25,18 +25,18 @@ SUFFIXES = {'B': 1, 'K': 1 << 10, 'M': 1 << 20, 'G': 1 << 30, 'T': 1 << 40, def get_size(s): i = 0 - while i < len(s) and (s[i].isdigit() or s[i] == '.'): - i += 1 - num = s[:i] - suf = s[i:] + s = s.strip() + if s[-1].upper() in SUFFIXES: + num = s[:-1] + suf = s[-1].upper() + else: + num = s + suf = "" num = float(num) if suf != "": - letter = suf.strip().upper() - if letter not in SUFFIXES: - raise ValueError("can't interpret %r" % init) - mult = SUFFIXES[letter] + mult = SUFFIXES[suf] else: - mult = 0 + mult = 1 return int(num * mult) @@ -46,6 +46,10 @@ if __name__ == '__main__': parser = argparse.ArgumentParser(description='libgpuarray cache maintenance utility') parser.add_argument('-s', '--max_size', help='Set the maximum size for pruning (in bytes with suffixes: K, M, G, ...)') args = parser.parse_args() + path = 
os.environ.get('GPUARRAY_CACHE_PATH', None) + if path is None: + print("You need to set GPUARRAY_CACHE_PATH so that this programs knows which path to clean.") + sys.exit(1) - clean(get_size(args.max_size)) + clean(get_size(args.max_size), path) From 1a1cab6a1307dd0f5fcfa2dc3bed73cc532573a3 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 30 Mar 2017 11:56:17 -0400 Subject: [PATCH 242/597] Add the appropriate bumps. --- setup.py | 4 ++-- src/CMakeLists.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 10f5996b54..7ec8d3af93 100755 --- a/setup.py +++ b/setup.py @@ -5,8 +5,8 @@ MAJOR = 0 MINOR = 6 -PATCH = 2 -SUFFIX = '' +PATCH = 3 +SUFFIX = '.dev0' # include the '.' FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX) try: diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0ed776533d..a45db024ff 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -89,7 +89,7 @@ set_target_properties(gpuarray PROPERTIES INSTALL_NAME_DIR ${CMAKE_INSTALL_PREFIX}/lib MACOSX_RPATH OFF # This is the shared library version - VERSION 2.0 + VERSION 2.1 ) add_library(gpuarray-static STATIC ${GPUARRAY_SRC}) From 878566be656aa4eb116c2f5aea70c6818b7e37f2 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 30 Mar 2017 12:13:42 -0400 Subject: [PATCH 243/597] Add comments and fix style. 
--- src/gpuarray_buffer_cuda.c | 2 +- src/private_cuda.h | 2 +- src/util/strb.h | 11 +++++++---- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index fe0c100508..4612573f5f 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1034,7 +1034,7 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { if (strb_ensure(ptx, buflen) == 0) { err = nvrtcGetPTX(prog, ptx->s+ptx->l); - if (err == NVRTC_SUCCESS) ptx->l = buflen; + if (err == NVRTC_SUCCESS) ptx->l += buflen; } end: diff --git a/src/private_cuda.h b/src/private_cuda.h index ad9ff7f8ae..dc81ceba52 100644 --- a/src/private_cuda.h +++ b/src/private_cuda.h @@ -68,7 +68,7 @@ typedef struct _cuda_context { CUstream mem_s; gpudata *freeblocks; cache *kernel_cache; - cache *disk_cache; + cache *disk_cache; // This is per-context to avoid lock contention unsigned int enter; unsigned char major; unsigned char minor; diff --git a/src/util/strb.h b/src/util/strb.h index 01ea7a2495..3289de5796 100644 --- a/src/util/strb.h +++ b/src/util/strb.h @@ -46,7 +46,7 @@ strb *strb_alloc(size_t s); * * Don't call this for stack of global declarations, see strb_clear() instead. */ -void strb_free(strb *); +void strb_free(strb *sb); /* * Return a pointer to a dynamically allocated strb with a default @@ -96,7 +96,7 @@ static inline void strb_clear(strb *sb) { * This should almost never be called directly. Use strb_ensure() * instead. */ -int strb_grow(strb *, size_t s); +int strb_grow(strb *sb, size_t s); /* * Make sure there is space to store at least `s` bytes of data after @@ -159,14 +159,17 @@ static inline void strb_appendb(strb *sb, const strb *sb2) { * * A format error will place the strb in error mode. */ -void strb_appendf(strb *, const char *f, ...); +void strb_appendf(strb *sb, const char *f, ...); /* * Reads from the file specified by the given file descriptor. 
* + * This will read `sz` bytes from the file descriptor. Insufficient + * data is handled as a read error. + * * A read error will place the strb in error mode. */ -void strb_read(strb *, int fd, size_t sz); +void strb_read(strb *sb, int fd, size_t sz); /* * Write the content of an strb to the specified file descriptor. From 6934341bc62d7d286e7ed849c1af99d4ec2caec3 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 30 Mar 2017 12:36:06 -0400 Subject: [PATCH 244/597] Add some explanatory comments. --- src/cache/disk.c | 5 +++++ src/gpuarray_buffer_cuda.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/cache/disk.c b/src/cache/disk.c index 6a5e086a10..601690a421 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -84,6 +84,7 @@ typedef struct _disk_cache { } disk_cache; +/* Convert unsigned long long from network to host order */ static unsigned long long ntohull(const char *_in) { const unsigned char *in = (const unsigned char *)_in; return ((unsigned long long)in[0] << 56 | (unsigned long long)in[1] << 48 | @@ -92,6 +93,7 @@ static unsigned long long ntohull(const char *_in) { (unsigned long long)in[6] << 8 | (unsigned long long)in[7]); } +/* Convert unsigned long long from host to network order */ static void htonull(unsigned long long in, char *out) { out[0] = (unsigned char)(in >> 56); out[1] = (unsigned char)(in >> 48); @@ -103,6 +105,8 @@ static void htonull(unsigned long long in, char *out) { out[7] = (unsigned char)(in); } +/* Concatenate prefix and suffix into a single path string while + checking for overflow */ static int catp(char *path, const char *dirp, const char *rpath) { if (strlcpy(path, dirp, PATH_MAX) >= PATH_MAX) { errno = ENAMETOOLONG; @@ -115,6 +119,7 @@ static int catp(char *path, const char *dirp, const char *rpath) { return 0; } +/* open() for a path specifed by the concatenation of dirp and rpath */ static int openp(const char *dirp, const char *rpath, int flags, int mode) { char path[PATH_MAX]; diff 
--git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 4612573f5f..8614eced61 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1025,7 +1025,7 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { strb_appends(log, "NVRTC compile log::\n"); if (strb_ensure(log, buflen) == 0) if (nvrtcGetProgramLog(prog, log->s+log->l) == NVRTC_SUCCESS) - log->l += buflen - 1; + log->l += buflen - 1; // Remove the final NUL strb_appendc(log, '\n'); } From 137395c005a430f5a951be8397d5eea949b8f73d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 30 Mar 2017 18:56:48 -0400 Subject: [PATCH 245/597] Fix aliasing of src strb between the caches. --- src/gpuarray_buffer_cuda.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 8614eced61..a406a85852 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1120,17 +1120,23 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { err = make_bin(ctx, &ptx, bin, log); if (err != GA_NO_ERROR) return err; if (ctx->disk_cache) { - pk = memdup(&k, sizeof(k)); + pk = calloc(sizeof(kernel_key), 1); if (pk == NULL) return GA_NO_ERROR; + memcpy(pk->bin_id, k.bin_id, 64); + strb_appendb(&pk->src, src); + if (strb_error(&pk->src)) { + key_free((cache_key_t)pk); + return GA_NO_ERROR; + } cbin = strb_alloc(bin->l); if (cbin == NULL) { - free(pk); + key_free((cache_key_t)pk); return GA_NO_ERROR; } strb_appendb(cbin, bin); if (strb_error(cbin)) { - free(pk); + key_free((cache_key_t)pk); strb_free(cbin); return GA_NO_ERROR; } From e3f7562a0e35cfc0b09f4689ce2ab413ba7c1fe4 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 3 Apr 2017 16:18:45 -0400 Subject: [PATCH 246/597] Fix contiguous detection to properly handle F-contiguous input. 
--- src/gpuarray_array_blas.c | 58 +++++++++++++++++++++++---------- src/gpuarray_blas_cuda_cublas.c | 6 ++-- 2 files changed, 43 insertions(+), 21 deletions(-) diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index 8f9fb5919b..707c7c1510 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -439,6 +439,23 @@ int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A, return err; } +static inline int is_2d_contiguous(const GpuArray *a) { + size_t size = GpuArray_ITEMSIZE(a); + + if (GpuArray_IS_C_CONTIGUOUS(a)) + return 1; // C contiguous + + if (a->strides[1] <= 0 || a->strides[2] <= 0) + return 0; + + if (a->strides[1] == size) + return 2; // F contiguous + if (a->strides[2] == size) + return 1; // C contiguous + + return 0; +} + int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alpha, GpuArray *A, GpuArray *B, double beta, GpuArray *C, int nocopy) { @@ -451,6 +468,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph size_t elsize; size_t batchCount, m, n, k, lda, ldb, ldc; cb_order o; + int cA, cB, cC; int err; gpudata **A_datas = NULL, **B_datas = NULL, **C_datas = NULL; size_t *A_offsets = NULL, *B_offsets = NULL, *C_offsets = NULL; @@ -495,52 +513,56 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph elsize = gpuarray_get_elsize(A->typecode); - // FIXME: these conditions are overly restrictive; the first axis need not be contiguous - if (!GpuArray_ISONESEGMENT(A)) { + cA = is_2d_contiguous(A); + if (!cA) { if (nocopy) return GA_COPY_ERROR; else { - err = GpuArray_copy(©A, A, GA_F_ORDER); + err = GpuArray_copy(©A, A, GA_C_ORDER); + cA = 1; if (err != GA_NO_ERROR) goto cleanup; Ap = ©A; } } - if (!GpuArray_ISONESEGMENT(B)) { + cB = is_2d_contiguous(B); + if (!cB) { if (nocopy) return GA_COPY_ERROR; else { - err = GpuArray_copy(©B, B, GA_F_ORDER); + err = GpuArray_copy(©B, B, GA_C_ORDER); + cB = 1; if (err != GA_NO_ERROR) 
goto cleanup; Bp = ©B; } } - if (!GpuArray_ISONESEGMENT(C)) { + cC = is_2d_contiguous(C); + if (!cC) { err = GA_VALUE_ERROR; goto cleanup; } - if (Cp->flags & GA_F_CONTIGUOUS) { + if (cC == 2) { o = cb_fortran; - ldc = Cp->dimensions[1]; - } else if (Cp->flags & GA_C_CONTIGUOUS) { + ldc = Cp->strides[2] / elsize; + } else if (cC == 1) { o = cb_c; - ldc = Cp->dimensions[2]; + ldc = Cp->strides[1] / elsize; } else { err = GA_VALUE_ERROR; goto cleanup; } - if (Ap->flags & GA_F_CONTIGUOUS) { - lda = Ap->dimensions[1]; + if (cA == 2) { + lda = Ap->strides[2] / elsize; if (o == cb_c) { if (transA == cb_no_trans) transA = cb_trans; else transA = cb_no_trans; } - } else if (Ap->flags & GA_C_CONTIGUOUS) { - lda = Ap->dimensions[2]; + } else if (cA == 1) { + lda = Ap->strides[1] / elsize; if (o == cb_fortran) { if (transA == cb_no_trans) transA = cb_trans; @@ -551,16 +573,16 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph err = GA_VALUE_ERROR; goto cleanup; } - if (Bp->flags & GA_F_CONTIGUOUS) { - ldb = Bp->dimensions[1]; + if (cB == 2) { + ldb = Bp->strides[2] / elsize; if (o == cb_c) { if (transB == cb_no_trans) transB = cb_trans; else transB = cb_no_trans; } - } else if (Bp->flags & GA_C_CONTIGUOUS) { - ldb = Bp->dimensions[2]; + } else if (cB == 1) { + ldb = Bp->strides[1] / elsize; if (o == cb_fortran) { if (transB == cb_no_trans) transB = cb_trans; diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 6d4648e232..a9018f155f 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -589,10 +589,10 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, h->err = cublasSgemm(h->h, convT(transA), convT(transB), M, N, K, &alpha, - (float*)A[i]->ptr + offA[i], lda, - (float*)B[i]->ptr + offB[i], ldb, + ((float*)A[i]->ptr) + offA[i], lda, + ((float*)B[i]->ptr) + offB[i], ldb, &beta, - (float*)C[i]->ptr + offC[i], ldc); + ((float*)C[i]->ptr) + offC[i], 
ldc); if (h->err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) From 89a2f44f532e665b944fd3ee4e779dd8cbb60c60 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 4 Apr 2017 13:42:50 -0400 Subject: [PATCH 247/597] Rename is_2d_contiguous and add some tests. --- src/gpuarray_array.c | 3 ++ src/gpuarray_array_blas.c | 14 +++--- tests/check_blas.c | 90 +++++++++++++++++++++++++++++++++++++-- 3 files changed, 97 insertions(+), 10 deletions(-) diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index 91c043a1d6..159d33694e 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -1096,6 +1096,9 @@ int GpuArray_fdump(FILE *fd, const GpuArray *a) { case GA_LONG: fprintf(fd, "%lld", (long long)*(int64_t *)p); break; + case GA_FLOAT: + fprintf(fd, "%f", *(float *)p); + break; case GA_SSIZE: fprintf(fd, "%" SPREFIX "d", *(ssize_t *)p); break; diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index 707c7c1510..48e29d05ab 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -439,18 +439,18 @@ int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A, return err; } -static inline int is_2d_contiguous(const GpuArray *a) { +static inline int is_last_2d_contiguous(const GpuArray *a) { size_t size = GpuArray_ITEMSIZE(a); if (GpuArray_IS_C_CONTIGUOUS(a)) return 1; // C contiguous - if (a->strides[1] <= 0 || a->strides[2] <= 0) + if (a->strides[a->nd - 2] <= 0 || a->strides[a->nd - 1] <= 0) return 0; - if (a->strides[1] == size) + if (a->strides[a->nd - 2] == size) return 2; // F contiguous - if (a->strides[2] == size) + if (a->strides[a->nd - 1] == size) return 1; // C contiguous return 0; @@ -513,7 +513,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph elsize = gpuarray_get_elsize(A->typecode); - cA = is_2d_contiguous(A); + cA = is_last_2d_contiguous(A); if (!cA) { if (nocopy) return GA_COPY_ERROR; @@ -525,7 +525,7 @@ int 
GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph Ap = ©A; } } - cB = is_2d_contiguous(B); + cB = is_last_2d_contiguous(B); if (!cB) { if (nocopy) return GA_COPY_ERROR; @@ -537,7 +537,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph Bp = ©B; } } - cC = is_2d_contiguous(C); + cC = is_last_2d_contiguous(C); if (!cC) { err = GA_VALUE_ERROR; goto cleanup; diff --git a/tests/check_blas.c b/tests/check_blas.c index 99098fc8d0..fca6b9f0f2 100644 --- a/tests/check_blas.c +++ b/tests/check_blas.c @@ -14,18 +14,100 @@ void teardown(void); #define ga_assert_ok(e) ck_assert_int_eq(e, GA_NO_ERROR) -START_TEST(test_gemmBatch_3d) { +static inline void ck_assert_fbuf_eq(const float *b, const float *r, + unsigned int n) { + unsigned int i; + for (i = 0; i < n; i++) { + ck_assert_msg(b[i] == r[i], "Difference at %u: %f != %f(ref)", i, b[i], r[i]); + } +} + +START_TEST(test_gemmBatch_3d_C) { GpuArray A; GpuArray B; GpuArray C; - size_t dims[3] = {32, 32, 32}; + size_t dims[3] = {2, 3, 3}; + float data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, + 1, 2, 3, 4, 5, 6, 7, 8, 9}; + const float res[] = {30, 36, 42, 66, 81, 96, 102, 126, 150, + 30, 36, 42, 66, 81, 96, 102, 126, 150}; ga_assert_ok(GpuArray_empty(&A, ctx, GA_FLOAT, 3, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&B, ctx, GA_FLOAT, 3, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&C, ctx, GA_FLOAT, 3, dims, GA_C_ORDER)); + ga_assert_ok(GpuArray_write(&A, data, sizeof(data))); + ga_assert_ok(GpuArray_write(&B, data, sizeof(data))); + ga_assert_ok(GpuArray_rgemmBatch_3d(cb_no_trans, cb_no_trans, 1, &A, &B, 0, &C, 1)); + + ga_assert_ok(GpuArray_read(data, sizeof(data), &C)); + + ck_assert_fbuf_eq(data, res, sizeof(res)/sizeof(float)); +} +END_TEST + +START_TEST(test_gemmBatch_3d_F) { + GpuArray A; + GpuArray B; + GpuArray C; + + size_t dims[3] = {2, 3, 3}; + float data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, + 1, 2, 3, 4, 5, 6, 7, 8, 9}; + const float res[] = {42, 78, 78, 60, 114, 
114, 51, 69, 96, + 66, 39, 111, 54, 54, 90, 78, 78, 132}; + + ga_assert_ok(GpuArray_empty(&A, ctx, GA_FLOAT, 3, dims, GA_F_ORDER)); + ga_assert_ok(GpuArray_empty(&B, ctx, GA_FLOAT, 3, dims, GA_F_ORDER)); + ga_assert_ok(GpuArray_empty(&C, ctx, GA_FLOAT, 3, dims, GA_C_ORDER)); + + ga_assert_ok(GpuArray_write(&A, data, sizeof(data))); + ga_assert_ok(GpuArray_write(&B, data, sizeof(data))); + + ga_assert_ok(GpuArray_rgemmBatch_3d(cb_no_trans, cb_no_trans, 1, &A, &B, 0, &C, 0)); + + ga_assert_ok(GpuArray_read(data, sizeof(data), &C)); + + ck_assert_fbuf_eq(data, res, sizeof(res)/sizeof(float)); +} +END_TEST + +START_TEST(test_gemmBatch_3d_S) { + GpuArray A; + GpuArray B; + GpuArray C; + ssize_t t; + + size_t dims[3] = {2, 3, 3}; + float data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, + 1, 2, 3, 4, 5, 6, 7, 8, 9}; + const float res[] = {14, 32, 50, 50, 122, 194, 32, 77, 122, + 26, 62, 98, 17, 53, 89, 44, 107, 170}; + + ga_assert_ok(GpuArray_empty(&A, ctx, GA_FLOAT, 3, dims, GA_F_ORDER)); + ga_assert_ok(GpuArray_empty(&B, ctx, GA_FLOAT, 3, dims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&C, ctx, GA_FLOAT, 3, dims, GA_C_ORDER)); + + ga_assert_ok(GpuArray_write(&A, data, sizeof(data))); + ga_assert_ok(GpuArray_write(&B, data, sizeof(data))); + + A.strides[0] = 8; + A.strides[1] = 24; + A.strides[2] = 4; + GpuArray_fix_flags(&A); + + t = B.strides[1]; + B.strides[1] = B.strides[2]; + B.strides[2] = t; + GpuArray_fix_flags(&B); + + ga_assert_ok(GpuArray_rgemmBatch_3d(cb_no_trans, cb_no_trans, 1, &A, &B, 0, &C, 1)); + + ga_assert_ok(GpuArray_read(data, sizeof(data), &C)); + + ck_assert_fbuf_eq(data, res, sizeof(res)/sizeof(float)); } END_TEST @@ -34,7 +116,9 @@ Suite *get_suite(void) { TCase *tc = tcase_create("all"); tcase_add_checked_fixture(tc, setup, teardown); tcase_set_timeout(tc, 16.0); - tcase_add_test(tc, test_gemmBatch_3d); + tcase_add_test(tc, test_gemmBatch_3d_C); + tcase_add_test(tc, test_gemmBatch_3d_F); + tcase_add_test(tc, test_gemmBatch_3d_S); suite_add_tcase(s, tc); 
return s; } From a708ed715ada7976d69ec44cf2ad4507b5e6f36b Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 4 Apr 2017 17:22:53 -0400 Subject: [PATCH 248/597] Add shitty error messages when the disk cache fails. They will be upgraded to better message with the PR about better error messages. --- src/gpuarray_buffer_cuda.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index a406a85852..bbef0ab2a0 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -224,14 +224,19 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { (cache_hash_fn)key_hash, (cache_freek_fn)key_free, (cache_freev_fn)strb_free); - if (mem_cache == NULL) + if (mem_cache == NULL) { + // TODO use better error messages when they are available. + fprintf(stderr, "Error initializing disk cache, disabling\n"); goto fail_disk_cache; + } res->disk_cache = cache_disk(cache_path, mem_cache, (kwrite_fn)key_write, (vwrite_fn)kernel_write, (kread_fn)key_read, (vread_fn)kernel_read); if (res->disk_cache == NULL) { + // TODO use better error messages when they are available. 
+ fprintf(stderr, "Error initializing disk cache, disabling\n"); cache_destroy(mem_cache); goto fail_disk_cache; } @@ -1121,26 +1126,38 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { if (err != GA_NO_ERROR) return err; if (ctx->disk_cache) { pk = calloc(sizeof(kernel_key), 1); - if (pk == NULL) + if (pk == NULL) { + // TODO use better error messages + fprintf(stderr, "Error adding kernel to disk cache\n"); return GA_NO_ERROR; + } memcpy(pk->bin_id, k.bin_id, 64); strb_appendb(&pk->src, src); if (strb_error(&pk->src)) { + // TODO use better error messages + fprintf(stderr, "Error adding kernel to disk cache\n"); key_free((cache_key_t)pk); return GA_NO_ERROR; } cbin = strb_alloc(bin->l); if (cbin == NULL) { + // TODO use better error messages + fprintf(stderr, "Error adding kernel to disk cache\n"); key_free((cache_key_t)pk); return GA_NO_ERROR; } strb_appendb(cbin, bin); if (strb_error(cbin)) { + // TODO use better error messages + fprintf(stderr, "Error adding kernel to disk cache\n"); key_free((cache_key_t)pk); strb_free(cbin); return GA_NO_ERROR; } - cache_add(ctx->disk_cache, pk, cbin); + if (cache_add(ctx->disk_cache, pk, cbin)) { + // TODO use better error messages + fprintf(stderr, "Error adding kernel to disk cache\n"); + } } return GA_NO_ERROR; From e1bd3c803c5ab9bd2bfcb1721bf6f741686e07f7 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 4 Apr 2017 18:20:51 -0400 Subject: [PATCH 249/597] Add additional info to the kernel cache key. 
--- src/gpuarray_buffer_cuda.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index bbef0ab2a0..894de1a8f3 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -50,10 +50,17 @@ static int detect_arch(const char *prefix, char *ret, CUresult *err); static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size); typedef struct _kernel_key { + uint8_t version; + uint8_t debug; + uint8_t major; + uint8_t minor; + uint32_t reserved; char bin_id[64]; strb src; } kernel_key; +#define KERNEL_KEY_MM (sizeof(kernel_key) - sizeof(strb)) + static void key_free(cache_key_t _k) { kernel_key *k = (kernel_key *)_k; strb_clear(&k->src); @@ -70,36 +77,36 @@ static uint32_t strb_hash(strb *k) { } static int key_eq(kernel_key *k1, kernel_key *k2) { - return (memcmp(k1->bin_id, k2->bin_id, 64) == 0 && + return (memcmp(k1, k2, KERNEL_KEY_MM) == 0 && strb_eq(&k1->src, &k2->src)); } static int key_hash(kernel_key *k) { XXH32_state_t state; XXH32_reset(&state, 42); - XXH32_update(&state, k->bin_id, 64); + XXH32_update(&state, k, KERNEL_KEY_MM); XXH32_update(&state, k->src.s, k->src.l); return XXH32_digest(&state); } static int key_write(strb *res, kernel_key *k) { - strb_appendn(res, k->bin_id, 64); + strb_appendn(res, (const char *)k, KERNEL_KEY_MM); strb_appendb(res, &k->src); return strb_error(res); } static kernel_key *key_read(const strb *b) { kernel_key *k; - if (b->l < 64) return NULL; + if (b->l < KERNEL_KEY_MM) return NULL; k = calloc(1, sizeof(*k)); if (k == NULL) return NULL; - if (strb_ensure(&k->src, b->l - 64) != 0) { + if (strb_ensure(&k->src, b->l - KERNEL_KEY_MM) != 0) { strb_clear(&k->src); free(k); return NULL; } - memcpy(k->bin_id, b->s, 64); - strb_appendn(&k->src, b->s+64, b->l-64); + memcpy(k->bin_id, b->s, KERNEL_KEY_MM); + strb_appendn(&k->src, b->s + KERNEL_KEY_MM, b->l - KERNEL_KEY_MM); return k; } @@ -1108,6 +1115,13 @@ 
static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { kernel_key *pk; int err; + memset(&k, 0, sizeof(k)); + k.version = 0; +#ifdef DEBUG + k.debug = 1; +#endif + k.major = ctx->major; + k.minor = ctx->minor; memcpy(k.bin_id, ctx->bin_id, 64); memcpy(&k.src, src, sizeof(strb)); @@ -1131,7 +1145,7 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { fprintf(stderr, "Error adding kernel to disk cache\n"); return GA_NO_ERROR; } - memcpy(pk->bin_id, k.bin_id, 64); + memcpy(pk, &k, KERNEL_KEY_MM); strb_appendb(&pk->src, src); if (strb_error(&pk->src)) { // TODO use better error messages From 9306b9692904fa0481aa757e0cc177b09dd89701 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 5 Apr 2017 13:58:30 -0400 Subject: [PATCH 250/597] Fix bug in key_read and explain KERNEL_KEY_MM. --- src/gpuarray_buffer_cuda.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 894de1a8f3..47bc7ac526 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -59,6 +59,7 @@ typedef struct _kernel_key { strb src; } kernel_key; +/* Size of the kernel_key that we can memcopy to duplicate */ #define KERNEL_KEY_MM (sizeof(kernel_key) - sizeof(strb)) static void key_free(cache_key_t _k) { @@ -100,12 +101,16 @@ static kernel_key *key_read(const strb *b) { if (b->l < KERNEL_KEY_MM) return NULL; k = calloc(1, sizeof(*k)); if (k == NULL) return NULL; + memcpy(k, b->s, KERNEL_KEY_MM); + if (k->version != 0) { + free(k); + return NULL; + } if (strb_ensure(&k->src, b->l - KERNEL_KEY_MM) != 0) { strb_clear(&k->src); free(k); return NULL; } - memcpy(k->bin_id, b->s, KERNEL_KEY_MM); strb_appendn(&k->src, b->s + KERNEL_KEY_MM, b->l - KERNEL_KEY_MM); return k; } From 90120afd9b1506f3c54412879341ff58f4ac6e68 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 6 Apr 2017 14:41:18 -0400 Subject: [PATCH 251/597] Changes for release 0.6.3 --- doc/conf.py | 2 +- 
release.txt | 8 ++++++++ setup.py | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) create mode 100644 release.txt diff --git a/doc/conf.py b/doc/conf.py index 0b658a26dc..00083fdc09 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -51,7 +51,7 @@ # The short X.Y version. version = '0.6' # The full version, including alpha/beta/rc tags. -release = '0.6.2' +release = '0.6.3' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/release.txt b/release.txt new file mode 100644 index 0000000000..7daf509762 --- /dev/null +++ b/release.txt @@ -0,0 +1,8 @@ +Release process: +- Make sure you are on the proper release branch +- Update the version in setup.py +- Update the version in doc/conf.py +- Commit the changes with message "Changes for release X.Y.Z" +- Push to master +- Add a release on github with a tag in the form of 'vX.Y.X' + - Make note of the major changes since the last release diff --git a/setup.py b/setup.py index b2f6d22b90..20f0c95921 100755 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ MAJOR = 0 MINOR = 6 PATCH = 3 -SUFFIX = '.dev0' # include the '.' +SUFFIX = '' # include the '.' FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX) try: From 37b9d94db67b1b5ed14088ce6ed6049c694d3ac6 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 27 Jan 2017 13:19:44 -0500 Subject: [PATCH 252/597] Add an error struct to pass around error strings. 
--- src/gpuarray_buffer_collectives.c | 2 +- src/gpuarray_buffer_cuda.c | 7 ++++++ src/gpuarray_collectives_cuda_nccl.c | 33 ++++++++++++-------------- src/private.h | 3 ++- src/util/CMakeLists.txt | 1 + src/util/error.c | 35 ++++++++++++++++++++++++++++ src/util/error.h | 19 +++++++++++++++ 7 files changed, 80 insertions(+), 20 deletions(-) create mode 100644 src/util/error.c create mode 100644 src/util/error.h diff --git a/src/gpuarray_buffer_collectives.c b/src/gpuarray_buffer_collectives.c index 3bb423307f..8f33d70472 100644 --- a/src/gpuarray_buffer_collectives.c +++ b/src/gpuarray_buffer_collectives.c @@ -23,7 +23,7 @@ void gpucomm_free(gpucomm* comm) { const char* gpucomm_error(gpucontext* ctx) { if (ctx->comm_ops != NULL) - return ctx->error_msg; + return ctx->error->msg; return "No collective ops available, API error. Is a collectives library " "installed?"; } diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 47bc7ac526..3268c15528 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -201,6 +201,10 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { res->major = major; res->minor = minor; res->freeblocks = NULL; + if (error_alloc(&res->msg)) { + error_sets(global_ctx, GA_SYS_ERROR, "Could not create error context"); + goto fail_errmsg; + } if (detect_arch(ARCH_PREFIX, res->bin_id, &err)) { goto fail_stream; } @@ -283,6 +287,8 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { fail_mem_stream: cuStreamDestroy(res->s); fail_stream: + error_free(res->msg); + fail_errmsg: free(res); return NULL; } @@ -319,6 +325,7 @@ static void cuda_free_ctx(cuda_context *ctx) { cache_destroy(ctx->kernel_cache); if (ctx->disk_cache) cache_destroy(ctx->disk_cache); + error_free(ctx->msg); if (!(ctx->flags & DONTFREE)) { cuCtxPushCurrent(ctx->ctx); diff --git a/src/gpuarray_collectives_cuda_nccl.c b/src/gpuarray_collectives_cuda_nccl.c index e382cfa066..2713dd1a75 100644 --- a/src/gpuarray_collectives_cuda_nccl.c 
+++ b/src/gpuarray_collectives_cuda_nccl.c @@ -16,14 +16,13 @@ * Execute `cmd` and return appropriate code. Save a describing error message in * context. */ -#define NCCL_CHKFAIL(ctx, cmd) \ - do { \ - ncclResult_t nccl_err = (cmd); \ - if (nccl_err != ncclSuccess) { \ - (ctx)->error_msg = ncclGetErrorString(nccl_err); \ - return GA_COMM_ERROR; \ - } \ - return GA_NO_ERROR; \ +#define NCCL_CHKFAIL(ctx, cmd) \ + do { \ + ncclResult_t nccl_err = (cmd); \ + if (nccl_err != ncclSuccess) { \ + return error_sets((ctx)->msg, GA_COMM_ERROR, ncclGetErrorString(nccl_err)); \ + } \ + return GA_NO_ERROR; \ } while (0) /** @@ -31,14 +30,13 @@ * context. Exit from context and return \ref GA_COMM_ERROR if nccl does not * succeed. */ -#define NCCL_EXIT_ON_ERROR(ctx, cmd) \ - do { \ - ncclResult_t nccl_err = (cmd); \ - if (nccl_err != ncclSuccess) { \ - cuda_exit((ctx)); \ - (ctx)->error_msg = ncclGetErrorString(nccl_err); \ - return GA_COMM_ERROR; \ - } \ +#define NCCL_EXIT_ON_ERROR(ctx, cmd) \ + do { \ + ncclResult_t nccl_err = (cmd); \ + if (nccl_err != ncclSuccess) { \ + cuda_exit((ctx)); \ + return error_sets((ctx)->msg, GA_COMM_ERROR, ncclGetErrorString(nccl_err)); \ + } \ } while (0) //!< Link wrapped cuda core operations @@ -108,8 +106,7 @@ static int comm_new(gpucomm** comm_ptr, gpucontext* ctx, if (nccl_err != ncclSuccess) { *comm_ptr = NULL; // Set to NULL if failed comm_clear(comm); - ctx->error_msg = ncclGetErrorString(nccl_err); - return GA_COMM_ERROR; + return error_sets(ctx->msg, GA_COMM_ERROR, ncclGetErrorString(nccl_err)); } *comm_ptr = comm; return GA_NO_ERROR; diff --git a/src/private.h b/src/private.h index 820ebb6287..e569118b09 100644 --- a/src/private.h +++ b/src/private.h @@ -17,6 +17,7 @@ #include #include "util/strb.h" +#include "util/error.h" #include "cache.h" #ifdef __cplusplus @@ -44,7 +45,7 @@ typedef struct _gpuarray_comm_ops gpuarray_comm_ops; const gpuarray_blas_ops *blas_ops; \ const gpuarray_comm_ops *comm_ops; \ void *blas_handle; \ - const 
char* error_msg; \ + error* msg; \ unsigned int refcnt; \ int flags; \ struct _gpudata *errbuf; \ diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 5c21cc3e90..de4987538f 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -1,5 +1,6 @@ set_rel(UTIL_SRC strb.c +error.c xxhash.c integerfactoring.c skein.c diff --git a/src/util/error.c b/src/util/error.c new file mode 100644 index 0000000000..99bdaf5830 --- /dev/null +++ b/src/util/error.c @@ -0,0 +1,35 @@ +#include +#include + +#include "error.h" + +static error _global_ctx = {}; +error *global_ctx = &_global_ctx; + +int error_alloc(error **_ctx) { + error *ctx; + ctx = calloc(sizeof(error), 1); + if (ctx == NULL) return -1; + *_ctx = ctx; + return 0; +} + +void error_free(error *ctx) { + free(ctx); +} + +int error_setall(error *ctx, int code, const char *msg) { + ctx->code = code; + strlcpy(ctx->msg, msg, MSGBUF_LEN); + return code; +} + +int error_fmt(error *ctx, int code, const char *fmt, ...) { + va_arg ap; + + ctx->code = code; + va_start(ap, fmt); + vsnprintf(ctx->msg, MSGBUF_LEN, fmt, ap); + va_end(ap); + return code; +} diff --git a/src/util/error.h b/src/util/error.h new file mode 100644 index 0000000000..b2d3c98f15 --- /dev/null +++ b/src/util/error.h @@ -0,0 +1,19 @@ +#ifndef UTIL_ERROR_H +#define UTIL_ERROR_H + +/* 1024 - 4 for the int that goes after */ +#define ERROR_MSGBUF_LEN 1020 + +typedef struct _error { + char msg[MSGBUF_LEN]; + int code; +} error; + +int error_alloc(error **ctx); +void error_free(error *ctx); +int error_set(error *ctx, int code, const char *msg); +int error_fmt(error *ctx, int code, const char *fmt, ...); + +extern error *global_ctx; + +#endif From 59a389a969abb12ad3c6ac96ea5e436e1d602612 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 22 Mar 2017 12:36:04 -0400 Subject: [PATCH 253/597] Start adding error messages in the cuda backend. 
--- src/gpuarray_buffer_cuda.c | 17 +++++++++++------ src/loaders/dyn_load.c | 38 ++++++++++++++++++++++++++------------ src/loaders/libcuda.c | 21 +++++++++++---------- src/loaders/libcuda.h | 4 +++- src/util/error.c | 30 +++++++++++++++--------------- src/util/error.h | 10 +++++----- 6 files changed, 71 insertions(+), 49 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 3268c15528..8cbdfa14f8 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -132,19 +132,24 @@ static int major = -1; static int minor = -1; static int setup_lib(void) { int res, tmp; + const char *name, *msg; const char *ver; + if (!setup_done) { - res = load_libcuda(); + res = load_libcuda(global_err); if (res != GA_NO_ERROR) return res; err = cuInit(0); - if (err != CUDA_SUCCESS) - return GA_IMPL_ERROR; + if (err != CUDA_SUCCESS) { + cuGetErrorName(err, *name); + cuGetErrorString(err, *msg); + return error_fmt(global_err, GA_IMPL_ERROR, "cuInit: %s: %s", name, msg); + } ver = getenv("GPUARRAY_CUDA_VERSION"); if (ver == NULL || strlen(ver) != 2) { err = cuDriverGetVersion(&tmp); if (err != CUDA_SUCCESS) - return GA_IMPL_ERROR; + return error_set(global_err, GA_IMPL_ERROR, "cuDriverGetVersion failed"); major = tmp / 1000; minor = (tmp / 10) % 10; } else { @@ -152,7 +157,7 @@ static int setup_lib(void) { minor = ver[1] - '0'; } if (major > 9 || major < 0 || minor > 9 || minor < 0) - return GA_VALUE_ERROR; + return error_fmt(global_err, GA_VALUE_ERROR, "Invalid cuda version: %d.%d", major, minor); res = load_libnvrtc(major, minor); if (res != GA_NO_ERROR) return res; @@ -202,7 +207,7 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { res->minor = minor; res->freeblocks = NULL; if (error_alloc(&res->msg)) { - error_sets(global_ctx, GA_SYS_ERROR, "Could not create error context"); + error_sets(global_err, GA_SYS_ERROR, "Could not create error context"); goto fail_errmsg; } if (detect_arch(ARCH_PREFIX, res->bin_id, &err)) { diff 
--git a/src/loaders/dyn_load.c b/src/loaders/dyn_load.c index a532a47422..a8fbd22ea1 100644 --- a/src/loaders/dyn_load.c +++ b/src/loaders/dyn_load.c @@ -1,4 +1,5 @@ #include "dyn_load.h" +#include "util/error.h" #if defined(__unix__) || defined(__APPLE__) @@ -8,21 +9,17 @@ #include #include -void *ga_load_library(const char *name) { +void *ga_load_library(const char *name, error *e) { void *res = dlopen(name, RTLD_LAZY|RTLD_LOCAL); -#ifdef DEBUG if (res == NULL) - warn("dlopen: %s", name); -#endif + error_fmt(e, GA_LOAD_ERROR, "Could not load \"%s\": %s", name, dlerror()); return res; } void *ga_func_ptr(void *h, const char *name) { void *res = dlsym(h, name); -#ifdef DEBUG if (res == NULL) - warn("dlsym: %s", name); -#endif + error_fmt(e, GA_LOAD_ERROR, "Could not find synbol \"%s\": %s", name, dlerror()); return res; } @@ -30,14 +27,31 @@ void *ga_func_ptr(void *h, const char *name) { /* Should be windows */ #include -#pragma comment(lib,"Version.lib") -void *ga_load_library(const char *name) { - return LoadLibrary(name); +static inline void error_win(error *e) { + char msgbuf[512]; + DWORD err = GetLastError(); + DWORD len = FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM| + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, 0, msgbuf, 512, NULL); + if (len == 0) + error_fmt(e, GA_LOAD_ERROR, "Could not load \"%s\": error code %X", name, err); + else + error_fmt(e, GA_LOAD_ERROR, "Could not load \"%s\": %s", name, msgbuf); } -void *ga_func_ptr(void *h, const char *name) { - return (void *)GetProcAddress(h, name); +void *ga_load_library(const char *name, error *e) { + void *res = LoadLibrary(name); + if (res == NULL) + error_win(e); + return res; +} + +void *ga_func_ptr(void *h, const char *name, error *e) { + void *res = (void *)GetProcAddress(h, name); + if (res == NULL) + error_win(e); + return res; } #endif diff --git a/src/loaders/libcuda.c b/src/loaders/libcuda.c index 47d0806e0a..9f49ad9bc0 100644 --- a/src/loaders/libcuda.c +++ b/src/loaders/libcuda.c @@ -4,6 +4,7 @@ 
#include "libcuda.h" #include "dyn_load.h" #include "gpuarray/error.h" +#include "util/error.h" /* This code is inspired from the dynamic loading code in the samples */ #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) @@ -26,21 +27,21 @@ static char libname[] = "libcuda.so"; #define STRINGIFY(X) #X -#define DEF_PROC(name, args) \ - name = (t##name *)ga_func_ptr(lib, #name); \ - if (name == NULL) { \ - return GA_LOAD_ERROR; \ +#define DEF_PROC(name, args) \ + name = (t##name *)ga_func_ptr(lib, #name, e); \ + if (name == NULL) { \ + return e->code; \ } -#define DEF_PROC_V2(name, args) \ - name = (t##name *)ga_func_ptr(lib, STRINGIFY(name##_v2)); \ - if (name == NULL) { \ - return GA_LOAD_ERROR; \ +#define DEF_PROC_V2(name, args) \ + name = (t##name *)ga_func_ptr(lib, STRINGIFY(name##_v2), e); \ + if (name == NULL) { \ + return e->code; \ } static int loaded = 0; -int load_libcuda(void) { +int load_libcuda(error *e) { void *lib; if (loaded) @@ -48,7 +49,7 @@ int load_libcuda(void) { lib = ga_load_library(libname); if (lib == NULL) - return GA_LOAD_ERROR; + return error_fmt(e, GA_LOAD_ERROR, "Could not load \"%s\"", libname); #include "libcuda.fn" diff --git a/src/loaders/libcuda.h b/src/loaders/libcuda.h index 3a6bf35a8a..54b0f51c61 100644 --- a/src/loaders/libcuda.h +++ b/src/loaders/libcuda.h @@ -1,6 +1,8 @@ #ifndef LOADER_LIBCUDA_H #define LOADER_LIBCUDA_H +#include "util/error.h" + #ifdef _WIN32 #define CUDAAPI __stdcall #else @@ -39,7 +41,7 @@ typedef struct CUipcMemHandle_st { char reserved[CU_IPC_HANDLE_SIZE]; } CUipcMemHandle; -int load_libcuda(void); +int load_libcuda(error *); #define DEF_PROC(name, args) typedef CUresult CUDAAPI t##name args #define DEF_PROC_V2(name, args) DEF_PROC(name, args) diff --git a/src/util/error.c b/src/util/error.c index 99bdaf5830..d1682af855 100644 --- a/src/util/error.c +++ b/src/util/error.c @@ -3,33 +3,33 @@ #include "error.h" -static error _global_ctx = {}; -error *global_ctx = &_global_ctx; +static 
error _global_err = {}; +error *global_err = &_global_err; -int error_alloc(error **_ctx) { - error *ctx; - ctx = calloc(sizeof(error), 1); - if (ctx == NULL) return -1; - *_ctx = ctx; +int error_alloc(error **_e) { + error *e; + e = calloc(sizeof(error), 1); + if (e == NULL) return -1; + *_e = e; return 0; } -void error_free(error *ctx) { - free(ctx); +void error_free(error *e) { + free(e); } -int error_setall(error *ctx, int code, const char *msg) { - ctx->code = code; - strlcpy(ctx->msg, msg, MSGBUF_LEN); +int error_setall(error *e, int code, const char *msg) { + e->code = code; + strlcpy(e->msg, msg, MSGBUF_LEN); return code; } -int error_fmt(error *ctx, int code, const char *fmt, ...) { +int error_fmt(error *e, int code, const char *fmt, ...) { va_arg ap; - ctx->code = code; + e->code = code; va_start(ap, fmt); - vsnprintf(ctx->msg, MSGBUF_LEN, fmt, ap); + vsnprintf(e->msg, MSGBUF_LEN, fmt, ap); va_end(ap); return code; } diff --git a/src/util/error.h b/src/util/error.h index b2d3c98f15..a1dda1f345 100644 --- a/src/util/error.h +++ b/src/util/error.h @@ -9,11 +9,11 @@ typedef struct _error { int code; } error; -int error_alloc(error **ctx); -void error_free(error *ctx); -int error_set(error *ctx, int code, const char *msg); -int error_fmt(error *ctx, int code, const char *fmt, ...); +int error_alloc(error **e); +void error_free(error *e); +int error_set(error *e, int code, const char *msg); +int error_fmt(error *e, int code, const char *fmt, ...); -extern error *global_ctx; +extern error *global_err; #endif From 48d5384bf6a8d38f284316b828842a378e44d9fd Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 22 Mar 2017 12:55:55 -0400 Subject: [PATCH 254/597] Add error messages to all the loaders. 
--- src/gpuarray_buffer_cuda.c | 6 +++--- src/loaders/libclblas.c | 14 +++++++------- src/loaders/libclblas.h | 3 ++- src/loaders/libclblast.c | 14 +++++++------- src/loaders/libclblast.h | 3 ++- src/loaders/libcublas.c | 26 +++++++++++++------------- src/loaders/libcublas.h | 5 +++-- src/loaders/libcuda.fn | 1 + src/loaders/libnccl.c | 19 ++++++++++--------- src/loaders/libnccl.h | 4 +++- src/loaders/libnvrtc.c | 18 +++++++++--------- src/loaders/libnvrtc.h | 4 +++- src/loaders/libopencl.c | 14 +++++++------- src/loaders/libopencl.h | 4 +++- src/private.h | 2 +- 15 files changed, 74 insertions(+), 63 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 8cbdfa14f8..eec330df39 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -158,7 +158,7 @@ static int setup_lib(void) { } if (major > 9 || major < 0 || minor > 9 || minor < 0) return error_fmt(global_err, GA_VALUE_ERROR, "Invalid cuda version: %d.%d", major, minor); - res = load_libnvrtc(major, minor); + res = load_libnvrtc(major, minor, global_err); if (res != GA_NO_ERROR) return res; setup_done = 1; @@ -1833,9 +1833,9 @@ static const char *cuda_error(gpucontext *c) { cuda_context *ctx = (cuda_context *)c; const char *errstr = NULL; if (ctx == NULL) - cuGetErrorString(err, &errstr); + return global_err->msg; else - cuGetErrorString(ctx->err, &errstr); + return ctx->msg->msg; return errstr; } diff --git a/src/loaders/libclblas.c b/src/loaders/libclblas.c index cdb17fa39b..4acaeb8120 100644 --- a/src/loaders/libclblas.c +++ b/src/loaders/libclblas.c @@ -20,23 +20,23 @@ static const char libname[] = "libclBLAS.so"; #undef DEF_PROC -#define DEF_PROC(ret, name, args) \ - name = (t##name *)ga_func_ptr(lib, #name); \ - if (name == NULL) { \ - return GA_LOAD_ERROR; \ +#define DEF_PROC(ret, name, args) \ + name = (t##name *)ga_func_ptr(lib, #name, e); \ + if (name == NULL) { \ + return e->code; \ } static int loaded = 0; -int load_libclblas(void) { +int 
load_libclblas(error *e) { void *lib; if (loaded) return GA_NO_ERROR; - lib = ga_load_library(libname); + lib = ga_load_library(libname, e); if (lib == NULL) - return GA_LOAD_ERROR; + return e->code; #include "libclblas.fn" diff --git a/src/loaders/libclblas.h b/src/loaders/libclblas.h index 44a7cb7911..483f1fa87b 100644 --- a/src/loaders/libclblas.h +++ b/src/loaders/libclblas.h @@ -1,6 +1,7 @@ #ifndef LOADER_LIBCLBLAS_H #define LOADER_LIBCLBLAS_H +#include "util/error.h" #include "libopencl.h" typedef enum clblasOrder_ { @@ -19,7 +20,7 @@ typedef enum clblasStatus_ { /* Rest is not exposed from here */ } clblasStatus; -int load_libclblas(void); +int load_libclblas(error *); #define DEF_PROC(ret, name, args) typedef ret t##name args diff --git a/src/loaders/libclblast.c b/src/loaders/libclblast.c index 1bb4cc9999..759b9cd476 100644 --- a/src/loaders/libclblast.c +++ b/src/loaders/libclblast.c @@ -20,23 +20,23 @@ static const char libname[] = "libclblast.so"; #undef DEF_PROC -#define DEF_PROC(ret, name, args) \ - name = (t##name *)ga_func_ptr(lib, #name); \ - if (name == NULL) { \ - return GA_LOAD_ERROR; \ +#define DEF_PROC(ret, name, args) \ + name = (t##name *)ga_func_ptr(lib, #name, e); \ + if (name == NULL) { \ + return e->code; \ } static int loaded = 0; -int load_libclblast(void) { +int load_libclblast(error *e) { void *lib; if (loaded) return GA_NO_ERROR; - lib = ga_load_library(libname); + lib = ga_load_library(libname, e); if (lib == NULL) - return GA_LOAD_ERROR; + return e->code; #include "libclblast.fn" diff --git a/src/loaders/libclblast.h b/src/loaders/libclblast.h index a507b37d2d..9b60d1e6a8 100644 --- a/src/loaders/libclblast.h +++ b/src/loaders/libclblast.h @@ -1,6 +1,7 @@ #ifndef LOADER_LIBCLBLAST_H #define LOADER_LIBCLBLAST_H +#include "util/error.h" #include "libopencl.h" typedef enum Layout_ { @@ -19,7 +20,7 @@ typedef enum StatusCode_ { /* Rest is not exposed from here */ } StatusCode; -int load_libclblast(void); +int load_libclblast(error *); 
#define DEF_PROC(ret, name, args) typedef ret t##name args diff --git a/src/loaders/libcublas.c b/src/loaders/libcublas.c index acc425479d..dc4b253ea0 100644 --- a/src/loaders/libcublas.c +++ b/src/loaders/libcublas.c @@ -16,24 +16,24 @@ #define STRINGIFY(X) #X -#define DEF_PROC(name, args) \ - name = (t##name *)ga_func_ptr(lib, #name); \ - if (name == NULL) { \ - return GA_LOAD_ERROR; \ +#define DEF_PROC(name, args) \ + name = (t##name *)ga_func_ptr(lib, #name, e); \ + if (name == NULL) { \ + return e->code; \ } #define DEF_PROC_OPT(name, args) \ name = (t##name *)ga_func_ptr(lib, #name); -#define DEF_PROC_V2(name, args) \ - name = (t##name *)ga_func_ptr(lib, STRINGIFY(name##_v2)); \ - if (name == NULL) { \ - return GA_LOAD_ERROR; \ +#define DEF_PROC_V2(name, args) \ + name = (t##name *)ga_func_ptr(lib, STRINGIFY(name##_v2), e); \ + if (name == NULL) { \ + return e->code; \ } static int loaded = 0; -int load_libcublas(int major, int minor) { +int load_libcublas(int major, int minor, error *e) { void *lib; if (loaded) @@ -47,7 +47,7 @@ int load_libcublas(int major, int minor) { libname[9] = DIGITS[major]; libname[10] = DIGITS[minor]; - lib = ga_load_library(libname); + lib = ga_load_library(libname, e); } #else /* Unix */ #ifdef __APPLE__ @@ -56,14 +56,14 @@ int load_libcublas(int major, int minor) { char libname[] = "/Developer/NVIDIA/CUDA-?.?/lib/libcublas.dylib"; libname[23] = DIGITS[major]; libname[25] = DIGITS[minor]; - lib = ga_load_library(libname); + lib = ga_load_library(libname, e); } #else - lib = ga_load_library("libcublas.so"); + lib = ga_load_library("libcublas.so", e); #endif #endif if (lib == NULL) - return GA_LOAD_ERROR; + return e->code; #include "libcublas.fn" diff --git a/src/loaders/libcublas.h b/src/loaders/libcublas.h index 71955e3797..83a6c8030f 100644 --- a/src/loaders/libcublas.h +++ b/src/loaders/libcublas.h @@ -1,6 +1,8 @@ #ifndef LOADER_LIBCUBLAS_H #define LOADER_LIBCUBLAS_H +#include "util/error.h" + #ifdef _WIN32 #define CUBLASWINAPI 
__stdcall #else @@ -58,8 +60,7 @@ typedef enum { typedef struct cublasContext *cublasHandle_t; - -int load_libcublas(int major, int minor); +int load_libcublas(int major, int minor, error *e); #define DEF_PROC(name, args) typedef cublasStatus_t CUBLASWINAPI t##name args #define DEF_PROC_V2(name, args) DEF_PROC(name, args) diff --git a/src/loaders/libcuda.fn b/src/loaders/libcuda.fn index 5bfc890fba..b65cde22b5 100644 --- a/src/loaders/libcuda.fn +++ b/src/loaders/libcuda.fn @@ -1,5 +1,6 @@ DEF_PROC(cuInit, (int flags)); DEF_PROC(cuDriverGetVersion, (int *driverVersion)); +DEF_PROC(cuGetErrorName, (CUresult error, const char **pStr)); DEF_PROC(cuGetErrorString, (CUresult error, const char **pStr)); DEF_PROC(cuDeviceGet, (CUdevice *device, int ordinal)); diff --git a/src/loaders/libnccl.c b/src/loaders/libnccl.c index 341be93451..4ef247d117 100644 --- a/src/loaders/libnccl.c +++ b/src/loaders/libnccl.c @@ -13,29 +13,30 @@ #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) || defined(__APPLE__) /* As far as we know, nccl is not available or buildable on platforms other than linux */ -int load_libnccl(void) { - return GA_UNSUPPORTED_ERROR; +int load_libnccl(error *e) { + return error_set(e, GA_UNSUPPORTED_ERROR, + "NCCL is not available on plaforms other than linux."); } #else /* Unix */ static const char libname[] = "libnccl.so"; -#define DEF_PROC(ret, name, args) \ - name = (t##name *)ga_func_ptr(lib, #name); \ - if (name == NULL) { \ - return GA_LOAD_ERROR; \ +#define DEF_PROC(ret, name, args) \ + name = (t##name *)ga_func_ptr(lib, #name, e); \ + if (name == NULL) { \ + return e->code; \ } static int loaded = 0; -int load_libnccl(void) { +int load_libnccl(error *e) { void *lib; if (loaded) return GA_NO_ERROR; - lib = ga_load_library(libname); + lib = ga_load_library(libname, e); if (lib == NULL) - return GA_LOAD_ERROR; + return e->code; #include "libnccl.fn" diff --git a/src/loaders/libnccl.h b/src/loaders/libnccl.h index 
d8aac387e3..8efb694b42 100644 --- a/src/loaders/libnccl.h +++ b/src/loaders/libnccl.h @@ -1,6 +1,8 @@ #ifndef LOADER_LIBNCCL_H #define LOADER_LIBNCCL_H +#include "util/error.h" + typedef struct CUstream_st *cudaStream_t; typedef struct ncclComm* ncclComm_t; @@ -25,7 +27,7 @@ typedef enum { ncclChar = 0, ncclUint64 = 6, nccl_NUM_TYPES = 7 } ncclDataType_t; -int load_libnccl(void); +int load_libnccl(error *e); #define DEF_PROC(ret, name, args) typedef ret t##name args diff --git a/src/loaders/libnvrtc.c b/src/loaders/libnvrtc.c index 5d967f8a98..bd84aadedf 100644 --- a/src/loaders/libnvrtc.c +++ b/src/loaders/libnvrtc.c @@ -11,15 +11,15 @@ #undef DEF_PROC -#define DEF_PROC(name, args) \ - name = (t##name *)ga_func_ptr(lib, #name); \ - if (name == NULL) { \ - return GA_LOAD_ERROR; \ +#define DEF_PROC(name, args) \ + name = (t##name *)ga_func_ptr(lib, #name, e); \ + if (name == NULL) { \ + return e->code; \ } static int loaded = 0; -int load_libnvrtc(int major, int minor) { +int load_libnvrtc(int major, int minor, error *e) { void *lib; if (loaded) @@ -33,7 +33,7 @@ int load_libnvrtc(int major, int minor) { libname[8] = DIGITS[major]; libname[9] = DIGITS[minor]; - lib = ga_load_library(libname); + lib = ga_load_library(libname, e); } #else /* Unix */ #ifdef __APPLE__ @@ -43,14 +43,14 @@ int load_libnvrtc(int major, int minor) { char libname[] = "/Developer/NVIDIA/CUDA-?.?/lib/libnvrtc.dylib"; libname[23] = DIGITS[major]; libname[25] = DIGITS[minor]; - lib = ga_load_library(libname); + lib = ga_load_library(libname, e); } #else - lib = ga_load_library("libnvrtc.so"); + lib = ga_load_library("libnvrtc.so", e); #endif #endif if (lib == NULL) - return GA_LOAD_ERROR; + return e->code; #include "libnvrtc.fn" diff --git a/src/loaders/libnvrtc.h b/src/loaders/libnvrtc.h index e06aa45042..2395a677ea 100644 --- a/src/loaders/libnvrtc.h +++ b/src/loaders/libnvrtc.h @@ -1,13 +1,15 @@ #ifndef LOADER_LIBNVRTC_H #define LOADER_LIBNVRTC_H +#include "util/error.h" + typedef enum { 
NVRTC_SUCCESS = 0, } nvrtcResult; typedef struct _nvrtcProgram *nvrtcProgram; -int load_libnvrtc(int major, int minor); +int load_libnvrtc(int major, int minor, error *e); #define DEF_PROC(name, args) typedef nvrtcResult t##name args diff --git a/src/loaders/libopencl.c b/src/loaders/libopencl.c index 1994fc38aa..7c38abee32 100644 --- a/src/loaders/libopencl.c +++ b/src/loaders/libopencl.c @@ -20,23 +20,23 @@ static char libname[] = "libOpenCL.so"; #undef DEF_PROC -#define DEF_PROC(ret, name, args) \ - name = (t##name *)ga_func_ptr(lib, #name); \ - if (name == NULL) { \ - return GA_LOAD_ERROR; \ +#define DEF_PROC(ret, name, args) \ + name = (t##name *)ga_func_ptr(lib, #name, e); \ + if (name == NULL) { \ + return e->code; \ } static int loaded = 0; -int load_libopencl(void) { +int load_libopencl(error *e) { void *lib; if (loaded) return GA_NO_ERROR; - lib = ga_load_library(libname); + lib = ga_load_library(libname, e); if (lib == NULL) - return GA_LOAD_ERROR; + return e->code; #include "libopencl.fn" diff --git a/src/loaders/libopencl.h b/src/loaders/libopencl.h index fd2a04564e..f2a5727cf6 100644 --- a/src/loaders/libopencl.h +++ b/src/loaders/libopencl.h @@ -1,6 +1,8 @@ #ifndef LOADER_LIBOPENCL_H #define LOADER_LIBOPENCL_H +#include "util/error.h" + #if defined(_WIN32) #define CL_API_CALL __stdcall #define CL_CALLBACK __stdcall @@ -54,7 +56,7 @@ typedef cl_uint cl_program_build_info; typedef cl_uint cl_kernel_info; typedef cl_uint cl_kernel_work_group_info; -int load_libopencl(void); +int load_libopencl(error *); #define DEF_PROC(ret, name, args) typedef ret CL_API_CALL t##name args diff --git a/src/private.h b/src/private.h index e569118b09..b1bd45f779 100644 --- a/src/private.h +++ b/src/private.h @@ -45,7 +45,7 @@ typedef struct _gpuarray_comm_ops gpuarray_comm_ops; const gpuarray_blas_ops *blas_ops; \ const gpuarray_comm_ops *comm_ops; \ void *blas_handle; \ - error* msg; \ + error *msg; \ unsigned int refcnt; \ int flags; \ struct _gpudata *errbuf; \ From 
3f1e06d8146b290b494e73fe952bced936fab171 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 22 Mar 2017 18:26:33 -0400 Subject: [PATCH 255/597] Finish the error messages for the cuda backend. --- src/cache.h | 7 +- src/cache/lru.c | 13 +- src/cache/twoq.c | 18 +- src/gpuarray_buffer.c | 17 +- src/gpuarray_buffer_cuda.c | 630 ++++++++++++++++--------------------- src/private.h | 8 +- src/private_cuda.h | 22 +- 7 files changed, 329 insertions(+), 386 deletions(-) diff --git a/src/cache.h b/src/cache.h index f2e610f3dd..e291aca67b 100644 --- a/src/cache.h +++ b/src/cache.h @@ -5,6 +5,7 @@ #include #include "private_config.h" #include "util/strb.h" +#include "util/error.h" typedef void *cache_key_t; typedef void *cache_value_t; @@ -77,12 +78,14 @@ struct _cache { cache *cache_lru(size_t max_size, size_t elasticity, cache_eq_fn keq, cache_hash_fn khash, - cache_freek_fn kfree, cache_freev_fn vfree); + cache_freek_fn kfree, cache_freev_fn vfree, + error *e); cache *cache_twoq(size_t hot_size, size_t warm_size, size_t cold_size, size_t elasticity, cache_eq_fn keq, cache_hash_fn khash, - cache_freek_fn kfree, cache_freev_fn vfree); + cache_freek_fn kfree, cache_freev_fn vfree, + error *e); cache *cache_disk(const char *dirpath, cache *mem, kwrite_fn kwrite, vwrite_fn vwrite, diff --git a/src/cache/lru.c b/src/cache/lru.c index 597bbea999..cf1b9555db 100644 --- a/src/cache/lru.c +++ b/src/cache/lru.c @@ -127,10 +127,11 @@ static inline size_t roundup2(size_t s) { return s; } -static inline int hash_init(hash *h, size_t size) { +static inline int hash_init(hash *h, size_t size, error *e) { h->nbuckets = roundup2(size + (size/6)); h->keyval = calloc(h->nbuckets, sizeof(*h->keyval)); if (h->keyval == NULL) { + error_sys(e, "calloc"); return -1; } h->size = 0; @@ -276,11 +277,15 @@ static void lru_destroy(cache *_c) { cache *cache_lru(size_t max_size, size_t elasticity, cache_eq_fn keq, cache_hash_fn khash, - cache_freek_fn kfree, cache_freev_fn vfree) { + cache_freek_fn 
kfree, cache_freev_fn vfree, + error *e) { lru_cache *res = malloc(sizeof(*res)); - if (res == NULL) return NULL; + if (res == NULL) { + error_sys(e, "malloc"); + return NULL; + } - if (hash_init(&res->data, max_size+elasticity)) { + if (hash_init(&res->data, max_size+elasticity, e)) { free(res); return NULL; } diff --git a/src/cache/twoq.c b/src/cache/twoq.c index 0309484f08..7ff85344fc 100644 --- a/src/cache/twoq.c +++ b/src/cache/twoq.c @@ -135,10 +135,11 @@ static inline size_t roundup2(size_t s) { return s; } -static inline int hash_init(hash *h, size_t size) { +static inline int hash_init(hash *h, size_t size, error *e) { h->nbuckets = roundup2(size + (size/6)); h->keyval = calloc(h->nbuckets, sizeof(*h->keyval)); if (h->keyval == NULL) { + error_sys(e, "calloc"); return -1; } h->size = 0; @@ -322,16 +323,21 @@ static void twoq_destroy(cache *_c) { } cache *cache_twoq(size_t hot_size, size_t warm_size, size_t cold_size, - size_t elasticity, cache_eq_fn keq, cache_hash_fn khash, - cache_freek_fn kfree, cache_freev_fn vfree) { + size_t elasticity, cache_eq_fn keq, cache_hash_fn khash, + cache_freek_fn kfree, cache_freev_fn vfree, error *e) { twoq_cache *res; - if (hot_size == 0 || warm_size == 0 || cold_size == 0) + if (hot_size == 0 || warm_size == 0 || cold_size == 0) { + error_set(e, GA_VALUE_ERROR, "cache_twoq: section size is 0"); return NULL; + } res = malloc(sizeof(*res)); - if (res == NULL) return NULL; + if (res == NULL) { + error_sys(e, "malloc"); + return NULL; + } - if (hash_init(&res->data, hot_size+warm_size+cold_size+elasticity)) { + if (hash_init(&res->data, hot_size+warm_size+cold_size+elasticity, e)) { free(res); return NULL; } diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index dee65f130c..072dba9913 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -17,7 +17,7 @@ const gpuarray_buffer_ops *gpuarray_get_ops(const char *name) { return NULL; } -#define FAIL(v, e) { if (ret) *ret = e; return v; } +#define FAIL(v, e) 
{ if (ret) *ret = (e)->code; return v; } int gpu_get_platform_count(const char* name, unsigned int* platcount) { const gpuarray_buffer_ops* ops = gpuarray_get_ops(name); @@ -39,11 +39,9 @@ int gpu_get_device_count(const char* name, unsigned int platform, gpucontext *gpucontext_init(const char *name, int dev, int flags, int *ret) { gpucontext *res; const gpuarray_buffer_ops *ops = gpuarray_get_ops(name); - if (ops == NULL) - FAIL(NULL, GA_INVALID_ERROR); - res = ops->buffer_init(dev, flags, ret); - if (res == NULL) - return NULL; + if (ops == NULL) FAIL(NULL, global_err); + res = ops->buffer_init(dev, flags); + if (res == NULL) FAIL(NULL, global_err); res->ops = ops; if (gpucontext_property(res, GA_CTX_PROP_BLAS_OPS, (void *)&res->blas_ops) != GA_NO_ERROR) res->blas_ops = NULL; @@ -84,7 +82,9 @@ const char *gpucontext_error(gpucontext *ctx, int err) { gpudata *gpudata_alloc(gpucontext *ctx, size_t sz, void *data, int flags, int *ret) { - return ctx->ops->buffer_alloc(ctx, sz, data, flags, ret); + gpudata *res = ctx->ops->buffer_alloc(ctx, sz, data, flags); + if (res == NULL && ret) *ret = ctx->err->code; + return res; } void gpudata_retain(gpudata *b) { @@ -92,9 +92,8 @@ void gpudata_retain(gpudata *b) { } void gpudata_release(gpudata *b) { - if(b){ + if (b) ((partial_gpudata *)b)->ctx->ops->buffer_release(b); - } } int gpudata_share(gpudata *a, gpudata *b, int *ret) { diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index eec330df39..202fb39570 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -37,8 +37,6 @@ STATIC_ASSERT(sizeof(GpuArrayIpcMemHandle) == sizeof(CUipcMemHandle), cuda_ipcme */ #define FRAG_SIZE (64) -static CUresult err; - const gpuarray_buffer_ops cuda_ops; static void cuda_freekernel(gpukernel *); @@ -46,7 +44,7 @@ static int cuda_property(gpucontext *, gpudata *, gpukernel *, int, void *); static int cuda_waits(gpudata *, int, CUstream); static int cuda_records(gpudata *, int, CUstream); -static int 
detect_arch(const char *prefix, char *ret, CUresult *err); +static int detect_arch(const char *prefix, char *ret, error *e); static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size); typedef struct _kernel_key { @@ -132,7 +130,6 @@ static int major = -1; static int minor = -1; static int setup_lib(void) { int res, tmp; - const char *name, *msg; const char *ver; if (!setup_done) { @@ -140,11 +137,8 @@ static int setup_lib(void) { if (res != GA_NO_ERROR) return res; err = cuInit(0); - if (err != CUDA_SUCCESS) { - cuGetErrorName(err, *name); - cuGetErrorString(err, *msg); - return error_fmt(global_err, GA_IMPL_ERROR, "cuInit: %s: %s", name, msg); - } + if (err != CUDA_SUCCESS) + return error_cuda(global_err, "cuInit", err); ver = getenv("GPUARRAY_CUDA_VERSION"); if (ver == NULL || strlen(ver) != 2) { err = cuDriverGetVersion(&tmp); @@ -178,7 +172,7 @@ static int cuda_get_device_count(unsigned int platform, GA_CHECK(setup_lib()); err = cuDeviceGetCount(&dv); if (err != CUDA_SUCCESS) - return GA_IMPL_ERROR; + return error_cuda(global_err, "cuDeviceGetCount", err); *devcount = (unsigned int)dv; return GA_NO_ERROR; } @@ -188,6 +182,7 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { cache *mem_cache; char *cache_path; void *p; + CUresult err; int e; e = setup_lib(); @@ -195,28 +190,30 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { return NULL; res = calloc(1, sizeof(*res)); - if (res == NULL) + if (res == NULL) { + error_sys(global_err, "calloc"); return NULL; + } res->ctx = ctx; res->ops = &cuda_ops; - res->err = CUDA_SUCCESS; res->refcnt = 1; res->flags = flags; res->enter = 0; res->major = major; res->minor = minor; res->freeblocks = NULL; - if (error_alloc(&res->msg)) { + if (error_alloc(&res->err)) { error_sets(global_err, GA_SYS_ERROR, "Could not create error context"); goto fail_errmsg; } - if (detect_arch(ARCH_PREFIX, res->bin_id, &err)) { + if (detect_arch(ARCH_PREFIX, res->bin_id, global_err)) { goto fail_stream; } /* 
Don't add the nonblocking flags to help usage with other libraries that may do stuff on the NULL stream */ err = cuStreamCreate(&res->s, 0); if (err != CUDA_SUCCESS) { + error_cuda(global_err, "cuStreamCreate", err) goto fail_stream; } if (ISSET(res->flags, GA_CTX_SINGLE_STREAM)) { @@ -226,6 +223,7 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { libraries that may do stuff on the NULL stream */ err = cuStreamCreate(&res->mem_s, 0); if (err != CUDA_SUCCESS) { + error_cuda(global_err, "cuStreamCreate", err) goto fail_mem_stream; } } @@ -234,8 +232,9 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { (cache_eq_fn)strb_eq, (cache_hash_fn)strb_hash, (cache_freek_fn)strb_free, - (cache_freev_fn)cuda_freekernel); + (cache_freev_fn)cuda_freekernel, global_err); if (res->kernel_cache == NULL) + error_cuda(global_err, "cuStreamCreate", err) goto fail_cache; cache_path = getenv("GPUARRAY_CACHE_PATH"); @@ -268,14 +267,16 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { err = cuMemAllocHost(&p, 16); if (err != CUDA_SUCCESS) { + error_cuda(e, "cuMemAllocHost", err); goto fail_errbuf; } memset(p, 0, 16); /* Need to tag for new_gpudata */ TAG_CTX(res); - res->errbuf = new_gpudata(res, (CUdeviceptr)p, 16); + res->errbuf = new_gpudata(res, (CUdeviceptr)p, 16, e); if (res->errbuf == NULL) { - err = res->err; + /* Copy the error from the context since we are getting rid of it */ + error_set(global_err, res->err->code, res->err->msg); goto fail_end; } res->errbuf->flags |= CUDA_MAPPED_PTR; @@ -292,7 +293,7 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { fail_mem_stream: cuStreamDestroy(res->s); fail_stream: - error_free(res->msg); + error_free(res->err); fail_errmsg: free(res); return NULL; @@ -330,7 +331,7 @@ static void cuda_free_ctx(cuda_context *ctx) { cache_destroy(ctx->kernel_cache); if (ctx->disk_cache) cache_destroy(ctx->disk_cache); - error_free(ctx->msg); + error_free(ctx->err); if (!(ctx->flags & DONTFREE)) { 
cuCtxPushCurrent(ctx->ctx); @@ -365,10 +366,14 @@ void cuda_exit(cuda_context *ctx) { static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size) { gpudata *res; + CUresult err; int fl = CU_EVENT_DISABLE_TIMING; res = malloc(sizeof(*res)); - if (res == NULL) return NULL; + if (res == NULL) { + error_sys(ctx->err, "malloc"); + return NULL; + } res->refcnt = 0; res->sz = size; @@ -380,15 +385,17 @@ static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size) { if (ctx->flags & GA_CTX_MULTI_THREAD) fl |= CU_EVENT_BLOCKING_SYNC; - ctx->err = cuEventCreate(&res->rev, fl); - if (ctx->err != CUDA_SUCCESS) { + err = cuEventCreate(&res->rev, fl); + if (err != CUDA_SUCCESS) { + error_cuda(ctx->err, "cuEventCreate", err); cuda_exit(ctx); free(res); return NULL; } - ctx->err = cuEventCreate(&res->wev, fl); - if (ctx->err != CUDA_SUCCESS) { + err = cuEventCreate(&res->wev, fl); + if (err != CUDA_SUCCESS) { + error_cuda(ctx->err, "cuEventCreate", err); cuEventDestroy(res->rev); cuda_exit(ctx); free(res); @@ -419,8 +426,11 @@ gpudata *cuda_make_buf(cuda_context *ctx, CUdeviceptr p, size_t sz) { size_t cuda_get_sz(gpudata *g) { ASSERT_BUF(g); return g->sz; } -#define FAIL(v, e) { if (ret) *ret = e; return v; } -#define CHKFAIL(v) if (err != CUDA_SUCCESS) FAIL(v, GA_IMPL_ERROR) +#define CHKFAIL(n, v) \ + if (err != CUDA_SUCCESS) { \ + error_cuda(e, n, err); \ + return v; \ + } static const char CUDA_PREAMBLE[] = "#define local_barrier() __syncthreads()\n" @@ -474,71 +484,80 @@ static const char CUDA_PREAMBLE[] = /* XXX: add complex, quads, longlong */ /* XXX: add vector types */ -static cuda_context *do_init(CUdevice dev, int flags, int *ret) { +static cuda_context *do_init(CUdevice dev, int flags, error *e) { cuda_context *res; CUcontext ctx; + CUresult err; unsigned int fl = CU_CTX_SCHED_AUTO; unsigned int cur_fl; int act; int i; - CHKFAIL(NULL); if (flags & GA_CTX_SINGLE_THREAD) fl = CU_CTX_SCHED_SPIN; if (flags & GA_CTX_MULTI_THREAD) fl = 
CU_CTX_SCHED_YIELD; err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev); - CHKFAIL(NULL); - if (i != 1) - FAIL(NULL, GA_UNSUPPORTED_ERROR); + CHKFAIL("cuDeviceGetAttribute", NULL); + if (i != 1) { + error_set(e, GA_UNSUPPORTED_ERROR, "device does not support unified addressing"); + return NULL; + } err = cuDevicePrimaryCtxGetState(dev, &cur_fl, &act); - CHKFAIL(NULL); + CHKFAIL("cuDevicePrimaryCtxGetState", NULL); if (act == 1) { - if ((cur_fl & fl) != fl) - FAIL(NULL, GA_INVALID_ERROR); + if ((cur_fl & fl) != fl) { + error_set(e, GA_INVALID_ERROR, "device is already active and has unsupported flags"); + return NULL; + } } else { err = cuDevicePrimaryCtxSetFlags(dev, fl); - CHKFAIL(NULL); + CHKFAIL("cuDevicePrimaryCtxSetFlags", NULL); } err = cuDevicePrimaryCtxRetain(&ctx, dev); - CHKFAIL(NULL); + CHKFAIL("cuDevicePrimaryCtxRetain", NULL); err = cuCtxPushCurrent(ctx); - CHKFAIL(NULL); + CHKFAIL("cuCtxPushCurrent", NULL); res = cuda_make_ctx(ctx, flags); if (res == NULL) { cuDevicePrimaryCtxRelease(dev); - FAIL(NULL, GA_IMPL_ERROR); + if (e != global_err) + error_set(e, global_err->code, global_err->msg); + return NULL; } /* Don't leave the context on the thread stack */ cuCtxPopCurrent(NULL); - return res; + return res; } -static gpucontext *cuda_init(int ord, int flags, int *ret) { + +static gpucontext *cuda_init(int ord, int flags) { CUdevice dev; cuda_context *res; + CUresult err; int r; r = setup_lib(); if (r != GA_NO_ERROR) { - FAIL(NULL, r); + return NULL; } if (ord == -1) { int i, c; err = cuDeviceGetCount(&c); - CHKFAIL(NULL); + CHKFAIL("cuDeviceGetCount", NULL); for (i = 0; i < c; i++) { err = cuDeviceGet(&dev, i); - CHKFAIL(NULL); + CHKFAIL("cuDeviceGet", NULL); res = do_init(dev, flags, NULL); if (res != NULL) return (gpucontext *)res; } - FAIL(NULL, GA_NODEV_ERROR); + error_set(global_err, GA_NODEV_ERROR, "No cuda device available"); + return NULL; } else { err = cuDeviceGet(&dev, ord); - CHKFAIL(NULL); + CHKFAIL("cuDeviceGet", 
NULL); return (gpucontext *)do_init(dev, flags, ret); } } @@ -568,7 +587,7 @@ static size_t largest_size(cuda_context *ctx) { gpudata *temp; size_t sz, dummy; cuda_enter(ctx); - ctx->err = cuMemGetInfo(&sz, &dummy); + cuMemGetInfo(&sz, &dummy); cuda_exit(ctx); /* We guess that we can allocate at least a quarter of the free size in a single block. This might be wrong though. */ @@ -588,6 +607,8 @@ static int allocate(cuda_context *ctx, gpudata **res, gpudata **prev, size_t size) { CUdeviceptr ptr; gpudata *next; + CUresult err; + *prev = NULL; if (!(ctx->flags & GA_CTX_DISABLE_ALLOCATION_CACHE)) @@ -595,10 +616,10 @@ static int allocate(cuda_context *ctx, gpudata **res, gpudata **prev, cuda_enter(ctx); - ctx->err = cuMemAlloc(&ptr, size); - if (ctx->err != CUDA_SUCCESS) { + err = cuMemAlloc(&ptr, size); + if (err != CUDA_SUCCESS) { cuda_exit(ctx); - return GA_IMPL_ERROR; + return error_cuda(ctx->err, "cuMemAlloc", err); } *res = new_gpudata(ctx, ptr, size); @@ -607,7 +628,7 @@ static int allocate(cuda_context *ctx, gpudata **res, gpudata **prev, if (*res == NULL) { cuMemFree(ptr); - return GA_MEMORY_ERROR; + return ctx->err->code; } (*res)->flags |= CUDA_HEAD_ALLOC; @@ -643,7 +664,7 @@ static int extract(gpudata *curr, gpudata *prev, size_t size) { } else { split = new_gpudata(curr->ctx, curr->ptr + size, remaining); if (split == NULL) - return GA_MEMORY_ERROR; + return curr->ctx->err->code; /* Make sure the chain keeps going */ split->next = curr->next; curr->next = NULL; @@ -669,19 +690,26 @@ static inline size_t roundup(size_t s, size_t m) { return ((s + (m - 1)) / m) * m; } -static gpudata *cuda_alloc(gpucontext *c, size_t size, void *data, int flags, - int *ret) { +static gpudata *cuda_alloc(gpucontext *c, size_t size, void *data, int flags) { gpudata *res = NULL, *prev = NULL; cuda_context *ctx = (cuda_context *)c; size_t asize; - int err; - if ((flags & GA_BUFFER_INIT) && data == NULL) FAIL(NULL, GA_VALUE_ERROR); + if ((flags & GA_BUFFER_INIT) && data == 
NULL) { + error_set(ctx->err, GA_VALUE_ERROR, "Requested buffer initialisation but no data given"); + return NULL; + } if ((flags & (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY)) == - (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY)) FAIL(NULL, GA_VALUE_ERROR); + (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY)) { + error_set(ctx->err, GA_VALUE_ERROR, "Invalid flags combinaison WRITE_ONLY and READ_ONLY"); + return NULL; + } /* TODO: figure out how to make this work */ - if (flags & GA_BUFFER_HOST) FAIL(NULL, GA_DEVSUP_ERROR); + if (flags & GA_BUFFER_HOST) { + error_set(ctx->err, GA_DEVSUP_ERROR, "Host mapped allocations are not supported yet"); + return NULL; + } /* We don't want to manage really small allocations so we round up * to a multiple of FRAG_SIZE. This also ensures that if we split a @@ -694,25 +722,21 @@ static gpudata *cuda_alloc(gpucontext *c, size_t size, void *data, int flags, asize = size; } - if (res == NULL) { - err = allocate(ctx, &res, &prev, asize); - if (err != GA_NO_ERROR) - FAIL(NULL, err); - } + if (res == NULL && allocate(ctx, &res, &prev, asize) != GA_NO_ERROR) + return NULL; + + if (extract(res, prev, asize) != GA_NO_ERROR) + return NULL; - err = extract(res, prev, asize); - if (err != GA_NO_ERROR) - FAIL(NULL, err); /* It's out of the freelist, so add a ref */ res->ctx->refcnt++; /* We consider this buffer allocated and ready to go */ res->refcnt = 1; if (flags & GA_BUFFER_INIT) { - err = cuda_write(res, 0, data, size); - if (err != GA_NO_ERROR) { + if (cuda_write(res, 0, data, size) != GA_NO_ERROR) { cuda_free(res); - FAIL(NULL, err); + return NULL; } } @@ -732,16 +756,18 @@ gpudata *cuda_open_ipc_handle(gpucontext *c, GpuArrayIpcMemHandle *h, size_t sz) CUdeviceptr p; cuda_context *ctx = (cuda_context *)c; gpudata *d = NULL; + CUresult err; cuda_enter(ctx); - ctx->err = cuIpcOpenMemHandle(&p, *((CUipcMemHandle *)h), - CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); - if (ctx->err == CUDA_SUCCESS) { - d = cuda_make_buf(ctx, p, sz); - if (d != NULL) - d->flags 
|= CUDA_IPC_MEMORY; + err = cuIpcOpenMemHandle(&p, *((CUipcMemHandle *)h), + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + if (err != CUDA_SUCCESS) { + cuda_exit(ctx); + return error_cuda(ctx->err, "cuIpcOpenMemHandle", err); } - cuda_exit(ctx); + d = cuda_make_buf(ctx, p, sz); + if (d != NULL) + d->flags |= CUDA_IPC_MEMORY; return d; } @@ -883,12 +909,15 @@ static int cuda_move(gpudata *dst, size_t dstoff, gpudata *src, int res = GA_NO_ERROR; ASSERT_BUF(dst); ASSERT_BUF(src); - if (src->ctx != dst->ctx) return GA_VALUE_ERROR; + if (src->ctx != dst->ctx) return error_set(ctx->err, GA_VALUE_ERROR, + "Cannot move between contexts"); if (sz == 0) return GA_NO_ERROR; - if ((dst->sz - dstoff) < sz || (src->sz - srcoff) < sz) - return GA_VALUE_ERROR; + if ((dst->sz - dstoff) < sz) + return error_set(ctx->err, GA_VALUE_ERROR, "Destination is smaller than requested transfer size"); + if ((src->sz - srcoff) < sz) + return error_set(ctx->err, GA_VALUE_ERROR, "Source is smaller than requested transfer size"); cuda_enter(ctx); @@ -917,19 +946,17 @@ static int cuda_read(void *dst, gpudata *src, size_t srcoff, size_t sz) { if (sz == 0) return GA_NO_ERROR; if ((src->sz - srcoff) < sz) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "source is smaller than the read size"); cuda_enter(ctx); if (src->flags & CUDA_MAPPED_PTR) { + if (ISSET(ctx->flags, GA_CTX_SINGLE_STREAM)) - ctx->err = cuStreamSynchronize(ctx->s); + CUDA_EXIT_ON_ERROR(ctx, cuStreamSynchronize(ctx->s)); else - ctx->err = cuEventSynchronize(src->wev); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } + CUDA_EXIT_ON_ERROR(ctx, cuEventSynchronize(src->wev)); + memcpy(dst, (void *)(src->ptr + srcoff), sz); } else { GA_CUDA_EXIT_ON_ERROR(ctx, @@ -954,19 +981,17 @@ static int cuda_write(gpudata *dst, size_t dstoff, const void *src, if (sz == 0) return GA_NO_ERROR; if ((dst->sz - dstoff) < sz) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "Destination is 
smaller than the write size"); cuda_enter(ctx); if (dst->flags & CUDA_MAPPED_PTR) { + if (ISSET(ctx->flags, GA_CTX_SINGLE_STREAM)) - ctx->err = cuStreamSynchronize(ctx->s); + CUDA_EXIT_ON_ERROR(ctx, cuStreamSynchronize(ctx->s)); else - ctx->err = cuEventSynchronize(dst->rev); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } + CUDA_EXIT_ON_ERROR(ctx, cuEventSynchronize(dst->rev)); + memcpy((void *)(dst->ptr + dstoff), src, sz); } else { GA_CUDA_EXIT_ON_ERROR(ctx, @@ -1003,32 +1028,41 @@ static int cuda_memset(gpudata *dst, size_t dstoff, int data) { return GA_NO_ERROR; } -static CUresult get_cc(CUdevice dev, int *maj, int *min) { - CUresult lerr; - lerr = cuDeviceGetAttribute(maj, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - dev); - if (lerr != CUDA_SUCCESS) - return lerr; - return cuDeviceGetAttribute(min, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - dev); +static int get_cc(CUdevice dev, int *maj, int *min, error *e) { + CUresult err; + err = cuDeviceGetAttribute(maj, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + dev); + if (err != CUDA_SUCCESS) + return error_cuda(e, "cuDeviceGetAttribute", err); + err = cuDeviceGetAttribute(min, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + dev); + if (err != CUDA_SUCCESS) + return error_cuda(e, "cuDeviceGetAttribute", err); + return GA_NO_ERROR; } -static int detect_arch(const char *prefix, char *ret, CUresult *err) { +static int detect_arch(const char *prefix, char *ret, error *e) { CUdevice dev; + CUresult err; int major, minor; int res; size_t sz = strlen(prefix) + 3; - *err = cuCtxGetDevice(&dev); - if (*err != CUDA_SUCCESS) return GA_IMPL_ERROR; - *err = get_cc(dev, &major, &minor); - if (*err != CUDA_SUCCESS) return GA_IMPL_ERROR; + err = cuCtxGetDevice(&dev); + if (err != CUDA_SUCCESS) return error_cuda(e, "cuCtxGetDevice", err); + GA_CHECK(get_cc(dev, &major, &minor, e)); res = snprintf(ret, sz, "%s%d%d", prefix, major, minor); - if (res == -1 || res > (ssize_t)sz) return 
GA_UNSUPPORTED_ERROR; + if (res == -1) return error_sys(e, "snprintf"); + if (res > (ssize_t)sz) return error_set(e, GA_UNSUPPORTED_ERROR, + "detect_arch: arch id is too large"); return GA_NO_ERROR; } +static inline int error_nvrtc(error *e, const char *msg, nvrtcResult err) { + return error_fmt(e, GA_IMPL_ERROR, "%s: %s", msg, nvrtcGetErrorString(err)); +} + static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { nvrtcProgram prog; size_t buflen; @@ -1041,7 +1075,8 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { opts[1] = ctx->bin_id; err = nvrtcCreateProgram(&prog, src->s, NULL, 0, NULL, NULL); - if (err != NVRTC_SUCCESS) return GA_SYS_ERROR; + if (err != NVRTC_SUCCESS) + return error_nvrtc(ctx->err, "nvrtcCreateProgram", err); err = nvrtcCompileProgram(prog, #ifdef DEBUG @@ -1050,6 +1085,8 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { 2, #endif opts); + + /* Get the log before handling the error */ if (nvrtcGetProgramLogSize(prog, &buflen) == NVRTC_SUCCESS) { strb_appends(log, "NVRTC compile log::\n"); if (strb_ensure(log, buflen) == 0) @@ -1058,24 +1095,32 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { strb_appendc(log, '\n'); } + if (err != NVRTC_SUCCESS) { + nvrtcDestroyProgram(&prog); + return error_nvrtc(ctx->err, "nvrtcCompileProgram", err); + } + err = nvrtcGetPTXSize(prog, &buflen); - if (err != NVRTC_SUCCESS) goto end; + if (err != NVRTC_SUCCESS) { + nvrtcDestroyProgram(&prog); + return error_nvrtc(ctx->err, "nvrtcGetPTXSize", err); + } if (strb_ensure(ptx, buflen) == 0) { err = nvrtcGetPTX(prog, ptx->s+ptx->l); - if (err == NVRTC_SUCCESS) ptx->l += buflen; + if (err != NVRTC_SUCCESS) { + ptx->l += buflen; + nvrtcDestroyProgram(&prog); + return error_nvrtc(ctx->err, "nvrtcGetPTX", err); + } } -end: - nvrtcDestroyProgram(&prog); - if (err != NVRTC_SUCCESS) - return GA_SYS_ERROR; return GA_NO_ERROR; } static int 
make_bin(cuda_context *ctx, const strb *ptx, strb *bin, strb *log) { - char info_log[2048]; - char error_log[2048]; + char info_log[2048] = ""; + char error_log[2048] = ""; void *out; size_t out_size; CUlinkState st; @@ -1097,21 +1142,22 @@ static int make_bin(cuda_context *ctx, const strb *ptx, strb *bin, strb *log) { (void *)0, (void *)0, (void *)0 #endif }; - int err = GA_NO_ERROR; + CUresult err; + int res = GA_NO_ERROR; - ctx->err = cuLinkCreate(sizeof(cujit_opts)/sizeof(cujit_opts[0]), + err = cuLinkCreate(sizeof(cujit_opts)/sizeof(cujit_opts[0]), cujit_opts, cujit_opt_vals, &st); if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; + return error_cuda(ctx->err, "cuLinkCreate", err); ctx->err = cuLinkAddData(st, CU_JIT_INPUT_PTX, ptx->s, ptx->l, "kernel code", 0, NULL, NULL); if (ctx->err != CUDA_SUCCESS) { - err = GA_IMPL_ERROR; + res = error_cuda(ctx->err, "cuLinkAddData", err); goto out; } ctx->err = cuLinkComplete(st, &out, &out_size); if (ctx->err != CUDA_SUCCESS) { - err = GA_IMPL_ERROR; + res = error_cuda(ctx->err, "cuLinkComplete", err); goto out; } strb_appendn(bin, out, out_size); @@ -1122,7 +1168,7 @@ static int make_bin(cuda_context *ctx, const strb *ptx, strb *bin, strb *log) { strb_appends(log, "\nLink error log::\n"); strb_appends(log, error_log); strb_appendc(log, '\n'); - return err; + return res; } static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { @@ -1151,10 +1197,10 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { } } - err = call_compiler(ctx, src, &ptx, log); - if (err != GA_NO_ERROR) return err; - err = make_bin(ctx, &ptx, bin, log); - if (err != GA_NO_ERROR) return err; + GA_CHECK(call_compiler(ctx, src, &ptx, log)); + + GA_CHECK(make_bin(ctx, &ptx, bin, log)); + if (ctx->disk_cache) { pk = calloc(sizeof(kernel_key), 1); if (pk == NULL) { @@ -1214,8 +1260,7 @@ static void _cuda_freekernel(gpukernel *k) { static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, const char 
**strings, const size_t *lengths, const char *fname, unsigned int argcount, - const int *types, int flags, int *ret, - char **err_str) { + const int *types, int flags, char **err_str) { cuda_context *ctx = (cuda_context *)c; strb src = STRB_STATIC_INIT; strb bin = STRB_STATIC_INIT; @@ -1223,41 +1268,46 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, strb *psrc; gpukernel *res; CUdevice dev; + CUresult err; unsigned int i; int major, minor; - int err; - if (count == 0) FAIL(NULL, GA_VALUE_ERROR); + if (count == 0) { + error_set(ctx->err, GA_VALUE_ERROR, "String count is 0"); + return NULL; + } - if (flags & GA_USE_OPENCL) - FAIL(NULL, GA_DEVSUP_ERROR); + if (flags & GA_USE_OPENCL) { + error_set(ctx->err, GA_DEVSUP_ERROR, "OpenCL kernel not supported on cuda devices"); + return NULL; + } cuda_enter(ctx); - ctx->err = cuCtxGetDevice(&dev); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - FAIL(NULL, GA_IMPL_ERROR); - } - ctx->err = get_cc(dev, &major, &minor); - if (ctx->err != CUDA_SUCCESS) { + err = cuCtxGetDevice(&dev); + if (err != CUDA_SUCCESS) { cuda_exit(ctx); - FAIL(NULL, GA_IMPL_ERROR); + error_cuda(ctx->err, "cuCtxGetDevice", err); + return NULL; } + if (get_cc(dev, &major, &minor, ctx->err) != GA_NO_ERROR) + return ctx->err->code; + // GA_USE_CLUDA is done later // GA_USE_SMALL will always work // GA_USE_HALF should always work if (flags & GA_USE_DOUBLE) { if (major < 1 || (major == 1 && minor < 3)) { cuda_exit(ctx); - FAIL(NULL, GA_DEVSUP_ERROR); + error_set(ctx->err, GA_DEVSUP_ERROR, "Requested double support and current device doesn't support them"); + return NULL; } } if (flags & GA_USE_COMPLEX) { // just for now since it is most likely broken cuda_exit(ctx); - FAIL(NULL, GA_DEVSUP_ERROR); + error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Complex support is not there yet."); } if (flags & GA_USE_CLUDA) { @@ -1279,9 +1329,10 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, strb_append0(&src); if 
(strb_error(&src)) { + error_sys(ctx->err, "strb"); strb_clear(&src); cuda_exit(ctx); - FAIL(NULL, GA_MEMORY_ERROR); + return NULL; } res = (gpukernel *)cache_get(ctx->kernel_cache, &src); @@ -1291,8 +1342,7 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, return res; } - err = compile(ctx, &src, &bin, &log); - if (err != GA_NO_ERROR || strb_error(&bin)) { + if (compile(ctx, &src, &bin, &log) != GA_NO_ERROR) { if (err_str != NULL) { strb debug_msg = STRB_STATIC_INIT; strb_appends(&debug_msg, "CUDA kernel compile failure ::\n"); @@ -1307,16 +1357,25 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, strb_clear(&bin); strb_clear(&log); cuda_exit(ctx); - FAIL(NULL, err); + return NULL; } strb_clear(&log); + if (strb_error(&bin)) { + error_sys(ctx->err, "strb"); + strb_clear(&src); + strb_clear(&bin); + cuda_exit(ctx); + return NULL; + } + res = calloc(1, sizeof(*res)); if (res == NULL) { + error_sys(ctx->err, "calloc") strb_clear(&src); strb_clear(&bin); cuda_exit(ctx); - FAIL(NULL, GA_SYS_ERROR); + return NULL; } /* Don't clear bin after this */ @@ -1326,34 +1385,38 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, res->argcount = argcount; res->types = calloc(argcount, sizeof(int)); if (res->types == NULL) { + error_sys(ctx->err, "calloc"); _cuda_freekernel(res); strb_clear(&src); cuda_exit(ctx); - FAIL(NULL, GA_MEMORY_ERROR); + return NULL; } memcpy(res->types, types, argcount*sizeof(int)); res->args = calloc(argcount, sizeof(void *)); if (res->args == NULL) { + error_sys(ctx->err, "calloc"); _cuda_freekernel(res); strb_clear(&src); cuda_exit(ctx); - FAIL(NULL, GA_MEMORY_ERROR); + return NULL; } - ctx->err = cuModuleLoadData(&res->m, bin.s); - if (ctx->err != CUDA_SUCCESS) { + err = cuModuleLoadData(&res->m, bin.s); + if (err != CUDA_SUCCESS) { + error_cuda(ctx->err, "cuModuleLoadData", err); _cuda_freekernel(res); strb_clear(&src); cuda_exit(ctx); - FAIL(NULL, GA_IMPL_ERROR); + return NULL; } - 
ctx->err = cuModuleGetFunction(&res->k, res->m, fname); - if (ctx->err != CUDA_SUCCESS) { + err = cuModuleGetFunction(&res->k, res->m, fname); + if (err != CUDA_SUCCESS) { + error_cuda(ctx->err, "cuModuleGetFunction", err); _cuda_freekernel(res); strb_clear(&src); cuda_exit(ctx); - FAIL(NULL, GA_IMPL_ERROR); + return NULL; } res->ctx = ctx; @@ -1384,8 +1447,9 @@ static void cuda_freekernel(gpukernel *k) { } static int cuda_kernelsetarg(gpukernel *k, unsigned int i, void *arg) { + ASSERT_KER(k); if (i >= k->argcount) - return GA_VALUE_ERROR; + return error_set(k->ctx->err, GA_VALUE_ERROR, "index is beyond the last argument"); k->args[i] = arg; return GA_NO_ERROR; } @@ -1412,24 +1476,22 @@ static int cuda_callkernel(gpukernel *k, unsigned int n, switch (n) { case 1: - ctx->err = cuLaunchKernel(k->k, gs[0], 1, 1, ls[0], 1, 1, shared, - ctx->s, args, NULL); + CUDA_EXIT_ON_ERROR(ctx, cuLaunchKernel(k->k, gs[0], 1, 1, ls[0], 1, 1, + shared, ctx->s, args, NULL)); break; case 2: - ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, ls[0], ls[1], 1, shared, - ctx->s, args, NULL); + CUDA_EXIT_ON_ERROR(ctx, cuLaunchKernel(k->k, gs[0], gs[1], 1, + ls[0], ls[1], 1, shared, + ctx->s, args, NULL)); break; case 3: - ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], gs[2], ls[0], ls[1], ls[2], - shared, ctx->s, args, NULL); + CUDA_EXIT_ON_ERROR(ctx, cuLaunchKernel(k->k, gs[0], gs[1], gs[2], + ls[0], ls[1], ls[2], shared, + ctx->s, args, NULL)); break; default: cuda_exit(ctx); - return GA_VALUE_ERROR; - } - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "Call with more than 3 dimensions") } for (i = 0; i < k->argcount; i++) { @@ -1447,7 +1509,7 @@ static int cuda_callkernel(gpukernel *k, unsigned int n, static int cuda_kernelbin(gpukernel *k, size_t *sz, void **obj) { void *res = malloc(k->bin_sz); if (res == NULL) - return GA_MEMORY_ERROR; + return error_sys(k->ctx->err, "malloc"); memcpy(res, k->bin, k->bin_sz); *sz 
= k->bin_sz; *obj = res; @@ -1461,14 +1523,10 @@ static int cuda_sync(gpudata *b) { ASSERT_BUF(b); cuda_enter(ctx); if (ctx->flags & GA_CTX_SINGLE_STREAM) { - cuStreamSynchronize(ctx->s); + CUDA_EXIT_ON_ERROR(ctx, cuStreamSynchronize(ctx->s)); } else { - ctx->err = cuEventSynchronize(b->wev); - if (ctx->err != CUDA_SUCCESS) - err = GA_IMPL_ERROR; - ctx->err = cuEventSynchronize(b->rev); - if (ctx->err != CUDA_SUCCESS) - err = GA_IMPL_ERROR; + CUDA_EXIT_ON_ERROR(ctx, cuEventSynchronize(b->wev)); + CUDA_EXIT_ON_ERROR(ctx, cuEventSynchronize(b->rev)); } cuda_exit(ctx); return err; @@ -1533,15 +1591,26 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, if (prop_id < GA_BUFFER_PROP_START) { if (ctx == NULL) - return GA_VALUE_ERROR; + return error_set(global_err, GA_VALUE_ERROR, + "Attempting to get a context property with no context"); } else if (prop_id < GA_KERNEL_PROP_START) { if (buf == NULL) - return GA_VALUE_ERROR; + return error_set(ctx ? ctx->err : global_err, GA_VALUE_ERROR, + "Attempting to get a buffer property with no buffer"); } else { if (k == NULL) - return GA_VALUE_ERROR; + return error_set(ctx ? 
ctx->err : global_err, GA_VALUE_ERROR, + "Attempting to get a kernel property with no kernel"); } +#define GETPROP(prop, type) do { \ + cuda_enter(ctx); \ + CUDA_EXIT_ON_ERROR(ctx, cuCtxGetDevice(&id)); \ + CUDA_EXIT_ON_ERROR(ctx, cuDeviceGetAttribute(&i, (prop), id)); \ + cuda_exit(ctx); \ + *((type *)res) = i; \ + } while(0) + switch (prop_id) { CUdevice id; int i; @@ -1549,111 +1618,46 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, case GA_CTX_PROP_DEVNAME: cuda_enter(ctx); - ctx->err = cuCtxGetDevice(&id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - ctx->err = cuDeviceGetName((char *)res, 256, id); + CUDA_EXIT_ON_ERROR(ctx, cuCtxGetDevice(&id)); + CUDA_EXIT_ON_ERROR(ctx, cuDeviceGetName((char *)res, 256, id)); cuda_exit(ctx); - return (ctx->err != CUDA_SUCCESS) ? GA_IMPL_ERROR : GA_NO_ERROR; + return GA_NO_ERROR; case GA_CTX_PROP_PCIBUSID: cuda_enter(ctx); - ctx->err = cuCtxGetDevice(&id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - ctx->err = cuDeviceGetPCIBusId((char *)res, 13, id); + CUDA_EXIT_ON_ERROR(ctx, cuCtxGetDevice(&id)); + CUDA_EXIT_ON_ERROR(ctx, cuDeviceGetPCIBusId((char *)res, 13, id)); cuda_exit(ctx); - return (ctx->err != CUDA_SUCCESS) ? 
GA_IMPL_ERROR : GA_NO_ERROR; + return GA_NO_ERROR; case GA_CTX_PROP_LARGEST_MEMBLOCK: *((size_t *)res) = largest_size(ctx); return GA_NO_ERROR; case GA_CTX_PROP_MAXLSIZE: - cuda_enter(ctx); - ctx->err = cuCtxGetDevice(&id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, - id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - *((size_t *)res) = i; - cuda_exit(ctx); + GETPROP(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, size_t); return GA_NO_ERROR; case GA_CTX_PROP_LMEMSIZE: - cuda_enter(ctx); - ctx->err = cuCtxGetDevice(&id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, - id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - *((size_t *)res) = i; - cuda_exit(ctx); + GETPROP(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, size_t); return GA_NO_ERROR; case GA_CTX_PROP_NUMPROCS: - cuda_enter(ctx); - ctx->err = cuCtxGetDevice(&id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - ctx->err = cuDeviceGetAttribute(&i, - CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, - id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - *((unsigned int *)res) = i; - cuda_exit(ctx); + GETPROP(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, unsigned int) return GA_NO_ERROR; case GA_CTX_PROP_MAXGSIZE: - cuda_enter(ctx); - ctx->err = cuCtxGetDevice(&id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - *((size_t *)res) = i; - cuda_exit(ctx); + GETPROP(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, size_t); return GA_NO_ERROR; case GA_CTX_PROP_BLAS_OPS: - { - int e = 
load_libcublas(major, minor); - if (e != GA_NO_ERROR) - return e; - } + GA_CHECK(load_libcublas(major, minor, ctx->err)); *((gpuarray_blas_ops **)res) = &cublas_ops; return GA_NO_ERROR; case GA_CTX_PROP_COMM_OPS: - *((gpuarray_comm_ops**)res) = &nccl_ops; - return GA_NO_ERROR; + *((gpuarray_comm_ops**)res) = &nccl_ops; + return GA_NO_ERROR; case GA_CTX_PROP_BIN_ID: *((const char **)res) = ctx->bin_id; @@ -1665,15 +1669,15 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, case GA_CTX_PROP_TOTAL_GMEM: cuda_enter(ctx); - ctx->err = cuMemGetInfo(&sz, (size_t *)res); + CUDA_EXIT_ON_ERROR(ctx, cuMemGetInfo(&sz, (size_t *)res)); cuda_exit(ctx); - return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR; + return GA_NO_ERROR; case GA_CTX_PROP_FREE_GMEM: cuda_enter(ctx); - ctx->err = cuMemGetInfo((size_t *)res, &sz); + CUDA_EXIT_ON_ERROR(cuMemGetInfo((size_t *)res, &sz)); cuda_exit(ctx); - return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR; + return GA_NO_ERROR; case GA_CTX_PROP_NATIVE_FLOAT16: /* We claim that nobody supports this for now */ @@ -1681,99 +1685,27 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, return CUDA_SUCCESS; case GA_CTX_PROP_MAXGSIZE0: - cuda_enter(ctx); - ctx->err = cuCtxGetDevice(&id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - cuda_exit(ctx); - *((size_t *)res) = i; + GETPROP(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, size_t); return GA_NO_ERROR; case GA_CTX_PROP_MAXGSIZE1: - cuda_enter(ctx); - ctx->err = cuCtxGetDevice(&id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - 
cuda_exit(ctx); - *((size_t *)res) = i; + GETPROP(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, size_t); return GA_NO_ERROR; case GA_CTX_PROP_MAXGSIZE2: - cuda_enter(ctx); - ctx->err = cuCtxGetDevice(&id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - cuda_exit(ctx); - *((size_t *)res) = i; + GETPROP(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, size_t); return GA_NO_ERROR; case GA_CTX_PROP_MAXLSIZE0: - cuda_enter(ctx); - ctx->err = cuCtxGetDevice(&id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - cuda_exit(ctx); - *((size_t *)res) = i; + GETPROP(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, size_t); return GA_NO_ERROR; case GA_CTX_PROP_MAXLSIZE1: - cuda_enter(ctx); - ctx->err = cuCtxGetDevice(&id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - cuda_exit(ctx); - *((size_t *)res) = i; + GETPROP(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, size_t); return GA_NO_ERROR; case GA_CTX_PROP_MAXLSIZE2: - cuda_enter(ctx); - ctx->err = cuCtxGetDevice(&id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - cuda_exit(ctx); - *((size_t *)res) = i; + GETPROP(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, size_t); return GA_NO_ERROR; case GA_BUFFER_PROP_REFCNT: @@ -1791,27 +1723,15 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int 
prop_id, case GA_KERNEL_PROP_MAXLSIZE: cuda_enter(ctx); - ctx->err = cuFuncGetAttribute(&i, - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - k->k); + CUDA_EXIT_ON_ERROR(cuFuncGetAttribute(&i, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, k->k)); cuda_exit(ctx); - if (ctx->err != CUDA_SUCCESS) - return GA_IMPL_ERROR; *((size_t *)res) = i; return GA_NO_ERROR; case GA_KERNEL_PROP_PREFLSIZE: cuda_enter(ctx); - ctx->err = cuCtxGetDevice(&id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } - ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_WARP_SIZE, id); - if (ctx->err != CUDA_SUCCESS) { - cuda_exit(ctx); - return GA_IMPL_ERROR; - } + CUDA_EXIT_ON_ERROR(cuCtxGetDevice(&id)); + CUDA_EXIT_ON_ERROR(cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_WARP_SIZE, id)); cuda_exit(ctx); *((size_t *)res) = i; return GA_NO_ERROR; @@ -1835,7 +1755,7 @@ static const char *cuda_error(gpucontext *c) { if (ctx == NULL) return global_err->msg; else - return ctx->msg->msg; + return ctx->err->msg; return errstr; } diff --git a/src/private.h b/src/private.h index b1bd45f779..27258d3d23 100644 --- a/src/private.h +++ b/src/private.h @@ -27,6 +27,10 @@ extern "C" { } #endif +static inline int error_sys(error *e, const char *msg) { + return error_fmt(e, GA_SYS_ERROR, "%s: %s", msg, strerror(errno)); +} + #define ADDR32_MAX 4294967295L #define SADDR32_MIN -2147483648L #define SADDR32_MAX 2147483647L @@ -45,7 +49,7 @@ typedef struct _gpuarray_comm_ops gpuarray_comm_ops; const gpuarray_blas_ops *blas_ops; \ const gpuarray_comm_ops *comm_ops; \ void *blas_handle; \ - error *msg; \ + error *err; \ unsigned int refcnt; \ int flags; \ struct _gpudata *errbuf; \ @@ -77,7 +81,7 @@ typedef struct _partial_gpucomm { struct _gpuarray_buffer_ops { int (*get_platform_count)(unsigned int* platcount); int (*get_device_count)(unsigned int platform, unsigned int* devcount); - gpucontext *(*buffer_init)(int dev, int flags, int *ret); + gpucontext *(*buffer_init)(int dev, int flags); 
void (*buffer_deinit)(gpucontext *ctx); gpudata *(*buffer_alloc)(gpucontext *ctx, size_t sz, void *data, int flags, int *ret); diff --git a/src/private_cuda.h b/src/private_cuda.h index dc81ceba52..5e6ba7a46e 100644 --- a/src/private_cuda.h +++ b/src/private_cuda.h @@ -42,6 +42,13 @@ /* Keep in sync with the copy in gpuarray/extension.h */ #define DONTFREE 0x10000000 +static inline int error_cuda(error *e, const char *msg, CUresult err) { + const char *name, *descr; + cuGetErrorName(err, &name); + cuGetErrorString(err, &descr); + return error_fmt(e, GA_IMPL_ERROR, "%s: %s: %s", msg, name, descr); +} + #define GA_CUDA_EXIT_ON_ERROR(ctx, cmd) \ do { \ int err = (cmd); \ @@ -51,19 +58,18 @@ } \ } while (0) -#define CUDA_EXIT_ON_ERROR(ctx, cmd) \ - do { \ - (ctx)->err = (cmd); \ - if ((ctx)->err != CUDA_SUCCESS) { \ - cuda_exit((ctx)); \ - return GA_IMPL_ERROR; \ - } \ +#define CUDA_EXIT_ON_ERROR(ctx, cmd) \ + do { \ + CUresult err = (cmd); \ + if (err != CUDA_SUCCESS) { \ + cuda_exit((ctx)); \ + return error_cuda((ctx)->err, #cmd, err); \ + } \ } while (0) typedef struct _cuda_context { GPUCONTEXT_HEAD; CUcontext ctx; - CUresult err; CUstream s; CUstream mem_s; gpudata *freeblocks; From e7a7094e6630153ba6d02feda740a3335493f1ef Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 23 Mar 2017 17:26:00 -0400 Subject: [PATCH 256/597] Added error messages to the cublas bindings. 
--- src/gpuarray_blas_cuda_cublas.c | 518 +++++++++++++------------------- src/gpuarray_buffer_blas.c | 162 +++++----- src/private.h | 4 +- 3 files changed, 299 insertions(+), 385 deletions(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 880dbaa389..12b7c4ec55 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -24,6 +24,46 @@ static inline cublasOperation_t convT(cb_transpose trans) { } } +static const char *error(cublasStatus_t err) { + switch (err) { + case CUBLAS_STATUS_SUCCESS: + return "(cublas) Operation completed successfully."; + case CUBLAS_STATUS_NOT_INITIALIZED: + return "(cublas) Library not initialized."; + case CUBLAS_STATUS_ALLOC_FAILED: + return "(cublas) GPU ressource allocation failed."; + case CUBLAS_STATUS_INVALID_VALUE: + return "(cublas) Invalid value."; + case CUBLAS_STATUS_ARCH_MISMATCH: + return "(cublas) Operation not supported by device."; + case CUBLAS_STATUS_MAPPING_ERROR: + return "(cublas) Mapping error."; + case CUBLAS_STATUS_EXECUTION_FAILED: + return "(cublas) Execution failed."; + case CUBLAS_STATUS_INTERNAL_ERROR: + return "(cublas) Internal error."; + case CUBLAS_STATUS_NOT_SUPPORTED: + return "(cublas) Unsupported functionality."; + case CUBLAS_STATUS_LICENSE_ERROR: + return "(cublas) License error."; + default: + return "(cublas) Unknown error."; + } +} + +static inline int error_cublas(error *e, const char *msg, cublasStatus_t err) { + return error_fmt(e, (err == CUBLAS_STATUS_ARCH_MISMATCH) ? 
GA_DEVSUP_ERROR : GA_BLAS_ERROR, + "%s: %s", msg, error(err)); +} + +#define CUBLAS_EXIT_ON_ERROR(ctx, cmd) do { \ + cublasStatus_t err = (cmd); \ + if (err != CUBLAS_SUCCESS) { \ + cuda_exit(ctx); \ + return error_cublas((ctx)->err, #cmd, err); \ + } \ + } while(0) + typedef struct _blas_handle { cublasHandle_t h; GpuKernel sgemvBH_N_a1_b1_small; @@ -32,7 +72,6 @@ typedef struct _blas_handle { GpuKernel dgemvBH_T_a1_b1_small; GpuKernel sgerBH_gen_small; GpuKernel dgerBH_gen_small; - cublasStatus_t err; } blas_handle; #define LARGE_VAL(v) (v >= INT_MAX) @@ -180,25 +219,27 @@ static int setup(gpucontext *c) { handle = calloc(1, sizeof(*handle)); if (handle == NULL) - return GA_MEMORY_ERROR; - - handle->err = CUBLAS_STATUS_SUCCESS; + return error_sys(ctx->err, "calloc"); cuda_enter(ctx); err = cublasCreate(&handle->h); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); free(handle); - return GA_BLAS_ERROR; + return error_cublas(ctx->err, "cublasCreate", err); } err = cublasSetStream(handle->h, ctx->s); if (err != CUBLAS_STATUS_SUCCESS) { - e = GA_BLAS_ERROR; + e = error_cublas(ctx->err, "cublasSetStream", err); goto e1; } - cublasSetPointerMode(handle->h, CUBLAS_POINTER_MODE_HOST); + err = cublasSetPointerMode(handle->h, CUBLAS_POINTER_MODE_HOST); + if (err != CUBLAS_STATUS_SUCCESS) { + e = error_cublas(ctx->err, "cublasSetPointerMode", err); + goto e1; + } types[0] = GA_BUFFER; types[1] = GA_SIZE; @@ -283,39 +324,6 @@ static void teardown(gpucontext *c) { ctx->blas_handle = NULL; } -static const char *error(gpucontext *c) { - cuda_context *ctx = (cuda_context *)c; - blas_handle *handle = (blas_handle *)ctx->blas_handle; - - if (handle != NULL) { - switch (handle->err) { - case CUBLAS_STATUS_SUCCESS: - return "(cublas) Operation completed successfully."; - case CUBLAS_STATUS_NOT_INITIALIZED: - return "(cublas) Library not initialized."; - case CUBLAS_STATUS_ALLOC_FAILED: - return "(cublas) GPU ressource allocation failed."; - case CUBLAS_STATUS_INVALID_VALUE: - 
return "(cublas) Invalid value."; - case CUBLAS_STATUS_ARCH_MISMATCH: - return "(cublas) Operation not supported by device."; - case CUBLAS_STATUS_MAPPING_ERROR: - return "(cublas) Mapping error."; - case CUBLAS_STATUS_EXECUTION_FAILED: - return "(cublas) Execution failed."; - case CUBLAS_STATUS_INTERNAL_ERROR: - return "(cublas) Internal error."; - case CUBLAS_STATUS_NOT_SUPPORTED: - return "(cublas) Unsupported functionality."; - case CUBLAS_STATUS_LICENSE_ERROR: - return "(cublas) License error."; - default: - return "(cublas) Unknown error."; - } - } - return "Blas handle not initialized, API error."; -} - static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, @@ -334,7 +342,7 @@ static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB, if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) - return GA_XLARGE_ERROR; + return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); if (order == cb_c) { /* swap A and B */ @@ -361,17 +369,11 @@ static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB, GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL)); - h->err = cublasSgemm(h->h, - convT(transA), convT(transB), M, N, K, - &alpha, ((float *)A->ptr) + offA, lda, - ((float *)B->ptr) + offB, ldb, &beta, - ((float *)C->ptr) + offC, ldc); - if (h->err != CUBLAS_STATUS_SUCCESS) { - cuda_exit(ctx); - if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) - return GA_DEVSUP_ERROR; - return GA_BLAS_ERROR; - } + CUBLAS_EXIT_ON_ERROR(ctx, cublasSgemm(h->h, + convT(transA), convT(transB), M, N, K, + &alpha, ((float *)A->ptr) + offA, lda, + ((float *)B->ptr) + offB, ldb, &beta, + ((float *)C->ptr) + offC, ldc)); GA_CUDA_EXIT_ON_ERROR(ctx, 
cuda_record(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ)); @@ -399,7 +401,7 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) - return GA_XLARGE_ERROR; + return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); if (order == cb_c) { /* swap A and B */ @@ -426,17 +428,11 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL)); - h->err = cublasDgemm(h->h, - convT(transA), convT(transB), M, N, K, - &alpha, ((double *)A->ptr) + offA, lda, - ((double *)B->ptr) + offB, ldb, &beta, - ((double *)C->ptr) + offC, ldc); - if (h->err != CUBLAS_STATUS_SUCCESS) { - cuda_exit(ctx); - if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) - return GA_DEVSUP_ERROR; - return GA_BLAS_ERROR; - } + CUBLAS_EXIT_ON_ERROR(ctx, cublasDgemm(h->h, + convT(transA), convT(transB), M, N, K, + &alpha, ((double *)A->ptr) + offA, lda, + ((double *)B->ptr) + offB, ldb, &beta, + ((double *)C->ptr) + offC, ldc)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ)); @@ -470,7 +466,7 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) - return GA_XLARGE_ERROR; + return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); if (order == cb_c) { /* swap A and B */ @@ -497,21 +493,15 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, 
CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL)); - h->err = cublasSgemmEx(h->h, - convT(transA), convT(transB), M, N, K, - &alpha, ((uint16_t *)A->ptr) + offA, - CUDA_R_16F, - lda, ((uint16_t *)B->ptr) + offB, - CUDA_R_16F, - ldb, &beta, ((uint16_t *)C->ptr) + offC, - CUDA_R_16F, - ldc); - if (h->err != CUBLAS_STATUS_SUCCESS) { - cuda_exit(ctx); - if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) - return GA_DEVSUP_ERROR; - return GA_BLAS_ERROR; - } + CUBLAS_EXIT_ON_ERROR(ctx, cublasSgemmEx(h->h, convT(transA), convT(transB), + M, N, K, + &alpha, ((uint16_t *)A->ptr) + offA, + CUDA_R_16F, + lda, ((uint16_t *)B->ptr) + offB, + CUDA_R_16F, + ldb, &beta, ((uint16_t *)C->ptr) + offC, + CUDA_R_16F, + ldc)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ)); @@ -521,15 +511,6 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, return GA_NO_ERROR; } -static int hgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, - size_t M, size_t N, size_t K, float alpha, - gpudata **A, size_t *offA, size_t lda, - gpudata **B, size_t *offB, size_t ldb, - float beta, gpudata **C, size_t *offC, size_t ldc, - size_t batchCount) { - return GA_DEVSUP_ERROR; -} - static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata **A, size_t *offA, size_t lda, @@ -544,12 +525,10 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, const size_t threshold = 650; cb_transpose transT; - if (batchCount == 0) return GA_NO_ERROR; - if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) - return GA_XLARGE_ERROR; + return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); ASSERT_BUF(A[0]); ctx = A[0]->ctx; @@ -586,19 
+565,13 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C[i], CUDA_WAIT_ALL)); - h->err = cublasSgemm(h->h, - convT(transA), convT(transB), - M, N, K, &alpha, - ((float*)A[i]->ptr) + offA[i], lda, - ((float*)B[i]->ptr) + offB[i], ldb, - &beta, - ((float*)C[i]->ptr) + offC[i], ldc); - if (h->err != CUBLAS_STATUS_SUCCESS) { - cuda_exit(ctx); - if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) - return GA_DEVSUP_ERROR; - return GA_BLAS_ERROR; - } + CUBLAS_EXIT_ON_ERROR(ctx, cublasSgemm(h->h, + convT(transA), convT(transB), + M, N, K, &alpha, + ((float*)A[i]->ptr) + offA[i], lda, + ((float*)B[i]->ptr) + offB[i], ldb, + &beta, + ((float*)C[i]->ptr) + offC[i], ldc)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B[i], CUDA_WAIT_READ)); @@ -611,6 +584,7 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, float **C_l = T_l + (batchCount * 2); gpudata *Ta; CUdeviceptr Aa, Ba, Ca; + cublasStatus_t err; for (i = 0; i < batchCount; i++) { ASSERT_BUF(A[i]); @@ -626,24 +600,30 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, Ta = gpudata_alloc((gpucontext *)ctx, sizeof(float *) * batchCount * 3, NULL, 0, NULL); + if (Ta == NULL) { + cuda_exit(ctx); + return ctx->err->code; + } Aa = *(CUdeviceptr *)Ta; Ba = Aa + (batchCount * sizeof(float *)); Ca = Aa + (batchCount * sizeof(float *) * 2); - gpudata_write(Ta, 0, T_l, sizeof(float *) * batchCount * 3); + if (gpudata_write(Ta, 0, T_l, sizeof(float *) * batchCount * 3) != GA_NO_ERROR) { + gpudata_release(Ta); + cuda_exit(ctx); + return ctx->err->code; + } - h->err = cublasSgemmBatched(h->h, - convT(transA), convT(transB), - M, N, K, &alpha, - (const float **)Aa, lda, - (const float **)Ba, ldb, &beta, - (float **)Ca, ldc, batchCount); + err = cublasSgemmBatched(h->h, + convT(transA), 
convT(transB), + M, N, K, &alpha, + (const float **)Aa, lda, + (const float **)Ba, ldb, &beta, + (float **)Ca, ldc, batchCount); gpudata_release(Ta); - if (h->err != CUBLAS_STATUS_SUCCESS) { + if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); - if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) - return GA_DEVSUP_ERROR; - return GA_BLAS_ERROR; + return error_cublas(ctx, "cublasSgemmBatched", err); } for (i = 0; i < batchCount; i++) { @@ -671,12 +651,10 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, const size_t threshold = 650; cb_transpose transT; - if (batchCount == 0) return GA_NO_ERROR; - if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) - return GA_XLARGE_ERROR; + return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); ASSERT_BUF(A[0]); ctx = A[0]->ctx; @@ -713,19 +691,13 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C[i], CUDA_WAIT_ALL)); - h->err = cublasDgemm(h->h, - convT(transA), convT(transB), - M, N, K, &alpha, - (double*)A[i]->ptr + offA[i], lda, - (double*)B[i]->ptr + offB[i], ldb, - &beta, - (double*)C[i]->ptr + offC[i], ldc); - if (h->err != CUBLAS_STATUS_SUCCESS) { - cuda_exit(ctx); - if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) - return GA_DEVSUP_ERROR; - return GA_BLAS_ERROR; - } + CUBLAS_EXIT_ON_ERROR(ctx, cublasDgemm(h->h, + convT(transA), convT(transB), + M, N, K, &alpha, + (double*)A[i]->ptr + offA[i], lda, + (double*)B[i]->ptr + offB[i], ldb, + &beta, + (double*)C[i]->ptr + offC[i], ldc); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B[i], CUDA_WAIT_READ)); @@ -738,6 +710,7 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, 
double **C_l = T_l + (batchCount * 2); gpudata *Ta; CUdeviceptr Aa, Ba, Ca; + cublasStatus_t err; for (i = 0; i < batchCount; i++) { ASSERT_BUF(A[i]); @@ -753,24 +726,30 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, Ta = gpudata_alloc((gpucontext *)ctx, sizeof(double *) * batchCount * 3, NULL, 0, NULL); + if (Ta == NULL) { + cuda_exit(ctx); + return ctx->err->code; + } Aa = *(CUdeviceptr *)Ta; Ba = Aa + (batchCount * sizeof(double *)); Ca = Aa + (batchCount * sizeof(double *) * 2); - gpudata_write(Ta, 0, T_l, sizeof(double *) * batchCount * 3); + if (gpudata_write(Ta, 0, T_l, sizeof(double *) * batchCount * 3) != GA_NO_ERROR) { + gpudata_release(Ta); + cuda_exit(ctx); + return ctx->err->code; + } - h->err = cublasDgemmBatched(h->h, - convT(transA), convT(transB), - M, N, K, &alpha, - (const double **)Aa, lda, - (const double **)Ba, ldb, &beta, - (double **)Ca, ldc, batchCount); + err = cublasDgemmBatched(h->h, + convT(transA), convT(transB), + M, N, K, &alpha, + (const double **)Aa, lda, + (const double **)Ba, ldb, &beta, + (double **)Ca, ldc, batchCount); gpudata_release(Ta); - if (h->err != CUBLAS_STATUS_SUCCESS) { + if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); - if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) - return GA_DEVSUP_ERROR; - return GA_BLAS_ERROR; + return error_cublas(ctx->err, "cublasDgemmBatched", err); } for (i = 0; i < batchCount; i++) { @@ -784,14 +763,6 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, return GA_NO_ERROR; } -static int hdot( - size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, - gpudata *Z, size_t offZ) { - return GA_DEVSUP_ERROR; -} - static int sdot( size_t N, gpudata *X, size_t offX, size_t incX, @@ -805,7 +776,8 @@ static int sdot( ASSERT_BUF(Y); ASSERT_BUF(Z); - if (LARGE_VAL(N)) return GA_XLARGE_ERROR; + if (LARGE_VAL(N)) + return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in 
the cublas interface"); cuda_enter(ctx); @@ -814,14 +786,13 @@ static int sdot( GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Z, CUDA_WAIT_WRITE)); // we should store dot result on device - cublasGetPointerMode(h->h, &pmode); - cublasSetPointerMode(h->h, CUBLAS_POINTER_MODE_DEVICE); - h->err = cublasSdot( - h->h, N, - ((float*)X->ptr) + offX, incX, - ((float*)Y->ptr) + offY, incY, - ((float*)Z->ptr) + offZ); - cublasSetPointerMode(h->h, pmode); + CUBLAS_EXIT_ON_ERROR(ctx, cublasGetPointerMode(h->h, &pmode)); + CUBLAS_EXIT_ON_ERROR(ctx, cublasSetPointerMode(h->h, CUBLAS_POINTER_MODE_DEVICE)); + CUBLAS_EXIT_ON_ERROR(ctx, cublasSdot(h->h, N, + ((float*)X->ptr) + offX, incX, + ((float*)Y->ptr) + offY, incY, + ((float*)Z->ptr) + offZ)); + CUBLAS_EXIT_ON_ERROR(ctx, cublasSetPointerMode(h->h, pmode)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ)); @@ -845,7 +816,9 @@ static int ddot( ASSERT_BUF(Y); ASSERT_BUF(Z); - if (LARGE_VAL(N)) return GA_XLARGE_ERROR; + if (LARGE_VAL(N)) + return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); + cuda_enter(ctx); @@ -854,14 +827,13 @@ static int ddot( GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Z, CUDA_WAIT_WRITE)); // we should store dot result on device - cublasGetPointerMode(h->h, &pmode); - cublasSetPointerMode(h->h, CUBLAS_POINTER_MODE_DEVICE); - h->err = cublasDdot( - h->h, N, - ((double*)X->ptr) + offX, incX, - ((double*)Y->ptr) + offY, incY, - ((double*)Z->ptr) + offZ); - cublasSetPointerMode(h->h, pmode); + CUBLAS_EXIT_ON_ERROR(ctx, cublasGetPointerMode(h->h, &pmode)); + CUBLAS_EXIT_ON_ERROR(ctx, cublasSetPointerMode(h->h, CUBLAS_POINTER_MODE_DEVICE)); + CUBLAS_EXIT_ON_ERROR(ctx, cublasDdot(h->h, N, + ((double*)X->ptr) + offX, incX, + ((double*)Y->ptr) + offY, incY, + ((double*)Z->ptr) + offZ)); + CUBLAS_EXIT_ON_ERROR(ctx, cublasSetPointerMode(h->h, pmode)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, 
CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ)); @@ -872,13 +844,6 @@ static int ddot( return GA_NO_ERROR; } -static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, - float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, - float beta, gpudata *Y, size_t offY, int incY) { - return GA_DEVSUP_ERROR; -} - static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, @@ -893,7 +858,7 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) || LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) - return GA_XLARGE_ERROR; + return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); if (order == cb_c) { t = N; @@ -913,17 +878,11 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_ALL)); - h->err = cublasSgemv(h->h, - convT(transA), M, N, &alpha, - ((float *)A->ptr) + offA, lda, - ((float *)X->ptr) + offX, incX, - &beta, ((float *)Y->ptr) + offY, incY); - if (h->err != CUBLAS_STATUS_SUCCESS) { - cuda_exit(ctx); - if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) - return GA_DEVSUP_ERROR; - return GA_BLAS_ERROR; - } + CUBLAS_EXIT_ON_ERROR(ctx, cublasSgemv(h->h, + convT(transA), M, N, &alpha, + ((float *)A->ptr) + offA, lda, + ((float *)X->ptr) + offX, incX, + &beta, ((float *)Y->ptr) + offY, incY)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); @@ -948,7 +907,7 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) || LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) - return GA_XLARGE_ERROR; + return 
error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); if (order == cb_c) { t = N; @@ -968,17 +927,11 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_ALL)); - h->err = cublasDgemv(h->h, - convT(transA), M, N, &alpha, - ((double *)A->ptr) + offA, lda, - ((double *)X->ptr) + offX, incX, - &beta, ((double *)Y->ptr) + offY, incY); - if (h->err != CUBLAS_STATUS_SUCCESS) { - cuda_exit(ctx); - if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) - return GA_DEVSUP_ERROR; - return GA_BLAS_ERROR; - } + CUBLAS_EXIT_ON_ERROR(ctx, cublasDgemv(h->h, + convT(transA), M, N, &alpha, + ((double *)A->ptr) + offA, lda, + ((double *)X->ptr) + offX, incX, + &beta, ((double *)Y->ptr) + offY, incY)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); @@ -989,15 +942,6 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, return GA_NO_ERROR; } -static int hgemvBatch(cb_order order, cb_transpose transA, - size_t M, size_t N, float alpha, - gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - float beta, gpudata **y, size_t *offY, size_t incY, - size_t batchCount, int flags) { - return GA_DEVSUP_ERROR; -} - static int sgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata **A, size_t *offA, size_t lda, @@ -1013,10 +957,13 @@ static int sgemvBatch(cb_order order, cb_transpose transA, gpudata *Aa, *xa, *ya; int err; - if (flags != 0) return GA_INVALID_ERROR; - if (batchCount == 0) return GA_NO_ERROR; + ASSERT_BUF(A[0]); - if (alpha != 1.0 || beta != 1.0) return GA_UNSUPPORTED_ERROR; + ctx = A[0]->ctx; + + if (flags != 0) return error_set(ctx->err, GA_INVALID_ERROR, "flags not set to 0"); + + if (alpha != 1.0 || beta != 1.0) return error_set(ctx->err, 
GA_UNSUPPORTED_ERROR, "Only alpha = 1 and beta = 1 are supported for now"); if (M < 512) { ls[0] = 32; @@ -1045,10 +992,6 @@ static int sgemvBatch(cb_order order, cb_transpose transA, } } - ASSERT_BUF(A[0]); - - ctx = A[0]->ctx; - cuda_enter(ctx); { @@ -1070,21 +1013,21 @@ static int sgemvBatch(cb_order order, cb_transpose transA, } Aa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(float *) * batchCount, A_l, - GA_BUFFER_INIT, &err); + GA_BUFFER_INIT); if (Aa == NULL) - return err; + return ctx->err->code; xa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(float *) * batchCount, x_l, - GA_BUFFER_INIT, &err); + GA_BUFFER_INIT); if (xa == NULL) { cuda_ops.buffer_release(Aa); - return err; + return ctx->err->code; } ya = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(float *) * batchCount, y_l, - GA_BUFFER_INIT, &err); + GA_BUFFER_INIT); if (ya == NULL) { cuda_ops.buffer_release(Aa); cuda_ops.buffer_release(xa); - return err; + return ctx->err->code; } } @@ -1137,10 +1080,13 @@ static int dgemvBatch(cb_order order, cb_transpose transA, gpudata *Aa, *xa, *ya; int err; - if (flags != 0) return GA_INVALID_ERROR; - if (batchCount == 0) return GA_NO_ERROR; + ASSERT_BUF(A[0]); + + ctx = A[0]->ctx; + + if (flags != 0) return error_set(ctx->err, GA_INVALID_ERROR, "flags not set to 0"); - if (alpha != 1.0 || beta != 1.0) return GA_UNSUPPORTED_ERROR; + if (alpha != 1.0 || beta != 1.0) return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Only alpha = 1 and beta = 1 are supported for now"); if (M < 512) { ls[0] = 32; @@ -1169,10 +1115,6 @@ static int dgemvBatch(cb_order order, cb_transpose transA, } } - ASSERT_BUF(A[0]); - - ctx = A[0]->ctx; - cuda_enter(ctx); { @@ -1194,21 +1136,21 @@ static int dgemvBatch(cb_order order, cb_transpose transA, } Aa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount, A_l, - GA_BUFFER_INIT, &err); + GA_BUFFER_INIT); if (Aa == NULL) - return err; + return ctx->err->code; xa = cuda_ops.buffer_alloc((gpucontext *)ctx, 
sizeof(double *) * batchCount, x_l, - GA_BUFFER_INIT, &err); + GA_BUFFER_INIT); if (xa == NULL) { cuda_ops.buffer_release(Aa); - return err; + return ctx->err->code; } ya = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount, y_l, - GA_BUFFER_INIT, &err); + GA_BUFFER_INIT); if (ya == NULL) { cuda_ops.buffer_release(Aa); cuda_ops.buffer_release(xa); - return err; + return ctx->err->code; } } @@ -1248,12 +1190,6 @@ static int dgemvBatch(cb_order order, cb_transpose transA, } -static int hger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, - size_t offX, int incX, gpudata *Y, size_t offY, int incY, - gpudata *A, size_t offA, size_t lda) { - return GA_DEVSUP_ERROR; -} - static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { @@ -1268,7 +1204,7 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) || LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) - return GA_XLARGE_ERROR; + return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); if (order == cb_c) { t = M; @@ -1291,16 +1227,10 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_ALL)); - h->err = cublasSger(h->h, M, N, &alpha, - ((float *)X->ptr) + offX, incX, - ((float *)Y->ptr) + offY, incY, - ((float *)A->ptr) + offA, lda); - if (h->err != CUBLAS_STATUS_SUCCESS) { - cuda_exit(ctx); - if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) - return GA_DEVSUP_ERROR; - return GA_BLAS_ERROR; - } + CUBLAS_EXIT_ON_ERROR(ctx, cublasSger(h->h, M, N, &alpha, + ((float *)X->ptr) + offX, incX, + ((float *)Y->ptr) + offY, incY, + ((float *)A->ptr) + offA, lda)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); 
GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ)); @@ -1325,7 +1255,7 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) || LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) - return GA_XLARGE_ERROR; + return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); if (order == cb_c) { t = M; @@ -1348,16 +1278,10 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_ALL)); - h->err = cublasDger(h->h, M, N, &alpha, - ((double *)X->ptr) + offX, incX, - ((double *)Y->ptr) + offY, incY, - ((double *)A->ptr) + offA, lda); - if (h->err != CUBLAS_STATUS_SUCCESS) { - cuda_exit(ctx); - if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) - return GA_DEVSUP_ERROR; - return GA_BLAS_ERROR; - } + CUBLAS_EXIT_ON_ERROR(ctx, cublasDger(h->h, M, N, &alpha, + ((double *)X->ptr) + offX, incX, + ((double *)Y->ptr) + offY, incY, + ((double *)A->ptr) + offA, lda)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ)); @@ -1368,14 +1292,6 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, return GA_NO_ERROR; } -static int hgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, - gpudata **A, size_t *offA, size_t lda, - size_t batchCount, int flags) { - return GA_DEVSUP_ERROR; -} - static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, gpudata **x, size_t *offX, size_t incX, gpudata **y, size_t *offY, size_t incY, @@ -1389,8 +1305,11 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, gpudata *Aa, *xa, *ya; int err; - if (flags != 0) return GA_INVALID_ERROR; - if (batchCount == 0) return GA_NO_ERROR; + 
ASSERT_BUF(x[0]); + + ctx = x[0]->ctx; + + if (flags != 0) return error_set(ctx->err, GA_INVALID_ERROR, "flags is not 0"); if (incX == 1) { if (ls[0] > 32) { @@ -1432,10 +1351,6 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, y = T; } - ASSERT_BUF(x[0]); - - ctx = x[0]->ctx; - cuda_enter(ctx); { @@ -1457,21 +1372,21 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, } Aa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(float *) * batchCount, A_l, - GA_BUFFER_INIT, &err); + GA_BUFFER_INIT); if (Aa == NULL) - return err; + return ctx->err->code; xa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(float *) * batchCount, x_l, - GA_BUFFER_INIT, &err); + GA_BUFFER_INIT); if (xa == NULL) { cuda_ops.buffer_release(Aa); - return err; + return ctx->err->code; } ya = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(float *) * batchCount, y_l, - GA_BUFFER_INIT, &err); + GA_BUFFER_INIT); if (ya == NULL) { cuda_ops.buffer_release(Aa); cuda_ops.buffer_release(xa); - return err; + return ctx->err->code; } } @@ -1521,8 +1436,11 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, gpudata *Aa, *xa, *ya; int err; - if (flags != 0) return GA_INVALID_ERROR; - if (batchCount == 0) return GA_NO_ERROR; + ASSERT_BUF(x[0]); + + ctx = x[0]->ctx; + + if (flags != 0) return error_set(ctx->err, GA_INVALID_ERROR, "flags is not 0"); if (incX == 1) { if (ls[0] > 32) { @@ -1564,10 +1482,6 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, y = T; } - ASSERT_BUF(x[0]); - - ctx = x[0]->ctx; - cuda_enter(ctx); { @@ -1589,21 +1503,21 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, } Aa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount, A_l, - GA_BUFFER_INIT, &err); + GA_BUFFER_INIT); if (Aa == NULL) - return err; + return ctx->err->code; xa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount, x_l, - GA_BUFFER_INIT, &err); + GA_BUFFER_INIT); if 
(xa == NULL) { cuda_ops.buffer_release(Aa); - return err; + return ctx->err->code; } ya = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount, y_l, - GA_BUFFER_INIT, &err); + GA_BUFFER_INIT); if (ya == NULL) { cuda_ops.buffer_release(Aa); cuda_ops.buffer_release(xa); - return err; + return ctx->err->code; } } @@ -1629,7 +1543,6 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, return err; } - for (i = 0; i < batchCount; i++) { GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(x[i], CUDA_WAIT_READ)); @@ -1643,26 +1556,25 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, gpuarray_blas_ops cublas_ops = { setup, teardown, - error, - hdot, /* TODO */ + NULL, /* hdot */ sdot, ddot, - hgemv, /* TODO */ + NULL, /* hgemv */ sgemv, dgemv, hgemm, sgemm, dgemm, - hger, /* TODO */ + NULL, /* hger */ sger, dger, - hgemmBatch, /* TODO */ + NULL, /* hgemmBatch */ sgemmBatch, dgemmBatch, - hgemvBatch, /* TODO */ + NULL, /* hgemvBatch */ sgemvBatch, dgemvBatch, - hgerBatch, /* TODO */ + NULL, /* hgerBatch */ sgerBatch, dgerBatch }; diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c index 3fdc525e78..eb087c707f 100644 --- a/src/gpuarray_buffer_blas.c +++ b/src/gpuarray_buffer_blas.c @@ -14,18 +14,23 @@ void gpublas_teardown(gpucontext *ctx) { } const char *gpublas_error(gpucontext *ctx) { - if (ctx->blas_ops != NULL) - return ctx->blas_ops->error(ctx); - return "No blas ops available, API error."; + return ctx->err->msg; } +#define BLAS_OP(buf,name, args) \ + gpucontext *ctx = gpudata_context(buf); \ + if (ctx->blas_ops->name) \ + return ctx->blas_ops->name args; \ + else \ + return error_fmt(ctx->err, GA_DEVSUP_ERROR, "Blas operation not supported by device or missing library: %s", #name) + + int gpublas_hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { - return 
gpudata_context(X)->blas_ops->hdot( - N, X, offX, incX, Y, offY, incY, Z, offZ); + BLAS_OP(X, hdot, (N, X, offX, incX, Y, offY, incY, Z, offZ)); } int gpublas_sdot( @@ -33,8 +38,7 @@ int gpublas_sdot( gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { - return gpudata_context(X)->blas_ops->sdot( - N, X, offX, incX, Y, offY, incY, Z, offZ); + BLAS_OP(X, sdot, (N, X, offX, incX, Y, offY, incY, Z, offZ)); } int gpublas_ddot( @@ -42,8 +46,7 @@ int gpublas_ddot( gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { - return gpudata_context(X)->blas_ops->ddot( - N, X, offX, incX, Y, offY, incY, Z, offZ); + BLAS_OP(X, ddot, (N, X, offX, incX, Y, offY, incY, Z, offZ)); } int gpublas_hgemv(cb_order order, cb_transpose transA, @@ -52,9 +55,8 @@ int gpublas_hgemv(cb_order order, cb_transpose transA, gpudata *X, size_t offX, int incX, float beta, gpudata *Y, size_t offY, int incY) { - return gpudata_context(A)->blas_ops->hgemv( - order, transA, M, N, alpha, A, offA, lda, - X, offX, incX, beta, Y, offY, incY); + BLAS_OP(A, hgemv, (order, transA, M, N, alpha, A, offA, lda, + X, offX, incX, beta, Y, offY, incY)); } int gpublas_sgemv(cb_order order, cb_transpose transA, @@ -63,9 +65,8 @@ int gpublas_sgemv(cb_order order, cb_transpose transA, gpudata *X, size_t offX, int incX, float beta, gpudata *Y, size_t offY, int incY) { - return gpudata_context(A)->blas_ops->sgemv( - order, transA, M, N, alpha, A, offA, lda, - X, offX, incX, beta, Y, offY, incY); + BLAS_OP(A, sgemv, (order, transA, M, N, alpha, A, offA, lda, + X, offX, incX, beta, Y, offY, incY)); } int gpublas_dgemv(cb_order order, cb_transpose transA, @@ -74,9 +75,8 @@ int gpublas_dgemv(cb_order order, cb_transpose transA, gpudata *X, size_t offX, int incX, double beta, gpudata *Y, size_t offY, int incY) { - return gpudata_context(A)->blas_ops->dgemv( - order, transA, M, N, alpha, A, offA, lda, - X, offX, incX, beta, Y, offY, 
incY); + BLAS_OP(A, dgemv, (order, transA, M, N, alpha, A, offA, lda, + X, offX, incX, beta, Y, offY, incY)); } int gpublas_hgemm(cb_order order, cb_transpose transA, cb_transpose transB, @@ -84,9 +84,8 @@ int gpublas_hgemm(cb_order order, cb_transpose transA, cb_transpose transB, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, float beta, gpudata *C, size_t offC, size_t ldc) { - return gpudata_context(A)->blas_ops->hgemm( - order, transA, transB, M, N, K, alpha, A, offA, lda, - B, offB, ldb, beta, C, offC, ldc); + BLAS_OP(A, hgemm, (order, transA, transB, M, N, K, alpha, A, offA, lda, + B, offB, ldb, beta, C, offC, ldc)); } int gpublas_sgemm(cb_order order, cb_transpose transA, cb_transpose transB, @@ -94,9 +93,8 @@ int gpublas_sgemm(cb_order order, cb_transpose transA, cb_transpose transB, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, float beta, gpudata *C, size_t offC, size_t ldc) { - return gpudata_context(A)->blas_ops->sgemm( - order, transA, transB, M, N, K, alpha, A, offA, lda, - B, offB, ldb, beta, C, offC, ldc); + BLAS_OP(A, sgemm, (order, transA, transB, M, N, K, alpha, A, offA, lda, + B, offB, ldb, beta, C, offC, ldc)); } int gpublas_dgemm(cb_order order, cb_transpose transA, cb_transpose transB, @@ -104,47 +102,63 @@ int gpublas_dgemm(cb_order order, cb_transpose transA, cb_transpose transB, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, double beta, gpudata *C, size_t offC, size_t ldc) { - return gpudata_context(A)->blas_ops->dgemm( - order, transA, transB, M, N, K, alpha, A, offA, lda, - B, offB, ldb, beta, C, offC, ldc); + BLAS_OP(A, dgemm, (order, transA, transB, M, N, K, alpha, A, offA, lda, + B, offB, ldb, beta, C, offC, ldc)); } int gpublas_hger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { - return gpudata_context(X)->blas_ops->hger( - order, M, N, alpha, 
X, offX, incX, Y, offY, incY, A, offA, lda); + BLAS_OP(X, hger, + (order, M, N, alpha, X, offX, incX, Y, offY, incY, A, offA, lda)); } int gpublas_sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { - return gpudata_context(X)->blas_ops->sger( - order, M, N, alpha, X, offX, incX, Y, offY, incY, A, offA, lda); + BLAS_OP(X, sger, + (order, M, N, alpha, X, offX, incX, Y, offY, incY, A, offA, lda)); } int gpublas_dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { - return gpudata_context(X)->blas_ops->dger( - order, M, N, alpha, X, offX, incX, Y, offY, incY, A, offA, lda); -} + BLAS_OP(X, dger, + (order, M, N, alpha, X, offX, incX, Y, offY, incY, A, offA, lda)); +} + +#define BLAS_OPB(l, name, args) \ + gpucontext *ctx; \ + if (batchCount == 0) return GA_NO_ERROR; \ + ctx = gpudata_context(l[0]); \ + if (ctx->blas_ops->name) \ + return ctx->blas_ops->name args; \ + else \ + return error_fmt(ctx->err, GA_DEVSUP_ERROR, "Blas operation not supported by library in use: %s", #name) + +#define BLAS_OPBF(l, name, args) \ + gpucontext *ctx; \ + if (batchCount == 0) return GA_NO_ERROR; \ + ctx = gpudata_context(l[0]); \ + if (flags != 0) return error_set(ctx->err, GA_INVALID_ERROR, "flags is not 0"); \ + if (ctx->blas_ops->name) \ + return ctx->blas_ops->name args; \ + else \ + return error_fmt(ctx->err, GA_DEVSUP_ERROR, "Blas operation not supported by library in use: %s", #name) int gpublas_hgemmBatch( - cb_order order, cb_transpose transA, cb_transpose transB, - size_t M, size_t N, size_t K, float alpha, - gpudata **A, size_t *offA, size_t lda, - gpudata **B, size_t *offB, size_t ldb, - float beta, gpudata **C, size_t *offC, size_t ldc, - size_t batchCount, int flags) { - if (flags != 0) return GA_INVALID_ERROR; - if (batchCount == 0) return GA_NO_ERROR; 
- return gpudata_context(A[0])->blas_ops->hgemmBatch( - order, transA, transB, M, N, K, alpha, A, offA, lda, - B, offB, ldb, beta, C, offC, ldc, batchCount); + cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, float alpha, + gpudata **A, size_t *offA, size_t lda, + gpudata **B, size_t *offB, size_t ldb, + float beta, gpudata **C, size_t *offC, size_t ldc, + size_t batchCount, int flags) { + BLAS_OPBF(A, hgemmBatch, + (order, transA, transB, M, N, K, alpha, A, offA, lda, + B, offB, ldb, beta, C, offC, ldc, batchCount)); } int gpublas_sgemmBatch( @@ -154,11 +168,9 @@ int gpublas_sgemmBatch( gpudata **B, size_t *offB, size_t ldb, float beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount, int flags) { - if (flags != 0) return GA_INVALID_ERROR; - if (batchCount == 0) return GA_NO_ERROR; - return gpudata_context(A[0])->blas_ops->sgemmBatch( - order, transA, transB, M, N, K, alpha, A, offA, lda, - B, offB, ldb, beta, C, offC, ldc, batchCount); + BLAS_OPBF(A, sgemmBatch, + (order, transA, transB, M, N, K, alpha, A, offA, lda, + B, offB, ldb, beta, C, offC, ldc, batchCount)); } int gpublas_dgemmBatch( @@ -168,11 +180,9 @@ int gpublas_dgemmBatch( gpudata **B, size_t *offB, size_t ldb, double beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount, int flags) { - if (flags != 0) return GA_INVALID_ERROR; - if (batchCount == 0) return GA_NO_ERROR; - return gpudata_context(A[0])->blas_ops->dgemmBatch( - order, transA, transB, M, N, K, alpha, A, offA, lda, - B, offB, ldb, beta, C, offC, ldc, batchCount); + BLAS_OPBF(A, dgemmBatch, + (order, transA, transB, M, N, K, alpha, A, offA, lda, + B, offB, ldb, beta, C, offC, ldc, batchCount)); } int gpublas_hgemvBatch( @@ -182,10 +192,9 @@ int gpublas_hgemvBatch( gpudata **x, size_t *offX, size_t incX, float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { - if (batchCount == 0) return GA_NO_ERROR; - return gpudata_context(A[0])->blas_ops->hgemvBatch( - 
order, transA, M, N, alpha, A, offA, lda, x, offX, incX, - beta, y, offY, incY, batchCount, flags); + BLAS_OPB(A, hgemvBatch, + (order, transA, M, N, alpha, A, offA, lda, x, offX, incX, + beta, y, offY, incY, batchCount, flags)); } int gpublas_sgemvBatch( @@ -195,10 +204,9 @@ int gpublas_sgemvBatch( gpudata **x, size_t *offX, size_t incX, float beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { - if (batchCount == 0) return GA_NO_ERROR; - return gpudata_context(A[0])->blas_ops->sgemvBatch( - order, transA, M, N, alpha, A, offA, lda, x, offX, incX, - beta, y, offY, incY, batchCount, flags); + BLAS_OPB(A, sgemvBatch, + (order, transA, M, N, alpha, A, offA, lda, x, offX, incX, + beta, y, offY, incY, batchCount, flags)); } int gpublas_dgemvBatch( @@ -208,10 +216,9 @@ int gpublas_dgemvBatch( gpudata **x, size_t *offX, size_t incX, double beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { - if (batchCount == 0) return GA_NO_ERROR; - return gpudata_context(A[0])->blas_ops->dgemvBatch( - order, transA, M, N, alpha, A, offA, lda, x, offX, incX, - beta, y, offY, incY, batchCount, flags); + BLAS_OPB(A, dgemvBatch, + (order, transA, M, N, alpha, A, offA, lda, x, offX, incX, + beta, y, offY, incY, batchCount, flags)); } int gpublas_hgerBatch(cb_order order, size_t M, size_t N, float alpha, @@ -219,10 +226,9 @@ int gpublas_hgerBatch(cb_order order, size_t M, size_t N, float alpha, gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { - if (batchCount == 0) return GA_NO_ERROR; - return gpudata_context(x[0])->blas_ops->hgerBatch( - order, M, N, alpha, x, offX, incX, y, offY, incY, - A, offA, lda, batchCount, flags); + BLAS_OPB(x, hgerBatch, + (order, M, N, alpha, x, offX, incX, y, offY, incY, + A, offA, lda, batchCount, flags)); } int gpublas_sgerBatch(cb_order order, size_t M, size_t N, float alpha, @@ -230,10 +236,9 @@ int gpublas_sgerBatch(cb_order order, size_t M, 
size_t N, float alpha, gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { - if (batchCount == 0) return GA_NO_ERROR; - return gpudata_context(x[0])->blas_ops->sgerBatch( - order, M, N, alpha, x, offX, incX, y, offY, incY, - A, offA, lda, batchCount, flags); + BLAS_OPB(x, sgerBatch, + (order, M, N, alpha, x, offX, incX, y, offY, incY, + A, offA, lda, batchCount, flags)); } int gpublas_dgerBatch(cb_order order, size_t M, size_t N, double alpha, @@ -241,8 +246,7 @@ int gpublas_dgerBatch(cb_order order, size_t M, size_t N, double alpha, gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { - if (batchCount == 0) return GA_NO_ERROR; - return gpudata_context(x[0])->blas_ops->dgerBatch( - order, M, N, alpha, x, offX, incX, y, offY, incY, - A, offA, lda, batchCount, flags); + BLAS_OPB(x, dgerBatch, + (order, M, N, alpha, x, offX, incX, y, offY, incY, + A, offA, lda, batchCount, flags)); } diff --git a/src/private.h b/src/private.h index 27258d3d23..9c24af45d0 100644 --- a/src/private.h +++ b/src/private.h @@ -83,8 +83,7 @@ struct _gpuarray_buffer_ops { int (*get_device_count)(unsigned int platform, unsigned int* devcount); gpucontext *(*buffer_init)(int dev, int flags); void (*buffer_deinit)(gpucontext *ctx); - gpudata *(*buffer_alloc)(gpucontext *ctx, size_t sz, void *data, int flags, - int *ret); + gpudata *(*buffer_alloc)(gpucontext *ctx, size_t sz, void *data, int flags); void (*buffer_retain)(gpudata *b); void (*buffer_release)(gpudata *b); int (*buffer_share)(gpudata *a, gpudata *b, int *ret); @@ -117,7 +116,6 @@ struct _gpuarray_buffer_ops { struct _gpuarray_blas_ops { int (*setup)(gpucontext *ctx); void (*teardown)(gpucontext *ctx); - const char *(*error)(gpucontext *ctx); int (*hdot)( size_t N, gpudata *X, size_t offX, size_t incX, From ad9173ece44e5cecdd1c12c6e4593668bcdab127 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 23 Mar 2017 
18:56:08 -0400 Subject: [PATCH 257/597] Fix all the compile problems for the cuda stuff. --- Makefile | 2 +- src/cache.h | 3 +- src/cache/disk.c | 20 +++- src/cache/twoq.c | 2 + src/gpuarray/buffer.h | 2 +- src/gpuarray_array.c | 3 +- src/gpuarray_blas_cuda_cublas.c | 20 ++-- src/gpuarray_buffer.c | 8 +- src/gpuarray_buffer_collectives.c | 5 +- src/gpuarray_buffer_cuda.c | 96 ++++++++-------- src/gpuarray_collectives_cuda_nccl.c | 162 +++++++++++++-------------- src/loaders/libnvrtc.c | 4 +- src/loaders/libnvrtc.fn | 15 +-- src/loaders/libnvrtc.h | 4 +- src/private.h | 7 +- src/util/error.c | 4 +- src/util/error.h | 11 +- 17 files changed, 198 insertions(+), 170 deletions(-) diff --git a/Makefile b/Makefile index e0cb7d3bf8..915254727b 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ debug: install-debugc py .PHONY: install-debugc py debug install-relc rel config -Debug/Makefile: Debug Makefile.conf +Debug/Makefile: Makefile.conf mkdir -p Debug ifndef INSTALL_PREFIX (cd Debug && NUM_DEVS=${NUM_DEVS} DEV_NAMES=${DEV_NAMES} cmake .. 
-DCMAKE_BUILD_TYPE=Debug) diff --git a/src/cache.h b/src/cache.h index e291aca67b..47901c2beb 100644 --- a/src/cache.h +++ b/src/cache.h @@ -89,7 +89,8 @@ cache *cache_twoq(size_t hot_size, size_t warm_size, cache *cache_disk(const char *dirpath, cache *mem, kwrite_fn kwrite, vwrite_fn vwrite, - kread_fn kread, vread_fn vread); + kread_fn kread, vread_fn vread, + error *e); /* API functions */ static inline int cache_add(cache *c, cache_key_t k, cache_value_t v) { diff --git a/src/cache/disk.c b/src/cache/disk.c index 601690a421..7297dade4d 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -397,7 +397,7 @@ static void disk_destroy(cache *_c) { cache *cache_disk(const char *dirpath, cache *mem, kwrite_fn kwrite, vwrite_fn vwrite, - kread_fn kread, vread_fn vread) { + kread_fn kread, vread_fn vread, error *e) { struct stat st; disk_cache *res; char *dirp; @@ -414,7 +414,10 @@ cache *cache_disk(const char *dirpath, cache *mem, dirp = malloc(dirl + 1); /* With the NUL */ - if (dirp == NULL) return NULL; + if (dirp == NULL) { + error_sys(e, "malloc"); + return NULL; + } strlcpy(dirp, dirpath, dirl + 1); @@ -425,6 +428,7 @@ cache *cache_disk(const char *dirpath, cache *mem, if (ensurep(NULL, dirp) != 0) { free(dirp); + error_sys(e, "ensurep"); return NULL; } @@ -433,18 +437,24 @@ cache *cache_disk(const char *dirpath, cache *mem, mkdir(dirp, 0777); /* This may fail, but it's ok */ - if (lstat(dirp, &st) != 0) + if (lstat(dirp, &st) != 0) { + error_sys(e, "lstat"); return NULL; + } /* Restore the good path at the end */ dirp[dirl - 1] = sep; - if (!(st.st_mode & S_IFDIR)) + if (!(st.st_mode & S_IFDIR)) { + error_set(e, GA_SYS_ERROR, "Cache path exists but is not a directory"); return NULL; + } res = calloc(sizeof(*res), 1); - if (res == NULL) + if (res == NULL) { + error_sys(e, "calloc"); return NULL; + } res->dirp = dirp; res->mem = mem; diff --git a/src/cache/twoq.c b/src/cache/twoq.c index 7ff85344fc..4f14e557b7 100644 --- a/src/cache/twoq.c +++ 
b/src/cache/twoq.c @@ -1,6 +1,8 @@ #include #include +#include + #include "cache.h" #include "private_config.h" diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index 800756a072..0a4c921808 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -500,7 +500,7 @@ GPUARRAY_PUBLIC int gpukernel_call(gpukernel *k, unsigned int n, * * This can be use to cache kernel binaries after compilation of a * specific device. The kernel can be recreated by calling - * kernel_alloc with the binary and size and passing `GA_USE_BINARY` + * gpukernel_alloc with the binary and size and passing `GA_USE_BINARY` * as the use flags. * * The returned pointer is allocated and must be freed by the caller. diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index 45a2a1186e..069cdd3e02 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -71,7 +71,8 @@ static int ga_extcopy(GpuArray *dst, const GpuArray *src) { if (ctx->extcopy_cache == NULL) ctx->extcopy_cache = cache_twoq(4, 8, 8, 2, extcopy_eq, extcopy_hash, extcopy_free, - (cache_freev_fn)GpuElemwise_free); + (cache_freev_fn)GpuElemwise_free, + ctx->err); if (ctx->extcopy_cache == NULL) return GA_MISC_ERROR; if (cache_add(ctx->extcopy_cache, aa, k) != 0) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 12b7c4ec55..c207f84a33 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -24,7 +24,7 @@ static inline cublasOperation_t convT(cb_transpose trans) { } } -static const char *error(cublasStatus_t err) { +static const char *estr(cublasStatus_t err) { switch (err) { case CUBLAS_STATUS_SUCCESS: return "(cublas) Operation completed successfully."; @@ -53,12 +53,12 @@ static const char *error(cublasStatus_t err) { static inline int error_cublas(error *e, const char *msg, cublasStatus_t err) { return error_fmt(e, (err == CUBLAS_STATUS_ARCH_MISMATCH) ? 
GA_DEVSUP_ERROR : GA_BLAS_ERROR, - "%s: %s", msg, error(err)); + "%s: %s", msg, estr(err)); } #define CUBLAS_EXIT_ON_ERROR(ctx, cmd) do { \ cublasStatus_t err = (cmd); \ - if (err != CUBLAS_SUCCESS) { \ + if (err != CUBLAS_STATUS_SUCCESS) { \ cuda_exit(ctx); \ return error_cublas((ctx)->err, #cmd, err); \ } \ @@ -525,13 +525,14 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, const size_t threshold = 650; cb_transpose transT; + ASSERT_BUF(A[0]); + ctx = A[0]->ctx; + if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); - ASSERT_BUF(A[0]); - ctx = A[0]->ctx; h = (blas_handle *)ctx->blas_handle; cuda_enter(ctx); @@ -623,7 +624,7 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, gpudata_release(Ta); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); - return error_cublas(ctx, "cublasSgemmBatched", err); + return error_cublas(ctx->err, "cublasSgemmBatched", err); } for (i = 0; i < batchCount; i++) { @@ -651,13 +652,14 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, const size_t threshold = 650; cb_transpose transT; + ASSERT_BUF(A[0]); + ctx = A[0]->ctx; + if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); - ASSERT_BUF(A[0]); - ctx = A[0]->ctx; h = (blas_handle *)ctx->blas_handle; cuda_enter(ctx); @@ -697,7 +699,7 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, (double*)A[i]->ptr + offA[i], lda, (double*)B[i]->ptr + offB[i], ldb, &beta, - (double*)C[i]->ptr + offC[i], ldc); + (double*)C[i]->ptr + 
offC[i], ldc)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B[i], CUDA_WAIT_READ)); diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index 072dba9913..d6f11289a4 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -163,8 +163,12 @@ gpukernel *gpukernel_init(gpucontext *ctx, unsigned int count, const char *fname, unsigned int numargs, const int *typecodes, int flags, int *ret, char **err_str) { - return ctx->ops->kernel_alloc(ctx, count, strings, lengths, fname, numargs, - typecodes, flags, ret, err_str); + gpukernel *res; + res = ctx->ops->kernel_alloc(ctx, count, strings, lengths, fname, numargs, + typecodes, flags, err_str); + if (res == NULL && ret) + *ret = ctx->err->code; + return res; } void gpukernel_retain(gpukernel *k) { diff --git a/src/gpuarray_buffer_collectives.c b/src/gpuarray_buffer_collectives.c index 8f33d70472..38e731286e 100644 --- a/src/gpuarray_buffer_collectives.c +++ b/src/gpuarray_buffer_collectives.c @@ -22,10 +22,7 @@ void gpucomm_free(gpucomm* comm) { } const char* gpucomm_error(gpucontext* ctx) { - if (ctx->comm_ops != NULL) - return ctx->error->msg; - return "No collective ops available, API error. 
Is a collectives library " - "installed?"; + return ctx->err->msg; } gpucontext* gpucomm_context(gpucomm* comm) { diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 202fb39570..33bbf7f31d 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -129,8 +129,9 @@ static int setup_done = 0; static int major = -1; static int minor = -1; static int setup_lib(void) { - int res, tmp; const char *ver; + CUresult err; + int res, tmp; if (!setup_done) { res = load_libcuda(global_err); @@ -167,6 +168,7 @@ static int cuda_get_platform_count(unsigned int* platcount) { static int cuda_get_device_count(unsigned int platform, unsigned int* devcount) { + CUresult err; int dv; // platform number gets ignored in CUDA implementation GA_CHECK(setup_lib()); @@ -203,7 +205,7 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { res->minor = minor; res->freeblocks = NULL; if (error_alloc(&res->err)) { - error_sets(global_err, GA_SYS_ERROR, "Could not create error context"); + error_set(global_err, GA_SYS_ERROR, "Could not create error context"); goto fail_errmsg; } if (detect_arch(ARCH_PREFIX, res->bin_id, global_err)) { @@ -213,7 +215,7 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { libraries that may do stuff on the NULL stream */ err = cuStreamCreate(&res->s, 0); if (err != CUDA_SUCCESS) { - error_cuda(global_err, "cuStreamCreate", err) + error_cuda(global_err, "cuStreamCreate", err); goto fail_stream; } if (ISSET(res->flags, GA_CTX_SINGLE_STREAM)) { @@ -223,7 +225,7 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { libraries that may do stuff on the NULL stream */ err = cuStreamCreate(&res->mem_s, 0); if (err != CUDA_SUCCESS) { - error_cuda(global_err, "cuStreamCreate", err) + error_cuda(global_err, "cuStreamCreate", err); goto fail_mem_stream; } } @@ -234,7 +236,7 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { (cache_freek_fn)strb_free, (cache_freev_fn)cuda_freekernel, global_err); if (res->kernel_cache 
== NULL) - error_cuda(global_err, "cuStreamCreate", err) + error_cuda(global_err, "cuStreamCreate", err); goto fail_cache; cache_path = getenv("GPUARRAY_CACHE_PATH"); @@ -243,17 +245,19 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { (cache_eq_fn)key_eq, (cache_hash_fn)key_hash, (cache_freek_fn)key_free, - (cache_freev_fn)strb_free); + (cache_freev_fn)strb_free, + res->err); if (mem_cache == NULL) { - // TODO use better error messages when they are available. - fprintf(stderr, "Error initializing disk cache, disabling\n"); + fprintf(stderr, "Error initializing mem cache for disk: %s\n", + res->err->msg); goto fail_disk_cache; } res->disk_cache = cache_disk(cache_path, mem_cache, (kwrite_fn)key_write, (vwrite_fn)kernel_write, (kread_fn)key_read, - (vread_fn)kernel_read); + (vread_fn)kernel_read, + res->err); if (res->disk_cache == NULL) { // TODO use better error messages when they are available. fprintf(stderr, "Error initializing disk cache, disabling\n"); @@ -267,13 +271,13 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { err = cuMemAllocHost(&p, 16); if (err != CUDA_SUCCESS) { - error_cuda(e, "cuMemAllocHost", err); + error_cuda(global_err, "cuMemAllocHost", err); goto fail_errbuf; } memset(p, 0, 16); /* Need to tag for new_gpudata */ TAG_CTX(res); - res->errbuf = new_gpudata(res, (CUdeviceptr)p, 16, e); + res->errbuf = new_gpudata(res, (CUdeviceptr)p, 16); if (res->errbuf == NULL) { /* Copy the error from the context since we are getting rid of it */ error_set(global_err, res->err->code, res->err->msg); @@ -426,7 +430,7 @@ gpudata *cuda_make_buf(cuda_context *ctx, CUdeviceptr p, size_t sz) { size_t cuda_get_sz(gpudata *g) { ASSERT_BUF(g); return g->sz; } -#define CHKFAIL(n, v) \ +#define CHKFAIL(e, n, v) \ if (err != CUDA_SUCCESS) { \ error_cuda(e, n, err); \ return v; \ @@ -498,13 +502,13 @@ static cuda_context *do_init(CUdevice dev, int flags, error *e) { if (flags & GA_CTX_MULTI_THREAD) fl = CU_CTX_SCHED_YIELD; err = 
cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev); - CHKFAIL("cuDeviceGetAttribute", NULL); + CHKFAIL(e, "cuDeviceGetAttribute", NULL); if (i != 1) { error_set(e, GA_UNSUPPORTED_ERROR, "device does not support unified addressing"); return NULL; } err = cuDevicePrimaryCtxGetState(dev, &cur_fl, &act); - CHKFAIL("cuDevicePrimaryCtxGetState", NULL); + CHKFAIL(e, "cuDevicePrimaryCtxGetState", NULL); if (act == 1) { if ((cur_fl & fl) != fl) { error_set(e, GA_INVALID_ERROR, "device is already active and has unsupported flags"); @@ -512,12 +516,12 @@ static cuda_context *do_init(CUdevice dev, int flags, error *e) { } } else { err = cuDevicePrimaryCtxSetFlags(dev, fl); - CHKFAIL("cuDevicePrimaryCtxSetFlags", NULL); + CHKFAIL(e, "cuDevicePrimaryCtxSetFlags", NULL); } err = cuDevicePrimaryCtxRetain(&ctx, dev); - CHKFAIL("cuDevicePrimaryCtxRetain", NULL); + CHKFAIL(e, "cuDevicePrimaryCtxRetain", NULL); err = cuCtxPushCurrent(ctx); - CHKFAIL("cuCtxPushCurrent", NULL); + CHKFAIL(e, "cuCtxPushCurrent", NULL); res = cuda_make_ctx(ctx, flags); if (res == NULL) { cuDevicePrimaryCtxRelease(dev); @@ -545,11 +549,11 @@ static gpucontext *cuda_init(int ord, int flags) { if (ord == -1) { int i, c; err = cuDeviceGetCount(&c); - CHKFAIL("cuDeviceGetCount", NULL); + CHKFAIL(global_err, "cuDeviceGetCount", NULL); for (i = 0; i < c; i++) { err = cuDeviceGet(&dev, i); - CHKFAIL("cuDeviceGet", NULL); - res = do_init(dev, flags, NULL); + CHKFAIL(global_err, "cuDeviceGet", NULL); + res = do_init(dev, flags, global_err); if (res != NULL) return (gpucontext *)res; } @@ -557,8 +561,8 @@ static gpucontext *cuda_init(int ord, int flags) { return NULL; } else { err = cuDeviceGet(&dev, ord); - CHKFAIL("cuDeviceGet", NULL); - return (gpucontext *)do_init(dev, flags, ret); + CHKFAIL(global_err, "cuDeviceGet", NULL); + return (gpucontext *)do_init(dev, flags, global_err); } } static void cuda_deinit(gpucontext *c) { @@ -763,7 +767,8 @@ gpudata *cuda_open_ipc_handle(gpucontext *c, 
GpuArrayIpcMemHandle *h, size_t sz) CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); if (err != CUDA_SUCCESS) { cuda_exit(ctx); - return error_cuda(ctx->err, "cuIpcOpenMemHandle", err); + error_cuda(ctx->err, "cuIpcOpenMemHandle", err); + return NULL; } d = cuda_make_buf(ctx, p, sz); if (d != NULL) @@ -1149,14 +1154,14 @@ static int make_bin(cuda_context *ctx, const strb *ptx, strb *bin, strb *log) { cujit_opts, cujit_opt_vals, &st); if (ctx->err != CUDA_SUCCESS) return error_cuda(ctx->err, "cuLinkCreate", err); - ctx->err = cuLinkAddData(st, CU_JIT_INPUT_PTX, ptx->s, ptx->l, + err = cuLinkAddData(st, CU_JIT_INPUT_PTX, ptx->s, ptx->l, "kernel code", 0, NULL, NULL); - if (ctx->err != CUDA_SUCCESS) { + if (err != CUDA_SUCCESS) { res = error_cuda(ctx->err, "cuLinkAddData", err); goto out; } - ctx->err = cuLinkComplete(st, &out, &out_size); - if (ctx->err != CUDA_SUCCESS) { + err = cuLinkComplete(st, &out, &out_size); + if (err != CUDA_SUCCESS) { res = error_cuda(ctx->err, "cuLinkComplete", err); goto out; } @@ -1176,7 +1181,6 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { strb *cbin; kernel_key k; kernel_key *pk; - int err; memset(&k, 0, sizeof(k)); k.version = 0; @@ -1204,29 +1208,33 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { if (ctx->disk_cache) { pk = calloc(sizeof(kernel_key), 1); if (pk == NULL) { - // TODO use better error messages - fprintf(stderr, "Error adding kernel to disk cache\n"); + error_sys(ctx->err, "calloc"); + fprintf(stderr, "Error adding kernel to disk cache: %s\n", + ctx->err->msg); return GA_NO_ERROR; } memcpy(pk, &k, KERNEL_KEY_MM); strb_appendb(&pk->src, src); if (strb_error(&pk->src)) { - // TODO use better error messages - fprintf(stderr, "Error adding kernel to disk cache\n"); + error_sys(ctx->err, "strb_appendb"); + fprintf(stderr, "Error adding kernel to disk cache %s\n", + ctx->err->msg); key_free((cache_key_t)pk); return GA_NO_ERROR; } cbin = strb_alloc(bin->l); if (cbin == NULL) { - // 
TODO use better error messages - fprintf(stderr, "Error adding kernel to disk cache\n"); + error_sys(ctx->err, "strb_alloc"); + fprintf(stderr, "Error adding kernel to disk cache: %s\n", + ctx->err->msg); key_free((cache_key_t)pk); return GA_NO_ERROR; } strb_appendb(cbin, bin); if (strb_error(cbin)) { - // TODO use better error messages - fprintf(stderr, "Error adding kernel to disk cache\n"); + error_sys(ctx->err, "strb_appendb"); + fprintf(stderr, "Error adding kernel to disk cache %s\n", + ctx->err->msg); key_free((cache_key_t)pk); strb_free(cbin); return GA_NO_ERROR; @@ -1292,7 +1300,7 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, } if (get_cc(dev, &major, &minor, ctx->err) != GA_NO_ERROR) - return ctx->err->code; + return NULL; // GA_USE_CLUDA is done later // GA_USE_SMALL will always work @@ -1371,7 +1379,7 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, res = calloc(1, sizeof(*res)); if (res == NULL) { - error_sys(ctx->err, "calloc") + error_sys(ctx->err, "calloc"); strb_clear(&src); strb_clear(&bin); cuda_exit(ctx); @@ -1491,7 +1499,7 @@ static int cuda_callkernel(gpukernel *k, unsigned int n, break; default: cuda_exit(ctx); - return error_set(ctx->err, GA_VALUE_ERROR, "Call with more than 3 dimensions") + return error_set(ctx->err, GA_VALUE_ERROR, "Call with more than 3 dimensions"); } for (i = 0; i < k->argcount; i++) { @@ -1643,7 +1651,7 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, return GA_NO_ERROR; case GA_CTX_PROP_NUMPROCS: - GETPROP(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, unsigned int) + GETPROP(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, unsigned int); return GA_NO_ERROR; case GA_CTX_PROP_MAXGSIZE: @@ -1675,7 +1683,7 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, case GA_CTX_PROP_FREE_GMEM: cuda_enter(ctx); - CUDA_EXIT_ON_ERROR(cuMemGetInfo((size_t *)res, &sz)); + CUDA_EXIT_ON_ERROR(ctx, cuMemGetInfo((size_t *)res, &sz)); 
cuda_exit(ctx); return GA_NO_ERROR; @@ -1723,15 +1731,15 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, case GA_KERNEL_PROP_MAXLSIZE: cuda_enter(ctx); - CUDA_EXIT_ON_ERROR(cuFuncGetAttribute(&i, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, k->k)); + CUDA_EXIT_ON_ERROR(ctx, cuFuncGetAttribute(&i, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, k->k)); cuda_exit(ctx); *((size_t *)res) = i; return GA_NO_ERROR; case GA_KERNEL_PROP_PREFLSIZE: cuda_enter(ctx); - CUDA_EXIT_ON_ERROR(cuCtxGetDevice(&id)); - CUDA_EXIT_ON_ERROR(cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_WARP_SIZE, id)); + CUDA_EXIT_ON_ERROR(ctx, cuCtxGetDevice(&id)); + CUDA_EXIT_ON_ERROR(ctx, cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_WARP_SIZE, id)); cuda_exit(ctx); *((size_t *)res) = i; return GA_NO_ERROR; diff --git a/src/gpuarray_collectives_cuda_nccl.c b/src/gpuarray_collectives_cuda_nccl.c index 2713dd1a75..dc5a901ab7 100644 --- a/src/gpuarray_collectives_cuda_nccl.c +++ b/src/gpuarray_collectives_cuda_nccl.c @@ -12,17 +12,21 @@ #include "private.h" #include "private_cuda.h" +static inline int error_nccl(error *e, const char *msg, ncclResult_t err) { + return error_fmt(e, GA_COMM_ERROR, "%s: %s", msg, ncclGetErrorString(err)); +} + /** * Execute `cmd` and return appropriate code. Save a describing error message in * context. */ -#define NCCL_CHKFAIL(ctx, cmd) \ - do { \ - ncclResult_t nccl_err = (cmd); \ - if (nccl_err != ncclSuccess) { \ - return error_sets((ctx)->msg, GA_COMM_ERROR, ncclGetErrorString(nccl_err)); \ - } \ - return GA_NO_ERROR; \ +#define NCCL_CHKFAIL(ctx, cmd) \ + do { \ + ncclResult_t err = (cmd); \ + if (err != ncclSuccess) { \ + return error_nccl((ctx)->err, #cmd, err); \ + } \ + return GA_NO_ERROR; \ } while (0) /** @@ -30,13 +34,13 @@ * context. Exit from context and return \ref GA_COMM_ERROR if nccl does not * succeed. 
*/ -#define NCCL_EXIT_ON_ERROR(ctx, cmd) \ - do { \ - ncclResult_t nccl_err = (cmd); \ - if (nccl_err != ncclSuccess) { \ - cuda_exit((ctx)); \ - return error_sets((ctx)->msg, GA_COMM_ERROR, ncclGetErrorString(nccl_err)); \ - } \ +#define NCCL_EXIT_ON_ERROR(ctx, cmd) \ + do { \ + ncclResult_t err = (cmd); \ + if (err != ncclSuccess) { \ + cuda_exit((ctx)); \ + return error_nccl((ctx)->err, #cmd, err); \ + } \ } while (0) //!< Link wrapped cuda core operations @@ -45,8 +49,6 @@ extern const gpuarray_buffer_ops cuda_ops; /** * Definition of struct _gpucomm * - * Done here in order to avoid ifdefs concerning nccl's existance in core code. - * * \note This must be the only "module" which manages the definition's contents. */ struct _gpucomm { @@ -59,13 +61,10 @@ struct _gpucomm { static int setup_done = 0; -static int setup_lib(void) { - int err; +static int setup_lib(error *e) { if (setup_done) return GA_NO_ERROR; - err = load_libnccl(); - if (err != GA_NO_ERROR) - return err; + GA_CHECK(load_libnccl(e)); setup_done = 1; return GA_NO_ERROR; } @@ -73,8 +72,8 @@ static int setup_lib(void) { /** * \brief Helper function to dereference a `comm`'s context and free memory */ -static void comm_clear(gpucomm* comm) { - cuda_ops.buffer_deinit((gpucontext*)comm->ctx); +static void comm_clear(gpucomm *comm) { + gpucontext_deref((gpucontext *)comm->ctx); CLEAR(comm); free(comm); } @@ -82,31 +81,31 @@ static void comm_clear(gpucomm* comm) { /** * \brief NCCL implementation of \ref gpucomm_new. 
*/ -static int comm_new(gpucomm** comm_ptr, gpucontext* ctx, +static int comm_new(gpucomm **comm_ptr, gpucontext *ctx, gpucommCliqueId comm_id, int ndev, int rank) { - gpucomm* comm; - ncclResult_t nccl_err; + gpucomm *comm; + ncclResult_t err; ASSERT_CTX(ctx); - GA_CHECK(setup_lib()); + GA_CHECK(setup_lib(ctx->err)); comm = calloc(1, sizeof(*comm)); // Allocate memory if (comm == NULL) { *comm_ptr = NULL; // Set to NULL if failed - return GA_MEMORY_ERROR; + return error_sys(ctx->err, "calloc"); } - comm->ctx = (cuda_context*)ctx; // convert to underlying cuda context + comm->ctx = (cuda_context *)ctx; // convert to underlying cuda context // So that context would not be destroyed before communicator comm->ctx->refcnt++; cuda_enter(comm->ctx); // Use device - nccl_err = ncclCommInitRank(&comm->c, ndev, *((ncclUniqueId*)&comm_id), rank); + err = ncclCommInitRank(&comm->c, ndev, *((ncclUniqueId *)&comm_id), rank); cuda_exit(comm->ctx); TAG_COMM(comm); - if (nccl_err != ncclSuccess) { + if (err != ncclSuccess) { *comm_ptr = NULL; // Set to NULL if failed comm_clear(comm); - return error_sets(ctx->msg, GA_COMM_ERROR, ncclGetErrorString(nccl_err)); + return error_nccl(ctx->err, "ncclCommInitRank", err); } *comm_ptr = comm; return GA_NO_ERROR; @@ -115,7 +114,7 @@ static int comm_new(gpucomm** comm_ptr, gpucontext* ctx, /** * \brief NCCL implementation of \ref gpucomm_free. */ -static void comm_free(gpucomm* comm) { +static void comm_free(gpucomm *comm) { ASSERT_COMM(comm); cuda_enter(comm->ctx); ncclCommDestroy(comm->c); @@ -126,17 +125,17 @@ static void comm_free(gpucomm* comm) { /** * \brief NCCL implementation of \ref gpucomm_gen_clique_id. 
*/ -static int generate_clique_id(gpucontext* c, gpucommCliqueId* comm_id) { +static int generate_clique_id(gpucontext *c, gpucommCliqueId *comm_id) { ASSERT_CTX(c); - GA_CHECK(setup_lib()); - NCCL_CHKFAIL(c, ncclGetUniqueId((ncclUniqueId*)comm_id)); + GA_CHECK(setup_lib(c->err)); + NCCL_CHKFAIL(c, ncclGetUniqueId((ncclUniqueId *)comm_id)); } /** * \brief NCCL implementation of \ref gpucomm_get_count. */ -static int get_count(const gpucomm* comm, int* gpucount) { +static int get_count(const gpucomm *comm, int *gpucount) { ASSERT_COMM(comm); NCCL_CHKFAIL(comm->ctx, ncclCommCount(comm->c, gpucount)); } @@ -144,7 +143,7 @@ static int get_count(const gpucomm* comm, int* gpucount) { /** * \brief NCCL implementation of \ref gpucomm_get_rank. */ -static int get_rank(const gpucomm* comm, int* rank) { +static int get_rank(const gpucomm *comm, int *rank) { ASSERT_COMM(comm); NCCL_CHKFAIL(comm->ctx, ncclCommUserRank(comm->c, rank)); } @@ -176,9 +175,7 @@ static inline ncclDataType_t convert_data_type(int typecode) { switch (typecode) { case GA_BYTE: return ncclChar; case GA_INT: return ncclInt; -#ifdef CUDA_HAS_HALF case GA_HALF: return ncclHalf; -#endif // CUDA_HAS_HALF case GA_FLOAT: return ncclFloat; case GA_DOUBLE: return ncclDouble; case GA_LONG: return ncclInt64; @@ -192,32 +189,33 @@ static inline ncclDataType_t convert_data_type(int typecode) { * nccl * collective operations. 
*/ -static inline int check_restrictions(gpudata* src, size_t offsrc, gpudata* dest, - size_t offdest, size_t count, int typecode, - int opcode, gpucomm* comm, - ncclDataType_t* datatype, - ncclRedOp_t* op) { +static inline int check_restrictions(gpudata *src, size_t offsrc, + gpudata *dest, size_t offdest, + size_t count, int typecode, + int opcode, gpucomm *comm, + ncclDataType_t *datatype, + ncclRedOp_t *op) { size_t op_size; // Check if count is larger than INT_MAX // TODO remove whenif nccl adapts to size_t if (count > INT_MAX) - return GA_UNSUPPORTED_ERROR; + return error_set(comm->ctx->err, GA_XLARGE_ERROR, "Count too large for int"); // src, dest and comm must refer to the same context if (src->ctx != comm->ctx) - return GA_VALUE_ERROR; + return error_set(comm->ctx->err, GA_VALUE_ERROR, "source and comm context differ"); if (dest != NULL && dest->ctx != comm->ctx) - return GA_VALUE_ERROR; + return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination and comm context differ"); // typecode must correspond to a valid ncclDataType_t if (datatype != NULL) { *datatype = convert_data_type(typecode); if (*datatype == nccl_NUM_TYPES) - return GA_INVALID_ERROR; + return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid data type"); } // opcode must correspond to a valid ncclRedOp_t if (op != NULL) { *op = convert_reduce_op(opcode); if (*op == nccl_NUM_OPS) - return GA_INVALID_ERROR; + return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid reduce op"); } // offsets must not be larger than gpudata's size itself // (else out of alloc-ed mem scope) @@ -226,23 +224,23 @@ static inline int check_restrictions(gpudata* src, size_t offsrc, gpudata* dest, // size to operate upon must be able to fit inside the gpudata (incl offsets) op_size = count * gpuarray_get_elsize(typecode); if ((src->sz - offsrc) < op_size) - return GA_VALUE_ERROR; + return error_set(comm->ctx->err, GA_VALUE_ERROR, "source too small for operation"); if (dest != NULL && (dest->sz - offdest) < 
op_size) - return GA_VALUE_ERROR; + return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination too small for operation"); return GA_NO_ERROR; } /** * \brief NCCL implementation of \ref gpucomm_reduce. */ -static int reduce(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest, +static int reduce(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, int opcode, int root, - gpucomm* comm) { + gpucomm *comm) { ncclRedOp_t op; ncclDataType_t datatype; - gpudata* dst = NULL; + gpudata *dst = NULL; int rank = 0; - cuda_context* ctx; + cuda_context *ctx; ASSERT_BUF(src); ASSERT_COMM(comm); @@ -265,11 +263,11 @@ static int reduce(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest, // change stream of nccl ops to enable concurrency if (rank == root) - NCCL_EXIT_ON_ERROR(ctx, ncclReduce((void*)(src->ptr + offsrc), - (void*)(dest->ptr + offdest), count, + NCCL_EXIT_ON_ERROR(ctx, ncclReduce((void *)(src->ptr + offsrc), + (void *)(dest->ptr + offdest), count, datatype, op, root, comm->c, ctx->s)); else - NCCL_EXIT_ON_ERROR(ctx, ncclReduce((void*)(src->ptr + offsrc), NULL, count, + NCCL_EXIT_ON_ERROR(ctx, ncclReduce((void *)(src->ptr + offsrc), NULL, count, datatype, op, root, comm->c, ctx->s)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ)); @@ -284,12 +282,12 @@ static int reduce(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest, /** * \brief NCCL implementation of \ref gpucomm_all_reduce. 
*/ -static int all_reduce(gpudata* src, size_t offsrc, gpudata* dest, +static int all_reduce(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, int opcode, - gpucomm* comm) { + gpucomm *comm) { ncclRedOp_t op; ncclDataType_t datatype; - cuda_context* ctx; + cuda_context *ctx; ASSERT_BUF(src); ASSERT_COMM(comm); @@ -306,8 +304,8 @@ static int all_reduce(gpudata* src, size_t offsrc, gpudata* dest, GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE)); // change stream of nccl ops to enable concurrency - NCCL_EXIT_ON_ERROR(ctx, ncclAllReduce((void*)(src->ptr + offsrc), - (void*)(dest->ptr + offdest), count, + NCCL_EXIT_ON_ERROR(ctx, ncclAllReduce((void *)(src->ptr + offsrc), + (void *)(dest->ptr + offdest), count, datatype, op, comm->c, ctx->s)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ)); @@ -321,14 +319,14 @@ static int all_reduce(gpudata* src, size_t offsrc, gpudata* dest, /** * \brief NCCL implementation of \ref gpucomm_reduce_scatter. 
*/ -static int reduce_scatter(gpudata* src, size_t offsrc, gpudata* dest, +static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, - int opcode, gpucomm* comm) { + int opcode, gpucomm *comm) { ncclRedOp_t op; ncclDataType_t datatype; int ndev = 0; size_t resc_size; - cuda_context* ctx; + cuda_context *ctx; ASSERT_BUF(src); ASSERT_COMM(comm); @@ -337,10 +335,10 @@ static int reduce_scatter(gpudata* src, size_t offsrc, gpudata* dest, GA_CHECK(check_restrictions(src, offsrc, NULL, 0, count * ndev, typecode, opcode, comm, &datatype, &op)); if (dest->ctx != comm->ctx) - return GA_VALUE_ERROR; + return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination and comm context differ"); resc_size = count * gpuarray_get_elsize(typecode); if ((dest->sz - offdest) < resc_size) - return GA_VALUE_ERROR; + return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination too small for operation"); assert(!(offdest > dest->sz)); ctx = comm->ctx; @@ -352,8 +350,8 @@ static int reduce_scatter(gpudata* src, size_t offsrc, gpudata* dest, GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE)); // change stream of nccl ops to enable concurrency - NCCL_EXIT_ON_ERROR(ctx, ncclReduceScatter((void*)(src->ptr + offsrc), - (void*)(dest->ptr + offdest), count, + NCCL_EXIT_ON_ERROR(ctx, ncclReduceScatter((void *)(src->ptr + offsrc), + (void *)(dest->ptr + offdest), count, datatype, op, comm->c, ctx->s)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ)); @@ -367,11 +365,11 @@ static int reduce_scatter(gpudata* src, size_t offsrc, gpudata* dest, /** * \brief NCCL implementation of \ref gpucomm_broadcast. 
*/ -static int broadcast(gpudata* array, size_t offset, size_t count, int typecode, - int root, gpucomm* comm) { +static int broadcast(gpudata *array, size_t offset, size_t count, int typecode, + int root, gpucomm *comm) { ncclDataType_t datatype; int rank = 0; - cuda_context* ctx; + cuda_context *ctx; ASSERT_BUF(array); ASSERT_COMM(comm); @@ -389,7 +387,7 @@ static int broadcast(gpudata* array, size_t offset, size_t count, int typecode, GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(array, CUDA_WAIT_WRITE)); // change stream of nccl ops to enable concurrency - NCCL_EXIT_ON_ERROR(ctx, ncclBcast((void*)(array->ptr + offset), count, + NCCL_EXIT_ON_ERROR(ctx, ncclBcast((void *)(array->ptr + offset), count, datatype, root, comm->c, ctx->s)); if (rank == root) @@ -405,13 +403,13 @@ static int broadcast(gpudata* array, size_t offset, size_t count, int typecode, /** * \brief NCCL implementation of \ref gpucomm_all_gather. */ -static int all_gather(gpudata* src, size_t offsrc, gpudata* dest, +static int all_gather(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, - gpucomm* comm) { + gpucomm *comm) { ncclDataType_t datatype; int ndev = 0; size_t resc_size; - cuda_context* ctx; + cuda_context *ctx; ASSERT_BUF(src); ASSERT_COMM(comm); @@ -419,11 +417,11 @@ static int all_gather(gpudata* src, size_t offsrc, gpudata* dest, GA_CHECK(check_restrictions(src, offsrc, NULL, 0, count, typecode, 0, comm, &datatype, NULL)); if (dest->ctx != comm->ctx) - return GA_VALUE_ERROR; + return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination and comm context differ"); GA_CHECK(get_count(comm, &ndev)); resc_size = ndev * count * gpuarray_get_elsize(typecode); if ((dest->sz - offdest) < resc_size) - return GA_VALUE_ERROR; + return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination too small for operation"); assert(!(offdest > dest->sz)); ctx = comm->ctx; @@ -436,8 +434,8 @@ static int all_gather(gpudata* src, size_t offsrc, gpudata* dest, // change stream of 
nccl ops to enable concurrency NCCL_EXIT_ON_ERROR( - ctx, ncclAllGather((void*)(src->ptr + offsrc), count, datatype, - (void*)(dest->ptr + offdest), comm->c, ctx->s)); + ctx, ncclAllGather((void *)(src->ptr + offsrc), count, datatype, + (void *)(dest->ptr + offdest), comm->c, ctx->s)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE)); diff --git a/src/loaders/libnvrtc.c b/src/loaders/libnvrtc.c index bd84aadedf..fa5cfb2434 100644 --- a/src/loaders/libnvrtc.c +++ b/src/loaders/libnvrtc.c @@ -5,13 +5,13 @@ #include "dyn_load.h" #include "gpuarray/error.h" -#define DEF_PROC(name, args) t##name *name +#define DEF_PROC(rt, name, args) t##name *name #include "libnvrtc.fn" #undef DEF_PROC -#define DEF_PROC(name, args) \ +#define DEF_PROC(rt, name, args) \ name = (t##name *)ga_func_ptr(lib, #name, e); \ if (name == NULL) { \ return e->code; \ diff --git a/src/loaders/libnvrtc.fn b/src/loaders/libnvrtc.fn index 9ebda14112..3f32036310 100644 --- a/src/loaders/libnvrtc.fn +++ b/src/loaders/libnvrtc.fn @@ -1,7 +1,8 @@ -DEF_PROC(nvrtcCreateProgram, (nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames)); -DEF_PROC(nvrtcCompileProgram, (nvrtcProgram prog, int numOptions, const char **options)); -DEF_PROC(nvrtcDestroyProgram, (nvrtcProgram *prog)); -DEF_PROC(nvrtcGetProgramLog, (nvrtcProgram prog, char *log)); -DEF_PROC(nvrtcGetProgramLogSize, (nvrtcProgram prog, size_t *logSizeRet)); -DEF_PROC(nvrtcGetPTX, (nvrtcProgram prog, char *ptx)); -DEF_PROC(nvrtcGetPTXSize, (nvrtcProgram prog, size_t *ptxSizeRet)); \ No newline at end of file +DEF_PROC(nvrtcResult, nvrtcCreateProgram, (nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames)); +DEF_PROC(nvrtcResult, nvrtcCompileProgram, (nvrtcProgram prog, int numOptions, const char **options)); +DEF_PROC(nvrtcResult, 
nvrtcDestroyProgram, (nvrtcProgram *prog)); +DEF_PROC(nvrtcResult, nvrtcGetProgramLog, (nvrtcProgram prog, char *log)); +DEF_PROC(nvrtcResult, nvrtcGetProgramLogSize, (nvrtcProgram prog, size_t *logSizeRet)); +DEF_PROC(nvrtcResult, nvrtcGetPTX, (nvrtcProgram prog, char *ptx)); +DEF_PROC(nvrtcResult, nvrtcGetPTXSize, (nvrtcProgram prog, size_t *ptxSizeRet)); +DEF_PROC(const char *, nvrtcGetErrorString, (nvrtcResult result)); diff --git a/src/loaders/libnvrtc.h b/src/loaders/libnvrtc.h index 2395a677ea..5018830b4e 100644 --- a/src/loaders/libnvrtc.h +++ b/src/loaders/libnvrtc.h @@ -11,13 +11,13 @@ typedef struct _nvrtcProgram *nvrtcProgram; int load_libnvrtc(int major, int minor, error *e); -#define DEF_PROC(name, args) typedef nvrtcResult t##name args +#define DEF_PROC(rt, name, args) typedef rt t##name args #include "libnvrtc.fn" #undef DEF_PROC -#define DEF_PROC(name, args) extern t##name *name +#define DEF_PROC(rt, name, args) extern t##name *name #include "libnvrtc.fn" diff --git a/src/private.h b/src/private.h index 9c24af45d0..1a1584dfb1 100644 --- a/src/private.h +++ b/src/private.h @@ -27,10 +27,6 @@ extern "C" { } #endif -static inline int error_sys(error *e, const char *msg) { - return error_fmt(e, GA_SYS_ERROR, "%s: %s", msg, strerror(errno)); -} - #define ADDR32_MAX 4294967295L #define SADDR32_MIN -2147483648L #define SADDR32_MAX 2147483647L @@ -95,8 +91,7 @@ struct _gpuarray_buffer_ops { gpukernel *(*kernel_alloc)(gpucontext *ctx, unsigned int count, const char **strings, const size_t *lengths, const char *fname, unsigned int numargs, - const int *typecodes, int flags, int *ret, - char **err_str); + const int *typecodes, int flags, char **err_str); void (*kernel_retain)(gpukernel *k); void (*kernel_release)(gpukernel *k); int (*kernel_setarg)(gpukernel *k, unsigned int i, void *a); diff --git a/src/util/error.c b/src/util/error.c index d1682af855..263768fa30 100644 --- a/src/util/error.c +++ b/src/util/error.c @@ -20,7 +20,7 @@ void error_free(error *e) 
{ int error_setall(error *e, int code, const char *msg) { e->code = code; - strlcpy(e->msg, msg, MSGBUF_LEN); + strlcpy(e->msg, msg, ERROR_MSGBUF_LEN); return code; } @@ -29,7 +29,7 @@ int error_fmt(error *e, int code, const char *fmt, ...) { e->code = code; va_start(ap, fmt); - vsnprintf(e->msg, MSGBUF_LEN, fmt, ap); + vsnprintf(e->msg, ERROR_MSGBUF_LEN, fmt, ap); va_end(ap); return code; } diff --git a/src/util/error.h b/src/util/error.h index a1dda1f345..fc1ecb1663 100644 --- a/src/util/error.h +++ b/src/util/error.h @@ -1,11 +1,16 @@ #ifndef UTIL_ERROR_H #define UTIL_ERROR_H +#include +#include + +#include + /* 1024 - 4 for the int that goes after */ #define ERROR_MSGBUF_LEN 1020 typedef struct _error { - char msg[MSGBUF_LEN]; + char msg[ERROR_MSGBUF_LEN]; int code; } error; @@ -16,4 +21,8 @@ int error_fmt(error *e, int code, const char *fmt, ...); extern error *global_err; +static inline int error_sys(error *e, const char *msg) { + return error_fmt(e, GA_SYS_ERROR, "%s: %s", msg, strerror(errno)); +} + #endif From a4df0a7bc47a5807d9de1375e134bb33022ceb42 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 24 Mar 2017 14:33:19 -0400 Subject: [PATCH 258/597] Add error messages to the opencl backend. 
--- src/gpuarray_buffer.c | 5 +- src/gpuarray_buffer_cuda.c | 4 +- src/gpuarray_buffer_opencl.c | 636 ++++++++++++++++------------------- src/private.h | 2 +- src/private_opencl.h | 35 +- 5 files changed, 334 insertions(+), 348 deletions(-) diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index d6f11289a4..e070f29fc0 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -97,7 +97,10 @@ void gpudata_release(gpudata *b) { } int gpudata_share(gpudata *a, gpudata *b, int *ret) { - return ((partial_gpudata *)a)->ctx->ops->buffer_share(a, b, ret); + int res = ((partial_gpudata *)a)->ctx->ops->buffer_share(a, b); + if (res == -1 && ret) + *ret = ((partial_gpudata *)a)->ctx->err->code; + return res; } int gpudata_move(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 33bbf7f31d..b5c9599a2f 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -851,7 +851,7 @@ static void cuda_free(gpudata *d) { } } -static int cuda_share(gpudata *a, gpudata *b, int *ret) { +static int cuda_share(gpudata *a, gpudata *b) { ASSERT_BUF(a); ASSERT_BUF(b); return (a->ctx == b->ctx && a->sz != 0 && b->sz != 0 && @@ -1753,7 +1753,7 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, return GA_NO_ERROR; default: - return GA_INVALID_ERROR; + return error_fmt(ctx->err, GA_INVALID_ERROR, "Invalid property: %d", prop_id); } } diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 3e87f23c41..618f2a5888 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -22,32 +22,24 @@ #define _unused(x) ((void)x) #define SSIZE_MIN (-(SSIZE_MAX-1)) -static cl_int err; - -#define FAIL(v, e) { if (ret) *ret = e; return v; } -#define CHKFAIL(v) if (err != CL_SUCCESS) FAIL(v, GA_IMPL_ERROR) - - const gpuarray_buffer_ops opencl_ops; static int cl_property(gpucontext *c, gpudata *b, gpukernel *k, int p, void *r); 
-static gpudata *cl_alloc(gpucontext *c, size_t size, void *data, int flags, - int *ret); +static gpudata *cl_alloc(gpucontext *c, size_t size, void *data, int flags); static void cl_release(gpudata *b); static void cl_free_ctx(cl_ctx *ctx); static gpukernel *cl_newkernel(gpucontext *ctx, unsigned int count, const char **strings, const size_t *lengths, const char *fname, unsigned int argcount, - const int *types, int flags, int *ret, - char **err_str); + const int *types, int flags, char **err_str); static const char CL_CONTEXT_PREAMBLE[] = "#define GA_WARP_SIZE %lu\n"; // to be filled by cl_make_ctx() static int setup_done = 0; -static int setup_lib(void) { +static int setup_lib(error *e) { if (setup_done) return GA_NO_ERROR; - GA_CHECK(load_libopencl()); + GA_CHECK(load_libopencl(e)); setup_done = 1; return GA_NO_ERROR; } @@ -55,10 +47,8 @@ static int setup_lib(void) { static int cl_get_platform_count(unsigned int* platcount) { cl_uint nump; - GA_CHECK(setup_lib()); - err = clGetPlatformIDs(0, NULL, &nump); - if (err != CL_SUCCESS) - return GA_IMPL_ERROR; + GA_CHECK(setup_lib(global_err)); + CL_CHECK(global_err, clGetPlatformIDs(0, NULL, &nump)); *platcount = (unsigned int)nump; return GA_NO_ERROR; } @@ -67,6 +57,7 @@ static int cl_get_device_count(unsigned int platform, unsigned int* devcount) { cl_platform_id *ps; cl_platform_id p; cl_uint numd; + cl_int err; unsigned int platcount; /* This will load the library if needed */ @@ -74,38 +65,44 @@ static int cl_get_device_count(unsigned int platform, unsigned int* devcount) { ps = calloc(sizeof(*ps), platcount); if (ps == NULL) - return GA_MEMORY_ERROR; + return error_sys(global_err, "calloc"); err = clGetPlatformIDs(platcount, ps, NULL); if (err != CL_SUCCESS) { free(ps); - return GA_IMPL_ERROR; + return error_cl(global_err, "clGetPlatformIDs", err); } p = ps[platform]; err = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, 0, NULL, &numd); free(ps); if (err != CL_SUCCESS) - return GA_IMPL_ERROR; + return 
error_cl(global_err, "clGetDeviceIds", err); *devcount = (unsigned int)numd; return GA_NO_ERROR; } -static cl_device_id get_dev(cl_context ctx, int *ret) { +static cl_device_id get_dev(cl_context ctx, error *e) { size_t sz; cl_device_id res; cl_device_id *ids; cl_int err; - err = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, 0, NULL, &sz); - CHKFAIL(NULL); + CL_CHECKN(e, clGetContextInfo(ctx, CL_CONTEXT_DEVICES, 0, NULL, &sz)); ids = malloc(sz); - if (ids == NULL) FAIL(NULL, GA_MEMORY_ERROR); + if (ids == NULL) { + error_sys(e, "malloc"); + return NULL; + } err = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, sz, ids, NULL); + if (err != CL_SUCCESS) { + free(ids); + error_cl(e, "clContextGetInfo", err); + return NULL; + } res = ids[0]; free(ids); - CHKFAIL(NULL); return res; } @@ -116,6 +113,7 @@ cl_ctx *cl_make_ctx(cl_context ctx, int flags) { char vendor[32]; char driver_version[64]; cl_uint vendor_id; + cl_int err; size_t len; int64_t v = 0; int e = 0; @@ -126,29 +124,27 @@ cl_ctx *cl_make_ctx(cl_context ctx, int flags) { const char *rlk[1]; gpukernel *m; - e = setup_lib(); + e = setup_lib(global_err); if (e != GA_NO_ERROR) return NULL; - id = get_dev(ctx, NULL); + id = get_dev(ctx, global_err); if (id == NULL) return NULL; - err = clGetDeviceInfo(id, CL_DEVICE_QUEUE_PROPERTIES, sizeof(qprop), - &qprop, NULL); - if (err != CL_SUCCESS) return NULL; + CL_CHECKN(global_err, clGetDeviceInfo(id, CL_DEVICE_QUEUE_PROPERTIES, + sizeof(qprop), &qprop, NULL)); - err = clGetDeviceInfo(id, CL_DEVICE_VENDOR, sizeof(vendor), vendor, NULL); - if (err != CL_SUCCESS) - return NULL; - err = clGetDeviceInfo(id, CL_DEVICE_VENDOR_ID, sizeof(vendor_id), &vendor_id, - NULL); - if (err != CL_SUCCESS) - return NULL; - err = clGetDeviceInfo(id, CL_DRIVER_VERSION, sizeof(driver_version), - driver_version, NULL); - if (err != CL_SUCCESS) - return NULL; + CL_CHECKN(global_err, clGetDeviceInfo(id, CL_DEVICE_VENDOR, sizeof(vendor), + vendor, NULL)); + CL_CHECKN(global_err, clGetDeviceInfo(id, 
CL_DEVICE_VENDOR_ID, + sizeof(vendor_id), &vendor_id, NULL)); + CL_CHECKN(global_err, clGetDeviceInfo(id, CL_DRIVER_VERSION, + sizeof(driver_version), + driver_version, NULL)); res = malloc(sizeof(*res)); - if (res == NULL) return NULL; + if (res == NULL) { + error_sys(global_err, "malloc"); + return NULL; + } res->ctx = ctx; res->ops = &opencl_ops; @@ -163,6 +159,7 @@ cl_ctx *cl_make_ctx(cl_context ctx, int flags) { &err); if (res->q == NULL) { free(res); + error_cl(global_err, "clCreateCommandQueue", err); return NULL; } @@ -173,10 +170,9 @@ cl_ctx *cl_make_ctx(cl_context ctx, int flags) { clRetainContext(res->ctx); TAG_CTX(res); - res->errbuf = cl_alloc((gpucontext *)res, 8, &v, GA_BUFFER_INIT, &e); - if (e != GA_NO_ERROR) { + res->errbuf = cl_alloc((gpucontext *)res, 8, &v, GA_BUFFER_INIT); + if (res->errbuf == NULL) goto fail; - } res->refcnt--; /* Prevent ref loop */ /* Create per-context OpenCL preamble */ @@ -185,7 +181,7 @@ cl_ctx *cl_make_ctx(cl_context ctx, int flags) { rlk[0] = dummy_kern; len = sizeof(dummy_kern); // this dummy kernel does not require a CLUDA preamble - m = cl_newkernel((gpucontext *)res, 1, rlk, &len, "kdummy", 0, NULL, 0, &ret, NULL); + m = cl_newkernel((gpucontext *)res, 1, rlk, &len, "kdummy", 0, NULL, 0, NULL); if (m == NULL) goto fail; ret = cl_property((gpucontext *)res, NULL, m, GA_KERNEL_PROP_PREFLSIZE, &warp_size); @@ -200,8 +196,8 @@ cl_ctx *cl_make_ctx(cl_context ctx, int flags) { return res; -fail: - err = res->err; + fail: + error_set(global_err, res->err->code, res->err->msg); cl_free_ctx(res); return NULL; } @@ -212,16 +208,11 @@ cl_command_queue cl_get_stream(gpucontext *ctx) { } static void cl_free_ctx(cl_ctx *ctx) { - gpuarray_blas_ops *blas_ops = NULL; - ASSERT_CTX(ctx); + assert(ctx->refcnt != 0); ctx->refcnt--; if (ctx->refcnt == 0) { - if (ctx->blas_handle != NULL) { - ctx->err = cl_property((gpucontext *)ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS, &blas_ops); - blas_ops->teardown((gpucontext *)ctx); - } if 
(ctx->errbuf != NULL) { ctx->refcnt = 2; /* Avoid recursive release */ cl_release(ctx->errbuf); @@ -239,22 +230,29 @@ gpudata *cl_make_buf(gpucontext *c, cl_mem buf) { cl_ctx *ctx = (cl_ctx *)c; gpudata *res; cl_context buf_ctx; + cl_int err; ASSERT_CTX(ctx); - ctx->err = clGetMemObjectInfo(buf, CL_MEM_CONTEXT, sizeof(buf_ctx), - &buf_ctx, NULL); - if (ctx->err != CL_SUCCESS) return NULL; - if (buf_ctx != ctx->ctx) return NULL; + CL_CHECKN(ctx->err, clGetMemObjectInfo(buf, CL_MEM_CONTEXT, sizeof(buf_ctx), + &buf_ctx, NULL)); + if (buf_ctx != ctx->ctx) { + error_set(ctx->err, GA_VALUE_ERROR, "Requested context doesn't macth object context"); + return NULL; + } res = malloc(sizeof(*res)); - if (res == NULL) return NULL; + if (res == NULL) { + error_sys(ctx->err, "malloc"); + return NULL; + } res->buf = buf; res->ev = NULL; res->refcnt = 1; - ctx->err = clRetainMemObject(buf); - if (ctx->err != CL_SUCCESS) { + err = clRetainMemObject(buf); + if (err != CL_SUCCESS) { free(res); + error_cl(ctx->err, "clRetainMemObject", err); return NULL; } res->ctx = ctx; @@ -322,7 +320,7 @@ static const char CL_PREAMBLE[] = /* XXX: add complex types, quad types, and longlong */ /* XXX: add vector types */ -static const char *get_error_string(cl_int err) { +const char *cl_error_string(cl_int err) { /* OpenCL 1.0 error codes */ switch (err) { case CL_SUCCESS: return "Success!"; @@ -381,27 +379,17 @@ static const char *get_error_string(cl_int err) { static int check_ext(cl_ctx *ctx, const char *name) { cl_device_id dev; - size_t sz; - int res = 0; if (ctx->exts == NULL) { - dev = get_dev(ctx->ctx, &res); - if (dev == NULL) return res; - - ctx->err = clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, 0, NULL, &sz); - if (ctx->err != CL_SUCCESS) return GA_IMPL_ERROR; - - ctx->exts = malloc(sz); - if (ctx->exts == NULL) return GA_MEMORY_ERROR; + dev = get_dev(ctx->ctx, ctx->err); + if (dev == NULL) return ctx->err->code; - ctx->err = clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, sz, ctx->exts, NULL); 
- if (ctx->err != CL_SUCCESS) { - free(ctx->exts); - ctx->exts = NULL; - return GA_IMPL_ERROR; - } + CL_GET_PROP(ctx->err, clGetDeviceInfo, dev, CL_DEVICE_EXTENSIONS, ctx->exts); } - return (strstr(ctx->exts, name) == NULL) ? GA_DEVSUP_ERROR : 0; + if (strstr(ctx->exts, name) == NULL) + return error_fmt(ctx->err, GA_DEVSUP_ERROR, "Unsupported extension %s", name); + else + return GA_NO_ERROR; } static void @@ -412,7 +400,7 @@ errcb(const char *errinfo, const void *pi, size_t cb, void *u) { fprintf(stderr, "%s\n", errinfo); } -static gpucontext *cl_init(int devno, int flags, int *ret) { +static gpucontext *cl_init(int devno, int flags) { cl_device_id *ds; cl_device_id d; cl_platform_id *ps; @@ -424,49 +412,68 @@ static gpucontext *cl_init(int devno, int flags, int *ret) { }; cl_context ctx; cl_ctx *res; + cl_int err; int platno; int e; platno = devno >> 16; devno &= 0xFFFF; - e = setup_lib(); + e = setup_lib(global_err); if (e != GA_NO_ERROR) - FAIL(NULL, e); + return NULL; - err = clGetPlatformIDs(0, NULL, &nump); - CHKFAIL(NULL); + CL_CHECKN(global_err, clGetPlatformIDs(0, NULL, &nump)); - if ((unsigned int)platno >= nump || platno < 0) FAIL(NULL, GA_VALUE_ERROR); + if ((unsigned int)platno >= nump || platno < 0) { + error_set(global_err, GA_VALUE_ERROR, "Platform ID out of range"); + return NULL; + } ps = calloc(sizeof(*ps), nump); - if (ps == NULL) FAIL(NULL, GA_MEMORY_ERROR); + if (ps == NULL) { + error_sys(global_err, "calloc"); + return NULL; + } err = clGetPlatformIDs(nump, ps, NULL); /* We may get garbage on failure here but it won't matter as we will not use it */ p = ps[platno]; free(ps); - CHKFAIL(NULL); + if (err != CL_SUCCESS) { + error_cl(global_err, "clGetPlatformIDs", err); + return NULL; + } - err = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, 0, NULL, &numd); - CHKFAIL(NULL); + CL_CHECKN(global_err, clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, 0, NULL, &numd)); - if ((unsigned int)devno >= numd || devno < 0) FAIL(NULL, GA_VALUE_ERROR); + if ((unsigned int)devno 
>= numd || devno < 0) { + error_set(global_err, GA_VALUE_ERROR, "Device ID out of range"); + return NULL; + } ds = calloc(sizeof(*ds), numd); - if (ds == NULL) FAIL(NULL, GA_MEMORY_ERROR); + if (ds == NULL) { + error_sys(global_err, "calloc"); + return NULL; + } err = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, numd, ds, NULL); d = ds[devno]; free(ds); - CHKFAIL(NULL); + if (err != CL_SUCCESS) { + error_cl(global_err, "clGetDeviceIDs", err); + return NULL; + } props[1] = (cl_context_properties)p; ctx = clCreateContext(props, 1, &d, errcb, NULL, &err); - CHKFAIL(NULL); + if (ctx == NULL) { + error_cl(global_err, "clCreateContext", err); + return NULL; + } res = cl_make_ctx(ctx, flags); clReleaseContext(ctx); - if (res == NULL) FAIL(NULL, GA_IMPL_ERROR); // can also be a sys_error return (gpucontext *)res; } @@ -475,17 +482,20 @@ static void cl_deinit(gpucontext *c) { cl_free_ctx((cl_ctx *)c); } -static gpudata *cl_alloc(gpucontext *c, size_t size, void *data, int flags, - int *ret) { +static gpudata *cl_alloc(gpucontext *c, size_t size, void *data, int flags) { cl_ctx *ctx = (cl_ctx *)c; gpudata *res; void *hostp = NULL; + cl_int err; cl_mem_flags clflags = CL_MEM_READ_WRITE; ASSERT_CTX(ctx); if (flags & GA_BUFFER_INIT) { - if (data == NULL) FAIL(NULL, GA_VALUE_ERROR); + if (data == NULL) { + error_set(ctx->err, GA_VALUE_ERROR, "Requested initialization, but no data provided"); + return NULL; + } hostp = data; clflags |= CL_MEM_COPY_HOST_PTR; } @@ -495,19 +505,24 @@ static gpudata *cl_alloc(gpucontext *c, size_t size, void *data, int flags, } if (flags & GA_BUFFER_READ_ONLY) { - if (flags & GA_BUFFER_WRITE_ONLY) FAIL(NULL, GA_VALUE_ERROR); + if (flags & GA_BUFFER_WRITE_ONLY) { + error_set(ctx->err, GA_VALUE_ERROR, "Invalid combinaison: READ_ONLY and WRITE_ONLY"); + return NULL; + } clflags &= ~CL_MEM_READ_WRITE; clflags |= CL_MEM_READ_ONLY; } if (flags & GA_BUFFER_WRITE_ONLY) { - if (flags & GA_BUFFER_READ_ONLY) FAIL(NULL, GA_VALUE_ERROR); clflags &= ~CL_MEM_READ_WRITE; 
clflags |= CL_MEM_WRITE_ONLY; } res = malloc(sizeof(*res)); - if (res == NULL) FAIL(NULL, GA_SYS_ERROR); + if (res == NULL) { + error_sys(ctx->err, "malloc"); + return NULL; + } res->refcnt = 1; if (size == 0) { @@ -515,11 +530,12 @@ static gpudata *cl_alloc(gpucontext *c, size_t size, void *data, int flags, size = 1; } - res->buf = clCreateBuffer(ctx->ctx, clflags, size, hostp, &ctx->err); + res->buf = clCreateBuffer(ctx->ctx, clflags, size, hostp, &err); res->ev = NULL; - if (ctx->err != CL_SUCCESS) { + if (err != CL_SUCCESS) { free(res); - FAIL(NULL, GA_IMPL_ERROR); + error_cl(ctx->err, "clCreateBuffer", err); + return NULL; } res->ctx = ctx; @@ -547,21 +563,29 @@ static void cl_release(gpudata *b) { } } -static int cl_share(gpudata *a, gpudata *b, int *ret) { +static int cl_share(gpudata *a, gpudata *b) { cl_ctx *ctx; cl_mem aa, bb; + cl_int err; + ASSERT_BUF(a); ASSERT_BUF(b); if (a->buf == b->buf) return 1; if (a->ctx != b->ctx) return 0; ctx = a->ctx; ASSERT_CTX(ctx); - ctx->err = clGetMemObjectInfo(a->buf, CL_MEM_ASSOCIATED_MEMOBJECT, - sizeof(aa), &aa, NULL); - CHKFAIL(-1); - ctx->err = clGetMemObjectInfo(b->buf, CL_MEM_ASSOCIATED_MEMOBJECT, - sizeof(bb), &bb, NULL); - CHKFAIL(-1); + err = clGetMemObjectInfo(a->buf, CL_MEM_ASSOCIATED_MEMOBJECT, + sizeof(aa), &aa, NULL); + if (err != CL_SUCCESS) { + error_cl(ctx->err, "clGetMemObjectInfo", err); + return -1; + } + err = clGetMemObjectInfo(b->buf, CL_MEM_ASSOCIATED_MEMOBJECT, + sizeof(bb), &bb, NULL); + if (err != CL_SUCCESS) { + error_cl(ctx->err, "clGetMemObjectInfo", err); + return -1; + } if (aa == NULL) aa = a->buf; if (bb == NULL) bb = b->buf; if (aa == bb) return 1; @@ -579,7 +603,10 @@ static int cl_move(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, ASSERT_BUF(dst); ASSERT_BUF(src); - if (dst->ctx != src->ctx) return GA_VALUE_ERROR; + if (dst->ctx != src->ctx) { + error_set(src->ctx->err, GA_VALUE_ERROR, "Differing contexts for source and destination"); + return error_set(dst->ctx->err, 
src->ctx->err->code, src->ctx->err->msg); + } ctx = dst->ctx; ASSERT_CTX(ctx); @@ -594,11 +621,8 @@ static int cl_move(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, if (num_ev > 0) evl = evw; - ctx->err = clEnqueueCopyBuffer(ctx->q, src->buf, dst->buf, srcoff, dstoff, - sz, num_ev, evl, &ev); - if (ctx->err != CL_SUCCESS) { - return GA_IMPL_ERROR; - } + CL_CHECK(ctx->err, clEnqueueCopyBuffer(ctx->q, src->buf, dst->buf, srcoff, + dstoff, sz, num_ev, evl, &ev)); if (src->ev != NULL) clReleaseEvent(src->ev); if (dst->ev != NULL && src != dst) @@ -628,9 +652,9 @@ static int cl_read(void *dst, gpudata *src, size_t srcoff, size_t sz) { num_ev = 1; } - ctx->err = clEnqueueReadBuffer(ctx->q, src->buf, CL_TRUE, srcoff, sz, dst, - num_ev, evl, NULL); - if (ctx->err != CL_SUCCESS) return GA_IMPL_ERROR; + CL_CHECK(ctx->err, clEnqueueReadBuffer(ctx->q, src->buf, CL_TRUE, srcoff, sz, + dst, num_ev, evl, NULL)); + if (src->ev != NULL) clReleaseEvent(src->ev); src->ev = NULL; @@ -654,9 +678,9 @@ static int cl_write(gpudata *dst, size_t dstoff, const void *src, size_t sz) { num_ev = 1; } - ctx->err = clEnqueueWriteBuffer(ctx->q, dst->buf, CL_TRUE, dstoff, sz, src, - num_ev, evl, NULL); - if (err != CL_SUCCESS) return GA_IMPL_ERROR; + CL_CHECK(ctx->err, clEnqueueWriteBuffer(ctx->q, dst->buf, CL_TRUE, dstoff, + sz, src, num_ev, evl, NULL)); + if (dst->ev != NULL) clReleaseEvent(dst->ev); dst->ev = NULL; @@ -672,7 +696,7 @@ static int cl_memset(gpudata *dst, size_t offset, int data) { gpukernel *m; cl_mem_flags fl; int type; - int r, res = GA_IMPL_ERROR; + int r, res; unsigned char val = (unsigned char)data; cl_uint pattern = (cl_uint)val & (cl_uint)val >> 8 & \ @@ -681,14 +705,14 @@ static int cl_memset(gpudata *dst, size_t offset, int data) { ASSERT_BUF(dst); ASSERT_CTX(ctx); - ctx->err = clGetMemObjectInfo(dst->buf, CL_MEM_FLAGS, sizeof(fl), &fl, NULL); - if (ctx->err != CL_SUCCESS) return GA_IMPL_ERROR; + CL_CHECK(ctx->err, clGetMemObjectInfo(dst->buf, CL_MEM_FLAGS, 
sizeof(fl), + &fl, NULL)); - if (fl & CL_MEM_READ_ONLY) return GA_READONLY_ERROR; + if (fl & CL_MEM_READ_ONLY) + return error_set(ctx->err, GA_READONLY_ERROR, "destination is read only"); - ctx->err = clGetMemObjectInfo(dst->buf, CL_MEM_SIZE, sizeof(bytes), &bytes, - NULL); - if (ctx->err != CL_SUCCESS) return GA_IMPL_ERROR; + CL_CHECK(ctx->err, clGetMemObjectInfo(dst->buf, CL_MEM_SIZE, sizeof(bytes), + &bytes, NULL)); bytes -= offset; @@ -722,8 +746,7 @@ static int cl_memset(gpudata *dst, size_t offset, int data) { "i += get_global_size(0)) {mem[i] = %u; }}", offset, n, pattern); } else { - if (check_ext(ctx, CL_SMALL)) - return GA_DEVSUP_ERROR; + GA_CHECK(check_ext(ctx, CL_SMALL)); n = bytes; r = snprintf(local_kern, sizeof(local_kern), "__kernel void kmemset(__global unsigned char *mem) {" @@ -740,8 +763,8 @@ static int cl_memset(gpudata *dst, size_t offset, int data) { rlk[0] = local_kern; type = GA_BUFFER; - m = cl_newkernel((gpucontext *)ctx, 1, rlk, &sz, "kmemset", 1, &type, 0, &res, NULL); - if (m == NULL) return res; + m = cl_newkernel((gpucontext *)ctx, 1, rlk, &sz, "kmemset", 1, &type, 0, NULL); + if (m == NULL) return ctx->err->code; /* Cheap kernel scheduling */ res = cl_property(NULL, NULL, m, GA_KERNEL_PROP_MAXLSIZE, &ls); @@ -766,17 +789,17 @@ static int cl_check_extensions(const char **preamble, unsigned int *count, (*count)++; } if (flags & GA_USE_SMALL) { - if (check_ext(ctx, CL_SMALL)) return GA_DEVSUP_ERROR; + GA_CHECK(check_ext(ctx, CL_SMALL)); preamble[*count] = PRAGMA CL_SMALL ENABLE; (*count)++; } if (flags & GA_USE_DOUBLE) { - if (check_ext(ctx, CL_DOUBLE)) return GA_DEVSUP_ERROR; + GA_CHECK(check_ext(ctx, CL_DOUBLE)); preamble[*count] = PRAGMA CL_DOUBLE ENABLE; (*count)++; } if (flags & GA_USE_COMPLEX) { - return GA_DEVSUP_ERROR; // for now + return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Complex are not supported yet"); } // GA_USE_HALF should always work /* @@ -787,7 +810,7 @@ static int cl_check_extensions(const char **preamble, 
unsigned int *count, } */ if (flags & GA_USE_CUDA) { - return GA_DEVSUP_ERROR; + return error_set(ctx->err, GA_DEVSUP_ERROR, "Cuda kernels not supported on opencl devices"); } return GA_NO_ERROR; } @@ -795,8 +818,7 @@ static int cl_check_extensions(const char **preamble, unsigned int *count, static gpukernel *cl_newkernel(gpucontext *c, unsigned int count, const char **strings, const size_t *lengths, const char *fname, unsigned int argcount, - const int *types, int flags, int *ret, - char **err_str) { + const int *types, int flags, char **err_str) { cl_ctx *ctx = (cl_ctx *)c; gpukernel *res; cl_device_id dev; @@ -806,38 +828,47 @@ static gpukernel *cl_newkernel(gpucontext *c, unsigned int count, const char *preamble[5]; size_t *newl = NULL; const char **news = NULL; + cl_int err; unsigned int n = 0; - int error; strb debug_msg = STRB_STATIC_INIT; size_t log_size; ASSERT_CTX(ctx); - if (count == 0) FAIL(NULL, GA_VALUE_ERROR); + if (count == 0) { + error_set(ctx->err, GA_VALUE_ERROR, "Empty kernel source list"); + return NULL; + } - dev = get_dev(ctx->ctx, ret); + dev = get_dev(ctx->ctx, ctx->err); if (dev == NULL) return NULL; if (flags & GA_USE_BINARY) { // GA_USE_BINARY is exclusive - if (flags & ~GA_USE_BINARY) - FAIL(NULL, GA_INVALID_ERROR); + if (flags & ~GA_USE_BINARY) { + error_set(ctx->err, GA_INVALID_ERROR, "Cannot combine GA_USE_BINARY with any other flag"); + return NULL; + } // We need the length for binary data and there is only one blob. 
- if (count != 1 || lengths == NULL || lengths[0] == 0) - FAIL(NULL, GA_VALUE_ERROR); - p = clCreateProgramWithBinary(ctx->ctx, 1, &dev, lengths, (const unsigned char **)strings, NULL, &ctx->err); - if (ctx->err != CL_SUCCESS) { - FAIL(NULL, GA_IMPL_ERROR); + if (count != 1 || lengths == NULL || lengths[0] == 0) { + error_set(ctx->err, GA_VALUE_ERROR, "GA_USE_BINARY requires the length to be specified"); + return NULL; + } + p = clCreateProgramWithBinary(ctx->ctx, 1, &dev, lengths, (const unsigned char **)strings, NULL, &err); + if (err != CL_SUCCESS) { + error_cl(ctx->err, "clCreateProgramWithBinary", err); + return NULL; } } else { - error = cl_check_extensions(preamble, &n, flags, ctx); - if (error != GA_NO_ERROR) FAIL(NULL, error); + if (cl_check_extensions(preamble, &n, flags, ctx)) + return NULL; if (n != 0) { news = calloc(count+n, sizeof(const char *)); if (news == NULL) { - FAIL(NULL, GA_SYS_ERROR); + error_sys(ctx->err, "calloc"); + return NULL; } memcpy(news, preamble, n*sizeof(const char *)); memcpy(news+n, strings, count*sizeof(const char *)); @@ -847,7 +878,8 @@ static gpukernel *cl_newkernel(gpucontext *c, unsigned int count, newl = calloc(count+n, sizeof(size_t)); if (newl == NULL) { free(news); - FAIL(NULL, GA_MEMORY_ERROR); + error_sys(ctx->err, "calloc"); + return NULL; } memcpy(newl+n, lengths, count*sizeof(size_t)); } @@ -856,19 +888,20 @@ static gpukernel *cl_newkernel(gpucontext *c, unsigned int count, newl = (size_t *)lengths; } - p = clCreateProgramWithSource(ctx->ctx, count+n, news, newl, &ctx->err); + p = clCreateProgramWithSource(ctx->ctx, count+n, news, newl, &err); if (ctx->err != CL_SUCCESS) { if (n != 0) { free(news); free(newl); } - FAIL(NULL, GA_IMPL_ERROR); + error_cl(ctx->err, "clCreateProgramWithSource", err); + return NULL; } } - ctx->err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL); - if (ctx->err != CL_SUCCESS) { - if (ctx->err == CL_BUILD_PROGRAM_FAILURE && err_str!=NULL) { + err = clBuildProgram(p, 0, NULL, NULL, NULL, 
NULL); + if (err != CL_SUCCESS) { + if (err == CL_BUILD_PROGRAM_FAILURE && err_str != NULL) { *err_str = NULL; // Fallback, in case there's an error // We're substituting debug_msg for a string with this first line: @@ -891,7 +924,7 @@ static gpukernel *cl_newkernel(gpucontext *c, unsigned int count, strb_append0(&debug_msg); // Make sure a final '\0' is present - if(!strb_error(&debug_msg)) { // Make sure the strb is in a valid state + if (!strb_error(&debug_msg)) { // Make sure the strb is in a valid state *err_str = memdup(debug_msg.s, debug_msg.l); // If there's a memory alloc error, fall-through : announcing a compile error is more important } @@ -904,7 +937,8 @@ static gpukernel *cl_newkernel(gpucontext *c, unsigned int count, free(news); free(newl); } - FAIL(NULL, GA_IMPL_ERROR); + error_cl(ctx->err, "clBuildProgram", err); + return NULL; } if (n != 0) { @@ -913,32 +947,38 @@ static gpukernel *cl_newkernel(gpucontext *c, unsigned int count, } res = malloc(sizeof(*res)); - if (res == NULL) FAIL(NULL, GA_MEMORY_ERROR); + if (res == NULL) { + error_sys(ctx->err, "malloc"); + return NULL; + } res->refcnt = 1; res->ev = NULL; res->argcount = argcount; - res->k = clCreateKernel(p, fname, &ctx->err); + res->k = clCreateKernel(p, fname, &err); res->types = NULL; /* This avoids a crash in cl_releasekernel */ res->evr = NULL; /* This avoids a crash in cl_releasekernel */ res->ctx = ctx; ctx->refcnt++; clReleaseProgram(p); TAG_KER(res); - if (ctx->err != CL_SUCCESS) { + if (err != CL_SUCCESS) { cl_releasekernel(res); - FAIL(NULL, GA_IMPL_ERROR); + error_cl(ctx->err, "clCreateKernel", err); + return NULL; } res->types = calloc(argcount, sizeof(int)); if (res->types == NULL) { cl_releasekernel(res); - FAIL(NULL, GA_IMPL_ERROR); + error_sys(ctx->err, "calloc"); + return NULL; } memcpy(res->types, types, argcount * sizeof(int)); res->evr = calloc(argcount, sizeof(cl_event *)); if (res->evr == NULL) { cl_releasekernel(res); - FAIL(NULL, GA_IMPL_ERROR); + error_sys(ctx->err, 
"calloc"); + return NULL; } return res; @@ -971,29 +1011,26 @@ static int cl_setkernelarg(gpukernel *k, unsigned int i, void *a) { cl_long stemp; switch (k->types[i]) { case GA_POINTER: - return GA_DEVSUP_ERROR; + return error_set(ctx->err, GA_DEVSUP_ERROR, "Cannot set raw pointers as kernel arguments"); case GA_BUFFER: btmp = (gpudata *)a; - ctx->err = clSetKernelArg(k->k, i, sizeof(cl_mem), &btmp->buf); + CL_CHECK(ctx->err, clSetKernelArg(k->k, i, sizeof(cl_mem), &btmp->buf)); k->evr[i] = &btmp->ev; break; case GA_SIZE: temp = *((size_t *)a); - ctx->err = clSetKernelArg(k->k, i, gpuarray_get_elsize(GA_ULONG), &temp); + CL_CHECK(ctx->err, clSetKernelArg(k->k, i, gpuarray_get_elsize(GA_ULONG), &temp)); k->evr[i] = NULL; break; case GA_SSIZE: stemp = *((ssize_t *)a); - ctx->err = clSetKernelArg(k->k, i, gpuarray_get_elsize(GA_LONG), &stemp); + CL_CHECK(ctx->err, clSetKernelArg(k->k, i, gpuarray_get_elsize(GA_LONG), &stemp)); k->evr[i] = NULL; break; default: - ctx->err = clSetKernelArg(k->k, i, gpuarray_get_elsize(k->types[i]), a); + CL_CHECK(ctx->err, clSetKernelArg(k->k, i, gpuarray_get_elsize(k->types[i]), a)); k->evr[i] = NULL; } - if (ctx->err != CL_SUCCESS) { - return GA_IMPL_ERROR; - } return GA_NO_ERROR; } @@ -1007,33 +1044,31 @@ static int cl_callkernel(gpukernel *k, unsigned int n, cl_device_id dev; cl_uint num_ev; cl_uint i; - int res = 0; + cl_int err; ASSERT_KER(k); ASSERT_CTX(ctx); if (n > 3) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "Call with more than 3 dimensions"); - dev = get_dev(ctx->ctx, &res); - if (dev == NULL) return res; + dev = get_dev(ctx->ctx, ctx->err); + if (dev == NULL) return ctx->err->code; if (args != NULL) { for (i = 0; i < k->argcount; i++) { - err = cl_setkernelarg(k, i, args[i]); - if (err != GA_NO_ERROR) return err; + GA_CHECK(cl_setkernelarg(k, i, args[i])); } } if (shared != 0) { // the shared memory pointer must be the last argument - ctx->err = clSetKernelArg(k->k, k->argcount, shared, NULL); - 
if (ctx->err != CL_SUCCESS) return GA_IMPL_ERROR; + CL_CHECK(ctx->err, clSetKernelArg(k->k, k->argcount, shared, NULL)); } evw = calloc(sizeof(cl_event), k->argcount); if (evw == NULL) { - return GA_MEMORY_ERROR; + return error_sys(ctx->err, "calloc"); } num_ev = 0; @@ -1056,10 +1091,11 @@ static int cl_callkernel(gpukernel *k, unsigned int n, case 1: _gs[0] = gs[0] * ls[0]; } - ctx->err = clEnqueueNDRangeKernel(ctx->q, k->k, n, NULL, _gs, ls, + err = clEnqueueNDRangeKernel(ctx->q, k->k, n, NULL, _gs, ls, num_ev, evw, &ev); free(evw); - if (ctx->err != CL_SUCCESS) return GA_IMPL_ERROR; + if (err != CL_SUCCESS) + return error_cl(ctx->err, "clEnqueueNDRangeKernel", err); for (i = 0; i < k->argcount; i++) { if (k->types[i] == GA_BUFFER) { @@ -1081,23 +1117,20 @@ static int cl_kernelbin(gpukernel *k, size_t *sz, void **obj) { cl_program p; size_t rsz; void *res; + cl_int err; ASSERT_KER(k); ASSERT_CTX(ctx); - ctx->err = clGetKernelInfo(k->k, CL_KERNEL_PROGRAM, sizeof(p), &p, NULL); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; - ctx->err = clGetProgramInfo(p, CL_PROGRAM_BINARY_SIZES, sizeof(rsz), &rsz, NULL); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; + CL_CHECK(ctx->err, clGetKernelInfo(k->k, CL_KERNEL_PROGRAM, sizeof(p), &p, NULL)); + CL_CHECK(ctx->err, clGetProgramInfo(p, CL_PROGRAM_BINARY_SIZES, sizeof(rsz), &rsz, NULL)); res = malloc(rsz); if (res == NULL) - return GA_MEMORY_ERROR; - ctx->err = clGetProgramInfo(p, CL_PROGRAM_BINARIES, sizeof(res), &res, NULL); + return error_sys(ctx->err, "malloc"); + err = clGetProgramInfo(p, CL_PROGRAM_BINARIES, sizeof(res), &res, NULL); if (ctx->err != CL_SUCCESS) { free(res); - return GA_IMPL_ERROR; + return error_cl(ctx->err, "clProgramGetInfo", err); } *sz = rsz; *obj = res; @@ -1111,9 +1144,7 @@ static int cl_sync(gpudata *b) { ASSERT_CTX(ctx); if (b->ev != NULL) { - ctx->err = clWaitForEvents(1, &b->ev); - if (ctx->err != GA_NO_ERROR) - return GA_IMPL_ERROR; + CL_CHECK(ctx->err, clWaitForEvents(1, 
&b->ev)); clReleaseEvent(b->ev); b->ev = NULL; } @@ -1125,7 +1156,7 @@ static int cl_transfer(gpudata *dst, size_t dstoff, ASSERT_BUF(dst); ASSERT_BUF(src); - return GA_UNSUPPORTED_ERROR; + return error_set(dst->ctx->err, GA_UNSUPPORTED_ERROR, "Operation not supported"); } extern gpuarray_blas_ops clblas_ops; @@ -1147,13 +1178,13 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, if (prop_id < GA_BUFFER_PROP_START) { if (ctx == NULL) - return GA_VALUE_ERROR; + return error_set(global_err, GA_VALUE_ERROR, "Requesting context property with no context"); } else if (prop_id < GA_KERNEL_PROP_START) { if (buf == NULL) - return GA_VALUE_ERROR; + return error_set(ctx ? ctx->err : global_err, GA_VALUE_ERROR, "Requesting buffer property with no buffer"); } else { if (k == NULL) - return GA_VALUE_ERROR; + return error_set(ctx ? ctx->err : global_err, GA_VALUE_ERROR, "Requesting kernel property with no kernel"); } switch (prop_id) { @@ -1163,75 +1194,47 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, cl_uint ui; case GA_CTX_PROP_DEVNAME: - ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), - &id, NULL); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; - ctx->err = clGetDeviceInfo(id, CL_DEVICE_NAME, 256, (char *)res, NULL); - return (ctx->err != CL_SUCCESS) ? GA_IMPL_ERROR : GA_NO_ERROR; + CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, + sizeof(id), &id, NULL)); + CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_NAME, 256, (char *)res, + NULL)); + return GA_NO_ERROR; case GA_CTX_PROP_PCIBUSID: /* For the moment, PCI Bus ID is not supported for OpenCL. 
*/ - return GA_DEVSUP_ERROR; + return error_set(ctx->err, GA_DEVSUP_ERROR, "Can't get PCI bus ID on OpenCL"); case GA_CTX_PROP_MAXLSIZE: - ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), - &id, NULL); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; - ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_ITEM_SIZES, 0, NULL, - &sz); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; - psz = malloc(sz); - if (psz == NULL) - return GA_MEMORY_ERROR; - ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_ITEM_SIZES, sz, psz, NULL); - if (ctx->err != CL_SUCCESS) { - free(psz); - return GA_IMPL_ERROR; - } + CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, + sizeof(id), &id, NULL)); + CL_GET_PROP(ctx->err, clGetDeviceInfo, id, CL_DEVICE_MAX_WORK_ITEM_SIZES, psz); *((size_t *)res) = psz[0]; free(psz); return GA_NO_ERROR; case GA_CTX_PROP_LMEMSIZE: - ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), - &id, NULL); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; - ctx->err = clGetDeviceInfo(id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(sz), &sz, - NULL); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; + CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, + sizeof(id), &id, NULL)); + CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_LOCAL_MEM_SIZE, + sizeof(sz), &sz, NULL)); *((size_t *)res) = sz; return GA_NO_ERROR; case GA_CTX_PROP_NUMPROCS: - ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), - &id, NULL); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; - ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(ui), - &ui, NULL); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; + CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, + sizeof(id), &id, NULL)); + CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(ui), &ui, NULL)); *((unsigned int *)res) = ui; return GA_NO_ERROR; case GA_CTX_PROP_MAXGSIZE: - ctx->err = 
clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id, - NULL); - if (ctx->err != GA_NO_ERROR) - return GA_IMPL_ERROR; - ctx->err = clGetDeviceInfo(id, CL_DEVICE_ADDRESS_BITS, sizeof(ui), &ui, - NULL); - if (ctx->err != GA_NO_ERROR) - return GA_IMPL_ERROR; - ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(sz), - &sz, NULL); - if (ctx->err != GA_NO_ERROR) - return GA_IMPL_ERROR; + CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, + sizeof(id), &id, NULL)); + CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_ADDRESS_BITS, sizeof(ui), + &ui, NULL)); + CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_GROUP_SIZE, + sizeof(sz), &sz, NULL)); if (ui == 32) { sz = 4294967295UL/sz; } else if (ui == 64) { @@ -1245,11 +1248,11 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, case GA_CTX_PROP_BLAS_OPS: { int e; - if ((e = load_libclblas()) == GA_NO_ERROR) { + if ((e = load_libclblas(ctx->err)) == GA_NO_ERROR) { *((gpuarray_blas_ops **)res) = &clblas_ops; return e; } - if ((e = load_libclblast()) == GA_NO_ERROR) { + if ((e = load_libclblast(ctx->err)) == GA_NO_ERROR) { *((gpuarray_blas_ops **)res) = &clblast_ops; return e; } @@ -1260,7 +1263,7 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, // TODO Complete in the future whenif a multi-gpu collectives API for // opencl appears *((void **)res) = NULL; - return GA_DEVSUP_ERROR; + return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives operations not supported on OpenCL"); case GA_CTX_PROP_BIN_ID: *((const char **)res) = ctx->bin_id; @@ -1271,14 +1274,10 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, return GA_NO_ERROR; case GA_CTX_PROP_TOTAL_GMEM: - ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id, - NULL); - if (ctx->err != GA_NO_ERROR) - return GA_IMPL_ERROR; - ctx->err = clGetDeviceInfo(id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(sz), &sz, - NULL); - if 
(ctx->err != GA_NO_ERROR) - return GA_IMPL_ERROR; + CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, + sizeof(id), &id, NULL)); + CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_GLOBAL_MEM_SIZE, + sizeof(sz), &sz, NULL)); *((size_t *)res) = sz; return GA_NO_ERROR; @@ -1286,14 +1285,10 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, /* There is no way to query free memory so we just return the largest block size */ case GA_CTX_PROP_LARGEST_MEMBLOCK: - ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id, - NULL); - if (ctx->err != GA_NO_ERROR) - return GA_IMPL_ERROR; - ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(sz), - &sz, NULL); - if (ctx->err != GA_NO_ERROR) - return GA_IMPL_ERROR; + CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, + sizeof(id), &id, NULL)); + CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(sz), &sz, NULL)); *((size_t *)res) = sz; return GA_NO_ERROR; @@ -1320,64 +1315,28 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, return GA_NO_ERROR; case GA_CTX_PROP_MAXLSIZE0: - ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), - &id, NULL); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; - ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_ITEM_SIZES, 0, NULL, - &sz); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; - psz = malloc(sz); - if (psz == NULL) - return GA_MEMORY_ERROR; - ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_ITEM_SIZES, sz, psz, NULL); - if (ctx->err != CL_SUCCESS) { - free(psz); - return GA_IMPL_ERROR; - } + CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, + sizeof(id), &id, NULL)); + CL_GET_PROP(ctx->err, clGetDeviceInfo, id, CL_DEVICE_MAX_WORK_ITEM_SIZES, + psz); *((size_t *)res) = psz[0]; free(psz); return GA_NO_ERROR; case GA_CTX_PROP_MAXLSIZE1: - ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, 
sizeof(id), - &id, NULL); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; - ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_ITEM_SIZES, 0, NULL, - &sz); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; - psz = malloc(sz); - if (psz == NULL) - return GA_MEMORY_ERROR; - ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_ITEM_SIZES, sz, psz, NULL); - if (ctx->err != CL_SUCCESS) { - free(psz); - return GA_IMPL_ERROR; - } + CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, + sizeof(id), &id, NULL)); + CL_GET_PROP(ctx->err, clGetDeviceInfo, id, CL_DEVICE_MAX_WORK_ITEM_SIZES, + psz); *((size_t *)res) = psz[1]; free(psz); return GA_NO_ERROR; case GA_CTX_PROP_MAXLSIZE2: - ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), - &id, NULL); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; - ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_ITEM_SIZES, 0, NULL, - &sz); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; - psz = malloc(sz); - if (psz == NULL) - return GA_MEMORY_ERROR; - ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_ITEM_SIZES, sz, psz, NULL); - if (ctx->err != CL_SUCCESS) { - free(psz); - return GA_IMPL_ERROR; - } + CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, + sizeof(id), &id, NULL)); + CL_GET_PROP(ctx->err, clGetDeviceInfo, id, CL_DEVICE_MAX_WORK_ITEM_SIZES, + psz); *((size_t *)res) = psz[2]; free(psz); return GA_NO_ERROR; @@ -1387,10 +1346,8 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, return GA_NO_ERROR; case GA_BUFFER_PROP_SIZE: - ctx->err = clGetMemObjectInfo(buf->buf, CL_MEM_SIZE, sizeof(sz), &sz, - NULL); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; + CL_CHECK(ctx->err, clGetMemObjectInfo(buf->buf, CL_MEM_SIZE, sizeof(sz), + &sz, NULL)); *((size_t *)res) = sz; return GA_NO_ERROR; @@ -1401,27 +1358,20 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, return GA_NO_ERROR; case GA_KERNEL_PROP_MAXLSIZE: - 
ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), - &id, NULL); - if (ctx->err != GA_NO_ERROR) - return GA_IMPL_ERROR; - ctx->err = clGetKernelWorkGroupInfo(k->k, id, CL_KERNEL_WORK_GROUP_SIZE, - sizeof(sz), &sz, NULL); - if (ctx->err != GA_NO_ERROR) - return GA_IMPL_ERROR; + CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, + sizeof(id), &id, NULL)); + CL_CHECK(ctx->err, clGetKernelWorkGroupInfo(k->k, id, + CL_KERNEL_WORK_GROUP_SIZE, + sizeof(sz), &sz, NULL)); *((size_t *)res) = sz; return GA_NO_ERROR; case GA_KERNEL_PROP_PREFLSIZE: - ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), - &id, NULL); - if (ctx->err != GA_NO_ERROR) - return GA_IMPL_ERROR; - ctx->err = clGetKernelWorkGroupInfo(k->k, id, - CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, - sizeof(sz), &sz, NULL); - if (ctx->err != GA_NO_ERROR) - return GA_IMPL_ERROR; + CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, + sizeof(id), &id, NULL)); + CL_CHECK(ctx->err, clGetKernelWorkGroupInfo(k->k, id, + CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, + sizeof(sz), &sz, NULL)); *((size_t *)res) = sz; return GA_NO_ERROR; @@ -1434,17 +1384,17 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, return GA_NO_ERROR; default: - return GA_INVALID_ERROR; + return error_fmt(ctx->err, GA_INVALID_ERROR, "Invalid property: %d", prop_id); } } static const char *cl_error(gpucontext *c) { cl_ctx *ctx = (cl_ctx *)c; if (ctx == NULL){ - return get_error_string(err); - }else{ + return global_err->msg; + } else { ASSERT_CTX(ctx); - return get_error_string(ctx->err); + return ctx->err->msg; } } diff --git a/src/private.h b/src/private.h index 1a1584dfb1..8aa4ec3c99 100644 --- a/src/private.h +++ b/src/private.h @@ -82,7 +82,7 @@ struct _gpuarray_buffer_ops { gpudata *(*buffer_alloc)(gpucontext *ctx, size_t sz, void *data, int flags); void (*buffer_retain)(gpudata *b); void (*buffer_release)(gpudata *b); - int 
(*buffer_share)(gpudata *a, gpudata *b, int *ret); + int (*buffer_share)(gpudata *a, gpudata *b); int (*buffer_move)(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, size_t sz); int (*buffer_read)(void *dst, gpudata *src, size_t srcoff, size_t sz); diff --git a/src/private_opencl.h b/src/private_opencl.h index 2a523f5bda..a0d2620917 100644 --- a/src/private_opencl.h +++ b/src/private_opencl.h @@ -30,13 +30,46 @@ #define CLEAR(o) #endif +const char *cl_error_string(cl_int); + +static inline int error_cl(error *e, const char *msg, cl_int err) { + return error_fmt(e, GA_IMPL_ERROR, "%s: %s", msg, cl_error_string(err)); +} + +#define CL_CHECK(e, cmd) do { \ + cl_int err = (cmd); \ + if (err != CL_SUCCESS) \ + return error_cl(e, #cmd, err); \ + } while(0) + +#define CL_CHECKN(e, cmd) do { \ + cl_int err = (cmd); \ + if (err != CL_SUCCESS) { \ + error_cl(e, #cmd, err); \ + return NULL; \ + } \ + } while(0) + +#define CL_GET_PROP(e, fn, obj, prop, val) do { \ + size_t sz; \ + cl_int err; \ + CL_CHECK(e, fn (obj, prop, 0, NULL, &sz)); \ + val = malloc(sz); \ + if (val == NULL) return error_sys(e, "malloc"); \ + err = fn (obj, prop, sz, val, NULL); \ + if (err != CL_SUCCESS) { \ + free(val); \ + val = NULL; \ + return error_cl(e, #fn, err); \ + } \ + } while(0) + typedef struct _cl_ctx { GPUCONTEXT_HEAD; cl_context ctx; cl_command_queue q; char *exts; char *preamble; - cl_int err; } cl_ctx; STATIC_ASSERT(sizeof(cl_ctx) <= sizeof(gpucontext), sizeof_struct_gpucontext_cl); From a0694f9efafd1a438ed36708f7b550793a794afb Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 24 Mar 2017 15:26:21 -0400 Subject: [PATCH 259/597] Error messages for clblas --- src/gpuarray_blas_opencl_clblas.c | 297 ++++++++++++------------------ src/loaders/libclblas.h | 18 ++ 2 files changed, 133 insertions(+), 182 deletions(-) diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index f6e51429b1..ab0e020ffd 100644 --- a/src/gpuarray_blas_opencl_clblas.c 
+++ b/src/gpuarray_blas_opencl_clblas.c @@ -34,13 +34,64 @@ static inline clblasTranspose convT(cb_transpose trans) { static unsigned int refcnt = 0; -static int setup(gpucontext *ctx) { - clblasStatus err; +static const char *estr(clblasStatus err) { + if (err > -1024) + return cl_error_string((cl_int)err); + switch (err) { + case clblasNotImplemented: + return "Unimplemented feature"; + case clblasNotInitialized: + return "Library not initialized"; + case clblasInvalidMatA: + return "matrix A is not a valid memory object"; + case clblasInvalidMatB: + return "matrix B is not a valid memory object"; + case clblasInvalidMatC: + return "matrix C is not a valid memory object"; + case clblasInvalidVecX: + return "vector X is not a valid memory object"; + case clblasInvalidVecY: + return "vector Y is not a valid memory object"; + case clblasInvalidDim: + return "An input dimension (M, N, K) is invalid"; + case clblasInvalidLeadDimA: + return "leading dimension for A must not be less than the size of the first dimension"; + case clblasInvalidLeadDimB: + return "leading dimension for B must not be less than the size of the second dimension"; + case clblasInvalidLeadDimC: + return "leading dimension for C must not be less than the size of the third dimension"; + case clblasInvalidIncX: + return "increment for X must not be 0"; + case clblasInvalidIncY: + return "increment for Y must not be 0"; + case clblasInsufficientMemMatA: + return "memory object for matrix A is too small"; + case clblasInsufficientMemMatB: + return "memory object for matrix B is too small"; + case clblasInsufficientMemMatC: + return "memory object for matrix C is too small"; + case clblasInsufficientMemVecX: + return "memory object for vector X is too small"; + case clblasInsufficientMemVecY: + return "memory object for vector Y is too small"; + default: + return "Unknow error"; + } +} +static inline int error_clblas(error *e, const char *msg, clblasStatus err) { + return error_fmt(e, GA_BLAS_ERROR, 
"%s: %s", msg, estr(err)); +} + +#define CLB_CHECK(e, cmd) do { \ + clblasStatus err = (cmd); \ + if (err != clblasSuccess) \ + return error_clblas(e, #cmd, err); \ + } while (0) + +static int setup(gpucontext *ctx) { if (refcnt == 0) { - err = clblasSetup(); - if (err != clblasSuccess) - return GA_BLAS_ERROR; + CLB_CHECK(ctx->err, clblasSetup()); } if (ctx->blas_handle == NULL) @@ -58,10 +109,6 @@ static void teardown(gpucontext *ctx) { clblasTeardown(); } -static const char *error(gpucontext *ctx) { - return "(clblas) error in blas call, no details for now."; -} - #define ARRAY_INIT(A) \ if (A->ev != NULL) \ evl[num_ev++] = A->ev @@ -72,15 +119,6 @@ static const char *error(gpucontext *ctx) { A->ev = ev; \ clRetainEvent(A->ev) -static int hgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, - size_t M, size_t N, size_t K, float alpha, - gpudata **A, size_t *offA, size_t lda, - gpudata **B, size_t *offB, size_t ldb, - float beta, gpudata **C, size_t *offC, size_t ldc, - size_t batchCount) { - return GA_DEVSUP_ERROR; -} - static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata **A, size_t *offA, size_t lda, @@ -92,18 +130,17 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, cl_event ev; size_t i; cl_uint num_ev = 0; - clblasStatus err; for (i = 0; i < batchCount; i++) { ARRAY_INIT(A[i]); ARRAY_INIT(B[i]); ARRAY_INIT(C[i]); - err = clblasSgemm(convO(order), convT(transA), convT(transB), M, N, K, - alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, - beta, C[i]->buf, offC[i], ldc, 1, &ctx->q, - num_ev, num_ev == 0 ? NULL : evl, &ev); - if (err != clblasSuccess) - return GA_BLAS_ERROR; + CLB_CHECK(ctx->err, clblasSgemm(convO(order), convT(transA), convT(transB), + M, N, K, + alpha, A[i]->buf, offA[i], lda, + B[i]->buf, offB[i], ldb, + beta, C[i]->buf, offC[i], ldc, 1, &ctx->q, + num_ev, num_ev == 0 ? 
NULL : evl, &ev)); ARRAY_FINI(A[i]); ARRAY_FINI(B[i]); ARRAY_FINI(C[i]); @@ -124,18 +161,17 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, cl_event ev; size_t i; cl_uint num_ev = 0; - clblasStatus err; for (i = 0; i < batchCount; i++) { ARRAY_INIT(A[i]); ARRAY_INIT(B[i]); ARRAY_INIT(C[i]); - err = clblasDgemm(convO(order), convT(transA), convT(transB), M, N, K, - alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, - beta, C[i]->buf, offC[i], ldc, 1, &ctx->q, - num_ev, num_ev == 0 ? NULL : evl, &ev); - if (err != clblasSuccess) - return GA_BLAS_ERROR; + CLB_CHECK(ctx->err, clblasDgemm(convO(order), convT(transA), convT(transB), + M, N, K, + alpha, A[i]->buf, offA[i], lda, + B[i]->buf, offB[i], ldb, + beta, C[i]->buf, offC[i], ldc, 1, &ctx->q, + num_ev, num_ev == 0 ? NULL : evl, &ev)); ARRAY_FINI(A[i]); ARRAY_FINI(B[i]); ARRAY_FINI(C[i]); @@ -145,65 +181,6 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, return GA_NO_ERROR; } -static int hgemvBatch(cb_order order, cb_transpose transA, - size_t M, size_t N, float alpha, - gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - float beta, gpudata **y, size_t *offY, size_t incY, - size_t batchCount, int flags) { - return GA_DEVSUP_ERROR; -} - -static int sgemvBatch(cb_order order, cb_transpose transA, - size_t M, size_t N, float alpha, - gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - float beta, gpudata **y, size_t *offY, size_t incY, - size_t batchCount, int flags) { - return GA_DEVSUP_ERROR; -} - -static int dgemvBatch(cb_order order, cb_transpose transA, - size_t M, size_t N, double alpha, - gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - double beta, gpudata **y, size_t *offY, size_t incY, - size_t batchCount, int flags) { - return GA_DEVSUP_ERROR; -} - -static int hgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t 
*offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, - gpudata **A, size_t *offA, size_t lda, - size_t batchCount, int flags) { - return GA_DEVSUP_ERROR; -} - -static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, - gpudata **A, size_t *offA, size_t lda, - size_t batchCount, int flags) { - return GA_DEVSUP_ERROR; -} - -static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, - gpudata **A, size_t *offA, size_t lda, - size_t batchCount, int flags) { - return GA_DEVSUP_ERROR; -} - -static int hdot( - size_t N, - gpudata *X, size_t offX, size_t incX, - gpudata *Y, size_t offY, size_t incY, - gpudata *Z, size_t offZ) { - return GA_DEVSUP_ERROR; -} - static int sdot( size_t N, gpudata *X, size_t offX, size_t incX, @@ -215,13 +192,11 @@ static int sdot( cl_event evl[3]; cl_event ev; gpudata *wbuf; - int alloc_err; - wbuf = opencl_ops.buffer_alloc( - (gpucontext*)ctx, - N*sizeof(float), NULL, GA_BUFFER_READ_WRITE, &alloc_err); - if (alloc_err != GA_NO_ERROR) - return alloc_err; + wbuf = opencl_ops.buffer_alloc((gpucontext*)ctx, + N*sizeof(float), NULL, GA_BUFFER_READ_WRITE); + if (wbuf == NULL) + return ctx->err->code; ARRAY_INIT(X); ARRAY_INIT(Y); @@ -234,14 +209,13 @@ static int sdot( Y->buf, offY, incY, wbuf->buf, 1, &ctx->q, num_ev, num_ev ? 
evl : NULL, &ev); + opencl_ops.buffer_release(wbuf); if (err != clblasSuccess) - return GA_BLAS_ERROR; + return error_clblas(ctx->err, "clblasSdot", err); ARRAY_FINI(X); ARRAY_FINI(Y); ARRAY_FINI(Z); - - opencl_ops.buffer_release(wbuf); clReleaseEvent(ev); return GA_NO_ERROR; @@ -258,13 +232,11 @@ static int ddot( cl_event evl[3]; cl_event ev; gpudata *wbuf; - int alloc_err; - wbuf = opencl_ops.buffer_alloc( - (gpucontext*)ctx, - N*sizeof(double), NULL, GA_BUFFER_READ_WRITE, &alloc_err); - if (alloc_err != GA_NO_ERROR) - return alloc_err; + wbuf = opencl_ops.buffer_alloc((gpucontext*)ctx, + N*sizeof(double), NULL, GA_BUFFER_READ_WRITE); + if (wbuf == NULL) + return ctx->err->code; ARRAY_INIT(X); ARRAY_INIT(Y); @@ -276,32 +248,24 @@ static int ddot( Y->buf, offY, incY, wbuf->buf, 1, &ctx->q, num_ev, num_ev ? evl : NULL, &ev); + opencl_ops.buffer_release(wbuf); if (err != clblasSuccess) - return GA_BLAS_ERROR; + return error_clblas(ctx->err, "clblasDdot", err); ARRAY_FINI(X); ARRAY_FINI(Y); ARRAY_FINI(Z); - opencl_ops.buffer_release(wbuf); clReleaseEvent(ev); return GA_NO_ERROR; } -static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, - float alpha, gpudata *A, size_t offA, size_t lda, - gpudata *X, size_t offX, int incX, float beta, - gpudata *Y, size_t offY, int incY) { - return GA_DEVSUP_ERROR; -} - static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, float beta, gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; - clblasStatus err; cl_uint num_ev = 0; cl_event evl[3]; cl_event ev; @@ -310,12 +274,10 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, ARRAY_INIT(X); ARRAY_INIT(Y); - err = clblasSgemv(convO(order), convT(transA), M, N, alpha, - A->buf, offA, lda, X->buf, offX, incX, - beta, Y->buf, offY, incY, 1, &ctx->q, - num_ev, num_ev == 0 ? 
NULL : evl, &ev); - if (err != clblasSuccess) - return GA_BLAS_ERROR; + CLB_CHECK(ctx->err, clblasSgemv(convO(order), convT(transA), M, N, alpha, + A->buf, offA, lda, X->buf, offX, incX, + beta, Y->buf, offY, incY, 1, &ctx->q, + num_ev, num_ev == 0 ? NULL : evl, &ev)); ARRAY_FINI(A); ARRAY_FINI(X); @@ -331,7 +293,6 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, gpudata *X, size_t offX, int incX, double beta, gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; - clblasStatus err; cl_uint num_ev = 0; cl_event evl[3]; cl_event ev; @@ -340,12 +301,10 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, ARRAY_INIT(X); ARRAY_INIT(Y); - err = clblasDgemv(convO(order), convT(transA), M, N, alpha, - A->buf, offA, lda, X->buf, offX, incX, - beta, Y->buf, offY, incY, 1, &ctx->q, - num_ev, num_ev == 0 ? NULL : evl, &ev); - if (err != clblasSuccess) - return GA_BLAS_ERROR; + CLB_CHECK(ctx->err, clblasDgemv(convO(order), convT(transA), M, N, alpha, + A->buf, offA, lda, X->buf, offX, incX, + beta, Y->buf, offY, incY, 1, &ctx->q, + num_ev, num_ev == 0 ? 
NULL : evl, &ev)); ARRAY_FINI(A); ARRAY_FINI(X); @@ -356,21 +315,12 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, return GA_NO_ERROR; } -static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, - size_t M, size_t N, size_t K, float alpha, - gpudata *A, size_t offA, size_t lda, - gpudata *B, size_t offB, size_t ldb, float beta, - gpudata *C, size_t offC, size_t ldc) { - return GA_DEVSUP_ERROR; -} - static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, float beta, gpudata *C, size_t offC, size_t ldc) { cl_ctx *ctx = A->ctx; - clblasStatus err; cl_uint num_ev = 0; cl_event evl[3]; cl_event ev; @@ -379,12 +329,11 @@ static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB, ARRAY_INIT(B); ARRAY_INIT(C); - err = clblasSgemm(convO(order), convT(transA), convT(transB), M, N, K, - alpha, A->buf, offA, lda, B->buf, offB, ldb, - beta, C->buf, offC, ldc, 1, &ctx->q, - num_ev, num_ev == 0 ? NULL : evl, &ev); - if (err != clblasSuccess) - return GA_BLAS_ERROR; + CLB_CHECK(ctx->err, clblasSgemm(convO(order), convT(transA), convT(transB), + M, N, K, + alpha, A->buf, offA, lda, B->buf, offB, ldb, + beta, C->buf, offC, ldc, 1, &ctx->q, + num_ev, num_ev == 0 ? NULL : evl, &ev)); ARRAY_FINI(A); ARRAY_FINI(B); @@ -401,7 +350,6 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, gpudata *B, size_t offB, size_t ldb, double beta, gpudata *C, size_t offC, size_t ldc) { cl_ctx *ctx = A->ctx; - clblasStatus err; cl_uint num_ev = 0; cl_event evl[3]; cl_event ev; @@ -410,12 +358,11 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, ARRAY_INIT(B); ARRAY_INIT(C); - err = clblasDgemm(convO(order), convT(transA), convT(transB), M, N, K, - alpha, A->buf, offA, lda, B->buf, offB, ldb, - beta, C->buf, offC, ldc, 1, &ctx->q, - num_ev, num_ev == 0 ? 
NULL : evl, &ev); - if (err != clblasSuccess) - return GA_BLAS_ERROR; + CLB_CHECK(ctx->err, clblasDgemm(convO(order), convT(transA), convT(transB), + M, N, K, + alpha, A->buf, offA, lda, B->buf, offB, ldb, + beta, C->buf, offC, ldc, 1, &ctx->q, + num_ev, num_ev == 0 ? NULL : evl, &ev)); ARRAY_FINI(A); ARRAY_FINI(B); @@ -426,13 +373,6 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, return GA_NO_ERROR; } -static int hger(cb_order order, size_t M, size_t N, float alpha, - gpudata *X, size_t offX, int incX, - gpudata *Y, size_t offY, int incY, - gpudata *A, size_t offA, size_t lda) { - return GA_DEVSUP_ERROR; -} - static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, @@ -441,17 +381,14 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, cl_event evl[3]; cl_event ev; cl_uint num_ev = 0; - clblasStatus err; ARRAY_INIT(X); ARRAY_INIT(Y); ARRAY_INIT(A); - err = clblasSger(convO(order), M, N, alpha, X->buf, offX, incX, - Y->buf, offY, incY, A->buf, offA, lda, 1, &ctx->q, - num_ev, num_ev == 0 ? NULL : evl, &ev); - if (err != clblasSuccess) - return GA_BLAS_ERROR; + CLB_CHECK(ctx->err, clblasSger(convO(order), M, N, alpha, X->buf, offX, incX, + Y->buf, offY, incY, A->buf, offA, lda, 1, &ctx->q, + num_ev, num_ev == 0 ? NULL : evl, &ev)); ARRAY_FINI(X); ARRAY_FINI(Y); @@ -470,17 +407,14 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, cl_event evl[3]; cl_event ev; cl_uint num_ev = 0; - clblasStatus err; ARRAY_INIT(X); ARRAY_INIT(Y); ARRAY_INIT(A); - err = clblasDger(convO(order), M, N, alpha, X->buf, offX, incX, - Y->buf, offY, incY, A->buf, offA, lda, 1, &ctx->q, - num_ev, num_ev == 0 ? NULL : evl, &ev); - if (err != clblasSuccess) - return GA_BLAS_ERROR; + CLB_CHECK(ctx->err, clblasDger(convO(order), M, N, alpha, X->buf, offX, incX, + Y->buf, offY, incY, A->buf, offA, lda, 1, &ctx->q, + num_ev, num_ev == 0 ? 
NULL : evl, &ev)); ARRAY_FINI(X); ARRAY_FINI(Y); @@ -494,26 +428,25 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, gpuarray_blas_ops clblas_ops = { setup, teardown, - error, - hdot, /* TODO */ + NULL, /* hdot */ sdot, ddot, - hgemv, /* TODO */ + NULL, /* hgemv */ sgemv, dgemv, - hgemm, /* TODO */ + NULL, /* hgemm */ sgemm, dgemm, - hger, /* TODO */ + NULL, /* hger */ sger, dger, - hgemmBatch, /* TODO */ + NULL, /* hgemmBatch */ sgemmBatch, dgemmBatch, - hgemvBatch, /* TODO */ - sgemvBatch, /* TODO */ - dgemvBatch, /* TODO */ - hgerBatch, /* TODO */ - sgerBatch, /* TODO */ - dgerBatch, /* TODO */ + NULL, /* hgemvBatch */ + NULL, /* sgemvBatch */ + NULL, /* dgemvBatch */ + NULL, /* hgerBatch */ + NULL, /* sgerBatch */ + NULL, /* dgerBatch */ }; diff --git a/src/loaders/libclblas.h b/src/loaders/libclblas.h index 483f1fa87b..ccdee19983 100644 --- a/src/loaders/libclblas.h +++ b/src/loaders/libclblas.h @@ -18,6 +18,24 @@ typedef enum clblasTranspose_ { typedef enum clblasStatus_ { clblasSuccess = CL_SUCCESS, /* Rest is not exposed from here */ + clblasNotImplemented = -1024, + clblasNotInitialized, + clblasInvalidMatA, + clblasInvalidMatB, + clblasInvalidMatC, + clblasInvalidVecX, + clblasInvalidVecY, + clblasInvalidDim, + clblasInvalidLeadDimA, + clblasInvalidLeadDimB, + clblasInvalidLeadDimC, + clblasInvalidIncX, + clblasInvalidIncY, + clblasInsufficientMemMatA, + clblasInsufficientMemMatB, + clblasInsufficientMemMatC, + clblasInsufficientMemVecX, + clblasInsufficientMemVecY, } clblasStatus; int load_libclblas(error *); From bbd54a80f62b5f56e33d9aa1137c2ee87947bd4e Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 24 Mar 2017 16:12:48 -0400 Subject: [PATCH 260/597] Add error code for clblast. 
--- src/gpuarray_blas_opencl_clblast.c | 290 +++++++++++++---------------- src/loaders/libclblast.fn | 24 +-- src/loaders/libclblast.h | 30 ++- 3 files changed, 172 insertions(+), 172 deletions(-) diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c index c6fd010a3b..13fca18190 100644 --- a/src/gpuarray_blas_opencl_clblast.c +++ b/src/gpuarray_blas_opencl_clblast.c @@ -30,6 +30,76 @@ static inline Transpose convT(cb_transpose trans) { } } +static const char *estr(CLBlastStatusCode err) { + if (err > -1024) + return cl_error_string((cl_int)err); + switch (err) { + case CLBlastNotImplemented: + return "Unimplemented feature"; + case CLBlastInvalidMatrixA: + return "matrix A is not a valid memory object"; + case CLBlastInvalidMatrixB: + return "matrix B is not a valid memory object"; + case CLBlastInvalidMatrixC: + return "matrix C is not a valid memory object"; + case CLBlastInvalidVectorX: + return "vector X is not a valid memory object"; + case CLBlastInvalidVectorY: + return "vector Y is not a valid memory object"; + case CLBlastInvalidDimension: + return "An input dimension (M, N, K) is invalid"; + case CLBlastInvalidLeadDimA: + return "leading dimension for A must not be less than the size of the first dimension"; + case CLBlastInvalidLeadDimB: + return "leading dimension for B must not be less than the size of the second dimension"; + case CLBlastInvalidLeadDimC: + return "leading dimension for C must not be less than the size of the third dimension"; + case CLBlastInvalidIncrementX: + return "increment for X must not be 0"; + case CLBlastInvalidIncrementY: + return "increment for Y must not be 0"; + case CLBlastInsufficientMemoryA: + return "memory object for matrix A is too small"; + case CLBlastInsufficientMemoryB: + return "memory object for matrix B is too small"; + case CLBlastInsufficientMemoryC: + return "memory object for matrix C is too small"; + case CLBlastInsufficientMemoryX: + return "memory object for vector X is 
too small"; + case CLBlastInsufficientMemoryY: + return "memory object for vector Y is too small"; + case CLBlastInvalidLocalMemUsage: + return "not enough local memory on the device"; + case CLBlastNoHalfPrecision: + return "float16 is not supported on this device"; + case CLBlastNoDoublePrecision: + return "float64 is not supported on this device"; + case CLBlastInvalidVectorScalar: + return "unit-sized vector is not a valid memory object"; + case CLBlastInsufficientMemoryScalar: + return "memory object for unit-sized vector is too small"; + case CLBlastDatabaseError: + return "device entry not in database"; + case CLBlastUnknownError: + return "Unspecified error"; + case CLBlastUnexpectedError: + return "Unexpected error"; + default: + return "Unknow error"; + } +} + +static inline int error_clblast(error *e, const char *msg, + CLBlastStatusCode err) { + return error_fmt(e, GA_BLAS_ERROR, "%s: %s", msg, estr(err)); +} + +#define CLBT_CHECK(e, cmd) do { \ + CLBlastStatusCode err = (cmd); \ + if (err != kSuccess) \ + return error_clblast(e, #cmd, err); \ + } while (0) + static int setup(gpucontext *ctx) { return GA_NO_ERROR; } @@ -37,10 +107,6 @@ static int setup(gpucontext *ctx) { static void teardown(gpucontext *ctx) { } -static const char *error(gpucontext *ctx) { - return "(clblast) error in blas call, no details for now."; -} - #define ARRAY_INIT(A) \ if (A->ev != NULL) \ clWaitForEvents(1, &A->ev) @@ -60,17 +126,18 @@ static int hgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, cl_ctx *ctx = A[0]->ctx; cl_event ev; size_t i; - StatusCode err; for (i = 0; i < batchCount; i++) { ARRAY_INIT(A[i]); ARRAY_INIT(B[i]); ARRAY_INIT(C[i]); - err = CLBlastHgemm(convO(order), convT(transA), convT(transB), M, N, K, - float_to_half(alpha), A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, - float_to_half(beta), C[i]->buf, offC[i], ldc, &ctx->q, &ev); - if (err != kSuccess) - return GA_BLAS_ERROR; + CLBT_CHECK(ctx->err, CLBlastHgemm(convO(order), 
convT(transA), + convT(transB), M, N, K, + float_to_half(alpha), + A[i]->buf, offA[i], lda, + B[i]->buf, offB[i], ldb, + float_to_half(beta), + C[i]->buf, offC[i], ldc, &ctx->q, &ev)); ARRAY_FINI(A[i]); ARRAY_FINI(B[i]); ARRAY_FINI(C[i]); @@ -89,17 +156,16 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, cl_ctx *ctx = A[0]->ctx; cl_event ev; size_t i; - StatusCode err; for (i = 0; i < batchCount; i++) { ARRAY_INIT(A[i]); ARRAY_INIT(B[i]); ARRAY_INIT(C[i]); - err = CLBlastSgemm(convO(order), convT(transA), convT(transB), M, N, K, - alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, - beta, C[i]->buf, offC[i], ldc, &ctx->q, &ev); - if (err != kSuccess) - return GA_BLAS_ERROR; + CLBT_CHECK(ctx->err, CLBlastSgemm(convO(order), convT(transA), + convT(transB), M, N, K, + alpha, A[i]->buf, offA[i], lda, + B[i]->buf, offB[i], ldb, beta, + C[i]->buf, offC[i], ldc, &ctx->q, &ev)); ARRAY_FINI(A[i]); ARRAY_FINI(B[i]); ARRAY_FINI(C[i]); @@ -118,17 +184,16 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, cl_ctx *ctx = A[0]->ctx; cl_event ev; size_t i; - StatusCode err; for (i = 0; i < batchCount; i++) { ARRAY_INIT(A[i]); ARRAY_INIT(B[i]); ARRAY_INIT(C[i]); - err = CLBlastDgemm(convO(order), convT(transA), convT(transB), M, N, K, - alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, - beta, C[i]->buf, offC[i], ldc, &ctx->q, &ev); - if (err != kSuccess) - return GA_BLAS_ERROR; + CLBT_CHECK(ctx->err, CLBlastDgemm(convO(order), convT(transA), + convT(transB), M, N, K, + alpha, A[i]->buf, offA[i], lda, + B[i]->buf, offB[i], ldb, beta, + C[i]->buf, offC[i], ldc, &ctx->q, &ev)); ARRAY_FINI(A[i]); ARRAY_FINI(B[i]); ARRAY_FINI(C[i]); @@ -138,78 +203,20 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, return GA_NO_ERROR; } -static int hgemvBatch(cb_order order, cb_transpose transA, - size_t M, size_t N, float alpha, - gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t 
*offX, size_t incX, - float beta, gpudata **y, size_t *offY, size_t incY, - size_t batchCount, int flags) { - return GA_DEVSUP_ERROR; -} - -static int sgemvBatch(cb_order order, cb_transpose transA, - size_t M, size_t N, float alpha, - gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - float beta, gpudata **y, size_t *offY, size_t incY, - size_t batchCount, int flags) { - return GA_DEVSUP_ERROR; -} - -static int dgemvBatch(cb_order order, cb_transpose transA, - size_t M, size_t N, double alpha, - gpudata **A, size_t *offA, size_t lda, - gpudata **x, size_t *offX, size_t incX, - double beta, gpudata **y, size_t *offY, size_t incY, - size_t batchCount, int flags) { - return GA_DEVSUP_ERROR; -} - -static int hgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, - gpudata **A, size_t *offA, size_t lda, - size_t batchCount, int flags) { - return GA_DEVSUP_ERROR; -} - -static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, - gpudata **A, size_t *offA, size_t lda, - size_t batchCount, int flags) { - return GA_DEVSUP_ERROR; -} - -static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, - gpudata **x, size_t *offX, size_t incX, - gpudata **y, size_t *offY, size_t incY, - gpudata **A, size_t *offA, size_t lda, - size_t batchCount, int flags) { - return GA_DEVSUP_ERROR; -} - static int hdot( size_t N, gpudata *X, size_t offX, size_t incX, gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; - StatusCode err; cl_event ev; ARRAY_INIT(X); ARRAY_INIT(Y); ARRAY_INIT(Z); - err = CLBlastHdot( - N, - Z->buf, offZ, - X->buf, offX, incX, - Y->buf, offY, incY, - &ctx->q, &ev); - if (err != kSuccess) - return GA_BLAS_ERROR; + CLBT_CHECK(ctx->err, CLBlastHdot(N, Z->buf, offZ, X->buf, offX, incX, + Y->buf, offY, incY, &ctx->q, 
&ev)); ARRAY_FINI(X); ARRAY_FINI(Y); @@ -226,21 +233,14 @@ static int sdot( gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; - StatusCode err; cl_event ev; ARRAY_INIT(X); ARRAY_INIT(Y); ARRAY_INIT(Z); - err = CLBlastSdot( - N, - Z->buf, offZ, - X->buf, offX, incX, - Y->buf, offY, incY, - &ctx->q, &ev); - if (err != kSuccess) - return GA_BLAS_ERROR; + CLBT_CHECK(ctx->err, CLBlastSdot(N, Z->buf, offZ, X->buf, offX, incX, + Y->buf, offY, incY, &ctx->q, &ev)); ARRAY_FINI(X); ARRAY_FINI(Y); @@ -257,21 +257,14 @@ static int ddot( gpudata *Y, size_t offY, size_t incY, gpudata *Z, size_t offZ) { cl_ctx *ctx = X->ctx; - StatusCode err; cl_event ev; ARRAY_INIT(X); ARRAY_INIT(Y); ARRAY_INIT(Z); - err = CLBlastDdot( - N, - Z->buf, offZ, - X->buf, offX, incX, - Y->buf, offY, incY, - &ctx->q, &ev); - if (err != kSuccess) - return GA_BLAS_ERROR; + CLBT_CHECK(ctx->err, CLBlastDdot(N, Z->buf, offZ, X->buf, offX, incX, + Y->buf, offY, incY, &ctx->q, &ev)); ARRAY_FINI(X); ARRAY_FINI(Y); @@ -287,18 +280,17 @@ static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N, gpudata *X, size_t offX, int incX, float beta, gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; - StatusCode err; cl_event ev; ARRAY_INIT(A); ARRAY_INIT(X); ARRAY_INIT(Y); - err = CLBlastHgemv(convO(order), convT(transA), M, N, float_to_half(alpha), - A->buf, offA, lda, X->buf, offX, incX, - float_to_half(beta), Y->buf, offY, incY, &ctx->q, &ev); - if (err != kSuccess) - return GA_BLAS_ERROR; + CLBT_CHECK(ctx->err, CLBlastHgemv(convO(order), convT(transA), M, N, + float_to_half(alpha), + A->buf, offA, lda, X->buf, offX, incX, + float_to_half(beta), + Y->buf, offY, incY, &ctx->q, &ev)); ARRAY_FINI(A); ARRAY_FINI(X); @@ -314,18 +306,15 @@ static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N, gpudata *X, size_t offX, int incX, float beta, gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; - StatusCode err; cl_event ev; ARRAY_INIT(A); 
ARRAY_INIT(X); ARRAY_INIT(Y); - err = CLBlastSgemv(convO(order), convT(transA), M, N, alpha, - A->buf, offA, lda, X->buf, offX, incX, - beta, Y->buf, offY, incY, &ctx->q, &ev); - if (err != kSuccess) - return GA_BLAS_ERROR; + CLBT_CHECK(ctx->err, CLBlastSgemv(convO(order), convT(transA), M, N, alpha, + A->buf, offA, lda, X->buf, offX, incX, + beta, Y->buf, offY, incY, &ctx->q, &ev)); ARRAY_FINI(A); ARRAY_FINI(X); @@ -341,18 +330,15 @@ static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, gpudata *X, size_t offX, int incX, double beta, gpudata *Y, size_t offY, int incY) { cl_ctx *ctx = A->ctx; - StatusCode err; cl_event ev; ARRAY_INIT(A); ARRAY_INIT(X); ARRAY_INIT(Y); - err = CLBlastDgemv(convO(order), convT(transA), M, N, alpha, - A->buf, offA, lda, X->buf, offX, incX, - beta, Y->buf, offY, incY, &ctx->q, &ev); - if (err != kSuccess) - return GA_BLAS_ERROR; + CLBT_CHECK(ctx->err, CLBlastDgemv(convO(order), convT(transA), M, N, alpha, + A->buf, offA, lda, X->buf, offX, incX, + beta, Y->buf, offY, incY, &ctx->q, &ev)); ARRAY_FINI(A); ARRAY_FINI(X); @@ -369,18 +355,17 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, gpudata *B, size_t offB, size_t ldb, float beta, gpudata *C, size_t offC, size_t ldc) { cl_ctx *ctx = A->ctx; - StatusCode err; cl_event ev; ARRAY_INIT(A); ARRAY_INIT(B); ARRAY_INIT(C); - err = CLBlastHgemm(convO(order), convT(transA), convT(transB), M, N, K, - float_to_half(alpha), A->buf, offA, lda, B->buf, offB, ldb, - float_to_half(beta), C->buf, offC, ldc, &ctx->q, &ev); - if (err != kSuccess) - return GA_BLAS_ERROR; + CLBT_CHECK(ctx->err, CLBlastHgemm(convO(order), convT(transA), convT(transB), + M, N, K, float_to_half(alpha), + A->buf, offA, lda, B->buf, offB, ldb, + float_to_half(beta), C->buf, offC, ldc, + &ctx->q, &ev)); ARRAY_FINI(A); ARRAY_FINI(B); @@ -397,18 +382,16 @@ static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB, gpudata *B, size_t offB, size_t ldb, float beta, 
gpudata *C, size_t offC, size_t ldc) { cl_ctx *ctx = A->ctx; - StatusCode err; cl_event ev; ARRAY_INIT(A); ARRAY_INIT(B); ARRAY_INIT(C); - err = CLBlastSgemm(convO(order), convT(transA), convT(transB), M, N, K, - alpha, A->buf, offA, lda, B->buf, offB, ldb, - beta, C->buf, offC, ldc, &ctx->q, &ev); - if (err != kSuccess) - return GA_BLAS_ERROR; + CLBT_CHECK(ctx->err, CLBlastSgemm(convO(order), convT(transA), convT(transB), + M, N, K, alpha, + A->buf, offA, lda, B->buf, offB, ldb, + beta, C->buf, offC, ldc, &ctx->q, &ev)); ARRAY_FINI(A); ARRAY_FINI(B); @@ -425,18 +408,16 @@ static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, gpudata *B, size_t offB, size_t ldb, double beta, gpudata *C, size_t offC, size_t ldc) { cl_ctx *ctx = A->ctx; - StatusCode err; cl_event ev; ARRAY_INIT(A); ARRAY_INIT(B); ARRAY_INIT(C); - err = CLBlastDgemm(convO(order), convT(transA), convT(transB), M, N, K, - alpha, A->buf, offA, lda, B->buf, offB, ldb, - beta, C->buf, offC, ldc, &ctx->q, &ev); - if (err != kSuccess) - return GA_BLAS_ERROR; + CLBT_CHECK(ctx->err, CLBlastDgemm(convO(order), convT(transA), convT(transB), + M, N, K, alpha, + A->buf, offA, lda, B->buf, offB, ldb, + beta, C->buf, offC, ldc, &ctx->q, &ev)); ARRAY_FINI(A); ARRAY_FINI(B); @@ -453,16 +434,14 @@ static int hger(cb_order order, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event ev; - StatusCode err; ARRAY_INIT(X); ARRAY_INIT(Y); ARRAY_INIT(A); - err = CLBlastHger(convO(order), M, N, float_to_half(alpha), X->buf, offX, incX, - Y->buf, offY, incY, A->buf, offA, lda, &ctx->q, &ev); - if (err != kSuccess) - return GA_BLAS_ERROR; + CLBT_CHECK(ctx->err, CLBlastHger(convO(order), M, N, float_to_half(alpha), + X->buf, offX, incX, Y->buf, offY, incY, + A->buf, offA, lda, &ctx->q, &ev)); ARRAY_FINI(X); ARRAY_FINI(Y); @@ -479,16 +458,14 @@ static int sger(cb_order order, size_t M, size_t N, float alpha, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = 
X->ctx; cl_event ev; - StatusCode err; ARRAY_INIT(X); ARRAY_INIT(Y); ARRAY_INIT(A); - err = CLBlastSger(convO(order), M, N, alpha, X->buf, offX, incX, - Y->buf, offY, incY, A->buf, offA, lda, &ctx->q, &ev); - if (err != kSuccess) - return GA_BLAS_ERROR; + CLBT_CHECK(ctx->err, CLBlastSger(convO(order), M, N, alpha, + X->buf, offX, incX, Y->buf, offY, incY, + A->buf, offA, lda, &ctx->q, &ev)); ARRAY_FINI(X); ARRAY_FINI(Y); @@ -505,16 +482,14 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda) { cl_ctx *ctx = X->ctx; cl_event ev; - StatusCode err; ARRAY_INIT(X); ARRAY_INIT(Y); ARRAY_INIT(A); - err = CLBlastDger(convO(order), M, N, alpha, X->buf, offX, incX, - Y->buf, offY, incY, A->buf, offA, lda, &ctx->q, &ev); - if (err != kSuccess) - return GA_BLAS_ERROR; + CLBT_CHECK(ctx->err, CLBlastDger(convO(order), M, N, alpha, + X->buf, offX, incX, Y->buf, offY, incY, + A->buf, offA, lda, &ctx->q, &ev)); ARRAY_FINI(X); ARRAY_FINI(Y); @@ -528,7 +503,6 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, gpuarray_blas_ops clblast_ops = { setup, teardown, - error, hdot, sdot, ddot, @@ -544,10 +518,10 @@ gpuarray_blas_ops clblast_ops = { hgemmBatch, sgemmBatch, dgemmBatch, - hgemvBatch, /* TODO */ - sgemvBatch, /* TODO */ - dgemvBatch, /* TODO */ - hgerBatch, /* TODO */ - sgerBatch, /* TODO */ - dgerBatch, /* TODO */ + NULL, /* hgemvBatch */ + NULL, /* sgemvBatch */ + NULL, /* dgemvBatch */ + NULL, /* hgerBatch */ + NULL, /* sgerBatch */ + NULL, /* dgerBatch */ }; diff --git a/src/loaders/libclblast.fn b/src/loaders/libclblast.fn index 28f36ba20b..2eb029937b 100644 --- a/src/loaders/libclblast.fn +++ b/src/loaders/libclblast.fn @@ -1,12 +1,12 @@ -DEF_PROC(StatusCode, CLBlastHdot, (const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* 
event)); -DEF_PROC(StatusCode, CLBlastSdot, (const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)); -DEF_PROC(StatusCode, CLBlastDdot, (const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)); -DEF_PROC(StatusCode, CLBlastHgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_half alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_half beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); -DEF_PROC(StatusCode, CLBlastSgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); -DEF_PROC(StatusCode, CLBlastDgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); -DEF_PROC(StatusCode, CLBlastHgemv, (Layout order, Transpose transA, size_t M, size_t N, cl_half alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_half beta, cl_mem y, size_t offy, int incy, cl_command_queue *queue, cl_event *event)); -DEF_PROC(StatusCode, CLBlastSgemv, (Layout order, Transpose transA, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, 
cl_command_queue *queue, cl_event *event)); -DEF_PROC(StatusCode, CLBlastDgemv, (Layout order, Transpose transA, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_command_queue *queue, cl_event *events)); -DEF_PROC(StatusCode, CLBlastHger, (Layout order, size_t M, size_t N, cl_half alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_command_queue *queue, cl_event *event)); -DEF_PROC(StatusCode, CLBlastSger, (Layout order, size_t M, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_command_queue *queue, cl_event *event)); -DEF_PROC(StatusCode, CLBlastDger, (Layout order, size_t M, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_command_queue *queue, cl_event *event)); +DEF_PROC(CLBlastStatusCode, CLBlastHdot, (const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)); +DEF_PROC(CLBlastStatusCode, CLBlastSdot, (const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)); +DEF_PROC(CLBlastStatusCode, CLBlastDdot, (const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event)); +DEF_PROC(CLBlastStatusCode, CLBlastHgemm, (Layout order, Transpose transA, Transpose transB, size_t M, 
size_t N, size_t K, cl_half alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_half beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); +DEF_PROC(CLBlastStatusCode, CLBlastSgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); +DEF_PROC(CLBlastStatusCode, CLBlastDgemm, (Layout order, Transpose transA, Transpose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue *queue, cl_event *event)); +DEF_PROC(CLBlastStatusCode, CLBlastHgemv, (Layout order, Transpose transA, size_t M, size_t N, cl_half alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_half beta, cl_mem y, size_t offy, int incy, cl_command_queue *queue, cl_event *event)); +DEF_PROC(CLBlastStatusCode, CLBlastSgemv, (Layout order, Transpose transA, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_command_queue *queue, cl_event *event)); +DEF_PROC(CLBlastStatusCode, CLBlastDgemv, (Layout order, Transpose transA, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_command_queue *queue, cl_event *events)); +DEF_PROC(CLBlastStatusCode, CLBlastHger, (Layout order, size_t M, size_t N, cl_half alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_command_queue *queue, cl_event *event)); +DEF_PROC(CLBlastStatusCode, CLBlastSger, (Layout 
order, size_t M, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_command_queue *queue, cl_event *event)); +DEF_PROC(CLBlastStatusCode, CLBlastDger, (Layout order, size_t M, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_command_queue *queue, cl_event *event)); diff --git a/src/loaders/libclblast.h b/src/loaders/libclblast.h index 9b60d1e6a8..0608250631 100644 --- a/src/loaders/libclblast.h +++ b/src/loaders/libclblast.h @@ -15,10 +15,36 @@ typedef enum Transpose_ { kConjugate = 113 } Transpose; -typedef enum StatusCode_ { +typedef enum CLBLastStatusCode_ { kSuccess = 0, /* Rest is not exposed from here */ -} StatusCode; + CLBlastNotImplemented = -1024, + CLBlastInvalidMatrixA = -1022, + CLBlastInvalidMatrixB = -1021, + CLBlastInvalidMatrixC = -1020, + CLBlastInvalidVectorX = -1019, + CLBlastInvalidVectorY = -1018, + CLBlastInvalidDimension = -1017, + CLBlastInvalidLeadDimA = -1016, + CLBlastInvalidLeadDimB = -1015, + CLBlastInvalidLeadDimC = -1014, + CLBlastInvalidIncrementX = -1013, + CLBlastInvalidIncrementY = -1012, + CLBlastInsufficientMemoryA = -1011, + CLBlastInsufficientMemoryB = -1010, + CLBlastInsufficientMemoryC = -1009, + CLBlastInsufficientMemoryX = -1008, + CLBlastInsufficientMemoryY = -1007, + + CLBlastInvalidLocalMemUsage = -2046, + CLBlastNoHalfPrecision = -2045, + CLBlastNoDoublePrecision = -2044, + CLBlastInvalidVectorScalar = -2043, + CLBlastInsufficientMemoryScalar = -2042, + CLBlastDatabaseError = -2041, + CLBlastUnknownError = -2040, + CLBlastUnexpectedError = -2039, +} CLBlastStatusCode; int load_libclblast(error *); From b945b31d985c80a8fc770725f39ce5c291e5b76a Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 24 Mar 2017 16:20:20 -0400 Subject: [PATCH 261/597] Fix the last few compile mishaps. 
--- src/loaders/dyn_load.c | 2 +- src/loaders/dyn_load.h | 6 ++++-- src/loaders/libcublas.c | 2 +- src/loaders/libcuda.c | 4 ++-- src/util/error.c | 5 +++-- 5 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/loaders/dyn_load.c b/src/loaders/dyn_load.c index a8fbd22ea1..6fedaa8520 100644 --- a/src/loaders/dyn_load.c +++ b/src/loaders/dyn_load.c @@ -16,7 +16,7 @@ void *ga_load_library(const char *name, error *e) { return res; } -void *ga_func_ptr(void *h, const char *name) { +void *ga_func_ptr(void *h, const char *name, error *e) { void *res = dlsym(h, name); if (res == NULL) error_fmt(e, GA_LOAD_ERROR, "Could not find synbol \"%s\": %s", name, dlerror()); diff --git a/src/loaders/dyn_load.h b/src/loaders/dyn_load.h index 73fea5d69f..bc62ebf2a6 100644 --- a/src/loaders/dyn_load.h +++ b/src/loaders/dyn_load.h @@ -1,7 +1,9 @@ #ifndef UTIL_DYN_LOAD_H #define UTIL_DYN_LOAD_H -void *ga_load_library(const char *name); -void *ga_func_ptr(void *h, const char *name); +#include "util/error.h" + +void *ga_load_library(const char *name, error *e); +void *ga_func_ptr(void *h, const char *name, error *e); #endif diff --git a/src/loaders/libcublas.c b/src/loaders/libcublas.c index dc4b253ea0..57c8c0e295 100644 --- a/src/loaders/libcublas.c +++ b/src/loaders/libcublas.c @@ -23,7 +23,7 @@ } #define DEF_PROC_OPT(name, args) \ - name = (t##name *)ga_func_ptr(lib, #name); + name = (t##name *)ga_func_ptr(lib, #name, e); #define DEF_PROC_V2(name, args) \ name = (t##name *)ga_func_ptr(lib, STRINGIFY(name##_v2), e); \ diff --git a/src/loaders/libcuda.c b/src/loaders/libcuda.c index 9f49ad9bc0..729c832a62 100644 --- a/src/loaders/libcuda.c +++ b/src/loaders/libcuda.c @@ -47,9 +47,9 @@ int load_libcuda(error *e) { if (loaded) return GA_NO_ERROR; - lib = ga_load_library(libname); + lib = ga_load_library(libname, e); if (lib == NULL) - return error_fmt(e, GA_LOAD_ERROR, "Could not load \"%s\"", libname); + return e->code; #include "libcuda.fn" diff --git a/src/util/error.c 
b/src/util/error.c index 263768fa30..f7fe244293 100644 --- a/src/util/error.c +++ b/src/util/error.c @@ -1,4 +1,5 @@ #include +#include #include #include "error.h" @@ -18,14 +19,14 @@ void error_free(error *e) { free(e); } -int error_setall(error *e, int code, const char *msg) { +int error_set(error *e, int code, const char *msg) { e->code = code; strlcpy(e->msg, msg, ERROR_MSGBUF_LEN); return code; } int error_fmt(error *e, int code, const char *fmt, ...) { - va_arg ap; + va_list ap; e->code = code; va_start(ap, fmt); From bf560d0cd369c2ccd06e0084ccb13ff8af415259 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 24 Mar 2017 16:28:24 -0400 Subject: [PATCH 262/597] Initialize error storage for opencl. --- src/gpuarray_buffer_opencl.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 618f2a5888..13adaf7a44 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -148,7 +148,12 @@ cl_ctx *cl_make_ctx(cl_context ctx, int flags) { res->ctx = ctx; res->ops = &opencl_ops; - res->err = CL_SUCCESS; + if (error_alloc(&res->err)) { + error_set(global_err, GA_SYS_ERROR, "Could not create error context"); + free(res); + return NULL; + } + res->refcnt = 1; res->exts = NULL; res->blas_handle = NULL; @@ -158,8 +163,9 @@ cl_ctx *cl_make_ctx(cl_context ctx, int flags) { ISSET(flags, GA_CTX_SINGLE_STREAM) ? 
0 : qprop&CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err); if (res->q == NULL) { - free(res); error_cl(global_err, "clCreateCommandQueue", err); + error_free(res->err); + free(res); return NULL; } @@ -221,6 +227,7 @@ static void cl_free_ctx(cl_ctx *ctx) { clReleaseContext(ctx->ctx); if (ctx->preamble != NULL) free(ctx->preamble); + error_free(ctx->err); CLEAR(ctx); free(ctx); } From 6332595d8967786b7ede9cac8556a450d82b8624 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 24 Mar 2017 16:34:15 -0400 Subject: [PATCH 263/597] Make sure to actually pass along messages. --- src/gpuarray_buffer.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index e070f29fc0..b457fb3dcc 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -67,17 +67,10 @@ int gpucontext_property(gpucontext *ctx, int prop_id, void *res) { } const char *gpucontext_error(gpucontext *ctx, int err) { - if (ctx != NULL) { - switch (err) { - case GA_IMPL_ERROR: - return ctx->ops->ctx_error(ctx); - case GA_BLAS_ERROR: - return gpublas_error(ctx); - case GA_COMM_ERROR: - return gpucomm_error(ctx); - } - } - return gpuarray_error_str(err); + if (ctx == NULL) + return global_err->msg; + else + return ctx->ops->ctx_error(ctx); } gpudata *gpudata_alloc(gpucontext *ctx, size_t sz, void *data, int flags, From 8866d95ab7b03fd12f3783d3554bfe64ec018a74 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 24 Mar 2017 17:02:38 -0400 Subject: [PATCH 264/597] Trailing mistakes. 
--- src/gpuarray_buffer_cuda.c | 2 +- src/gpuarray_buffer_opencl.c | 4 ++-- src/util/error.c | 6 ++++++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index b5c9599a2f..6522897cc2 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1152,7 +1152,7 @@ static int make_bin(cuda_context *ctx, const strb *ptx, strb *bin, strb *log) { err = cuLinkCreate(sizeof(cujit_opts)/sizeof(cujit_opts[0]), cujit_opts, cujit_opt_vals, &st); - if (ctx->err != CUDA_SUCCESS) + if (err != CUDA_SUCCESS) return error_cuda(ctx->err, "cuLinkCreate", err); err = cuLinkAddData(st, CU_JIT_INPUT_PTX, ptx->s, ptx->l, "kernel code", 0, NULL, NULL); diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 13adaf7a44..812d796920 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -896,7 +896,7 @@ static gpukernel *cl_newkernel(gpucontext *c, unsigned int count, } p = clCreateProgramWithSource(ctx->ctx, count+n, news, newl, &err); - if (ctx->err != CL_SUCCESS) { + if (err != CL_SUCCESS) { if (n != 0) { free(news); free(newl); @@ -1135,7 +1135,7 @@ static int cl_kernelbin(gpukernel *k, size_t *sz, void **obj) { if (res == NULL) return error_sys(ctx->err, "malloc"); err = clGetProgramInfo(p, CL_PROGRAM_BINARIES, sizeof(res), &res, NULL); - if (ctx->err != CL_SUCCESS) { + if (err != CL_SUCCESS) { free(res); return error_cl(ctx->err, "clProgramGetInfo", err); } diff --git a/src/util/error.c b/src/util/error.c index f7fe244293..fdeb3a3a68 100644 --- a/src/util/error.c +++ b/src/util/error.c @@ -22,6 +22,9 @@ void error_free(error *e) { int error_set(error *e, int code, const char *msg) { e->code = code; strlcpy(e->msg, msg, ERROR_MSGBUF_LEN); +#ifdef DEBUG + fprintf(stderr, "ERROR %d: %s\n", e->code, e->msg); +#endif return code; } @@ -32,5 +35,8 @@ int error_fmt(error *e, int code, const char *fmt, ...) 
{ va_start(ap, fmt); vsnprintf(e->msg, ERROR_MSGBUF_LEN, fmt, ap); va_end(ap); +#ifdef DEBUG + fprintf(stderr, "ERROR %d: %s\n", e->code, e->msg); +#endif return code; } From 445258580d98b53b136ba3a611e4c0c57a12fd5f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 10 Apr 2017 15:06:52 -0400 Subject: [PATCH 265/597] Add missing include. --- src/util/error.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/util/error.c b/src/util/error.c index fdeb3a3a68..420a9a6924 100644 --- a/src/util/error.c +++ b/src/util/error.c @@ -2,7 +2,8 @@ #include #include -#include "error.h" +#include "private_config.h" +#include "util/error.h" static error _global_err = {}; error *global_err = &_global_err; From 6c5611e0b56ed4fbf7ab666bc1fc1e3257ceded2 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 10 Apr 2017 16:30:28 -0400 Subject: [PATCH 266/597] Add missing brackets around error condition. --- src/gpuarray_buffer_cuda.c | 90 +++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 44 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 6522897cc2..db3dce7345 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -235,9 +235,10 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { (cache_hash_fn)strb_hash, (cache_freek_fn)strb_free, (cache_freev_fn)cuda_freekernel, global_err); - if (res->kernel_cache == NULL) + if (res->kernel_cache == NULL) { error_cuda(global_err, "cuStreamCreate", err); goto fail_cache; + } cache_path = getenv("GPUARRAY_CACHE_PATH"); if (cache_path != NULL) { @@ -246,10 +247,10 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { (cache_hash_fn)key_hash, (cache_freek_fn)key_free, (cache_freev_fn)strb_free, - res->err); + global_err); if (mem_cache == NULL) { fprintf(stderr, "Error initializing mem cache for disk: %s\n", - res->err->msg); + global_err->msg); goto fail_disk_cache; } res->disk_cache = cache_disk(cache_path, mem_cache, @@ 
-489,50 +490,51 @@ static const char CUDA_PREAMBLE[] = /* XXX: add vector types */ static cuda_context *do_init(CUdevice dev, int flags, error *e) { - cuda_context *res; - CUcontext ctx; - CUresult err; - unsigned int fl = CU_CTX_SCHED_AUTO; - unsigned int cur_fl; - int act; - int i; - - if (flags & GA_CTX_SINGLE_THREAD) - fl = CU_CTX_SCHED_SPIN; - if (flags & GA_CTX_MULTI_THREAD) - fl = CU_CTX_SCHED_YIELD; - err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev); - CHKFAIL(e, "cuDeviceGetAttribute", NULL); - if (i != 1) { - error_set(e, GA_UNSUPPORTED_ERROR, "device does not support unified addressing"); - return NULL; - } - err = cuDevicePrimaryCtxGetState(dev, &cur_fl, &act); - CHKFAIL(e, "cuDevicePrimaryCtxGetState", NULL); - if (act == 1) { - if ((cur_fl & fl) != fl) { - error_set(e, GA_INVALID_ERROR, "device is already active and has unsupported flags"); - return NULL; - } - } else { - err = cuDevicePrimaryCtxSetFlags(dev, fl); - CHKFAIL(e, "cuDevicePrimaryCtxSetFlags", NULL); - } - err = cuDevicePrimaryCtxRetain(&ctx, dev); - CHKFAIL(e, "cuDevicePrimaryCtxRetain", NULL); - err = cuCtxPushCurrent(ctx); - CHKFAIL(e, "cuCtxPushCurrent", NULL); - res = cuda_make_ctx(ctx, flags); - if (res == NULL) { - cuDevicePrimaryCtxRelease(dev); - if (e != global_err) - error_set(e, global_err->code, global_err->msg); + cuda_context *res; + CUcontext ctx; + CUresult err; + unsigned int fl = CU_CTX_SCHED_AUTO; + unsigned int cur_fl; + int act; + int i; + + if (flags & GA_CTX_SINGLE_THREAD) + fl = CU_CTX_SCHED_SPIN; + if (flags & GA_CTX_MULTI_THREAD) + fl = CU_CTX_SCHED_YIELD; + err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev); + CHKFAIL(e, "cuDeviceGetAttribute", NULL); + if (i != 1) { + error_set(e, GA_UNSUPPORTED_ERROR, "device does not support unified addressing"); + return NULL; + } + err = cuDevicePrimaryCtxGetState(dev, &cur_fl, &act); + CHKFAIL(e, "cuDevicePrimaryCtxGetState", NULL); + if (act == 1) { + if ((cur_fl & fl) 
!= fl) { + error_set(e, GA_INVALID_ERROR, "device is already active and has unsupported flags"); return NULL; } - /* Don't leave the context on the thread stack */ - cuCtxPopCurrent(NULL); + } else { + err = cuDevicePrimaryCtxSetFlags(dev, fl); + CHKFAIL(e, "cuDevicePrimaryCtxSetFlags", NULL); + } + err = cuDevicePrimaryCtxRetain(&ctx, dev); + CHKFAIL(e, "cuDevicePrimaryCtxRetain", NULL); + err = cuCtxPushCurrent(ctx); + CHKFAIL(e, "cuCtxPushCurrent", NULL); + res = cuda_make_ctx(ctx, flags); + if (res == NULL) { + fprintf(stderr, "res failed\n"); + cuDevicePrimaryCtxRelease(dev); + if (e != global_err) + error_set(e, global_err->code, global_err->msg); + return NULL; + } + /* Don't leave the context on the thread stack */ + cuCtxPopCurrent(NULL); - return res; + return res; } static gpucontext *cuda_init(int ord, int flags) { From 551eca7098de8564c814a6127ae7bb39ab94390f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 11 Apr 2017 12:07:09 -0400 Subject: [PATCH 267/597] Fix 32 bit compilation. --- src/private.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/private.h b/src/private.h index 820ebb6287..69ce31404f 100644 --- a/src/private.h +++ b/src/private.h @@ -55,7 +55,7 @@ typedef struct _gpuarray_comm_ops gpuarray_comm_ops; struct _gpucontext { GPUCONTEXT_HEAD; void *ctx_ptr; - void *private[7]; + void *private[11]; }; /* The real gpudata struct is likely bigger but we only care about the From 211a73c414d40669fc4fcce018ae61dd37fd8f75 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 11 Apr 2017 12:08:49 -0400 Subject: [PATCH 268/597] Changes for release 0.6.4 --- doc/conf.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index 00083fdc09..cee40b416e 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -51,7 +51,7 @@ # The short X.Y version. version = '0.6' # The full version, including alpha/beta/rc tags. 
-release = '0.6.3' +release = '0.6.4' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/setup.py b/setup.py index 20f0c95921..d05fa2da9f 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ MAJOR = 0 MINOR = 6 -PATCH = 3 +PATCH = 4 SUFFIX = '' # include the '.' FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX) From 7322a715e011c43bfaaf876a3bb8056e2ec40a3a Mon Sep 17 00:00:00 2001 From: Pascal Lamblin Date: Tue, 18 Apr 2017 18:59:04 -0400 Subject: [PATCH 269/597] Fix length of ptx when compilation succeeded --- src/gpuarray_buffer_cuda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index db3dce7345..466d627a32 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1116,10 +1116,10 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { if (strb_ensure(ptx, buflen) == 0) { err = nvrtcGetPTX(prog, ptx->s+ptx->l); if (err != NVRTC_SUCCESS) { - ptx->l += buflen; nvrtcDestroyProgram(&prog); return error_nvrtc(ctx->err, "nvrtcGetPTX", err); } + ptx->l += buflen; } return GA_NO_ERROR; From 024e4e1e8efdd0cf9c93946a7e171682f33aad75 Mon Sep 17 00:00:00 2001 From: Pascal Lamblin Date: Tue, 18 Apr 2017 18:59:55 -0400 Subject: [PATCH 270/597] Update tests where needed --- tests/check_buffer.c | 10 ---------- tests/check_buffer_collectives.c | 10 +++++----- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/tests/check_buffer.c b/tests/check_buffer.c index b091a718d6..859b1f0568 100644 --- a/tests/check_buffer.c +++ b/tests/check_buffer.c @@ -10,15 +10,6 @@ extern void *ctx; void setup(void); void teardown(void); -START_TEST(test_gpu_error) { - const char *msg; - msg = gpucontext_error(NULL, -1); - msg = gpucontext_error(NULL, 99); - msg = gpucontext_error(NULL, GA_NO_ERROR); - ck_assert_str_eq(msg, "No error"); -} -END_TEST - static unsigned int refcnt(gpudata 
*b) { unsigned int res; int err; @@ -189,7 +180,6 @@ Suite *get_suite(void) { Suite *s = suite_create("buffer"); TCase *tc = tcase_create("API"); tcase_add_checked_fixture(tc, setup, teardown); - tcase_add_test(tc, test_gpu_error); tcase_add_test(tc, test_buffer_alloc); tcase_add_test(tc, test_buffer_retain_release); tcase_add_test(tc, test_buffer_share); diff --git a/tests/check_buffer_collectives.c b/tests/check_buffer_collectives.c index 806e2e724e..76f4e1bdb3 100644 --- a/tests/check_buffer_collectives.c +++ b/tests/check_buffer_collectives.c @@ -193,7 +193,7 @@ TEST_REDUCE_FAIL(optype, SIZE / sizeof(int), GA_INT, -1, 0, GA_INVALID_ERROR) TEST_REDUCE_FAIL(src_offset, SIZE / sizeof(int), GA_INT, GA_SUM, SIZE - sizeof(int), GA_VALUE_ERROR) TEST_REDUCE_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, GA_SUM, 0, - GA_UNSUPPORTED_ERROR) + GA_XLARGE_ERROR) #define TEST_ALL_REDUCE(systype, gatype, mpitype, coloptype, epsilon, print) \ START_TEST(test_gpucomm_all_reduce_##gatype##_##coloptype) { \ @@ -289,7 +289,7 @@ TEST_ALL_REDUCE_FAIL(src_offset, SIZE / sizeof(int), GA_INT, GA_SUM, TEST_ALL_REDUCE_FAIL(dest_offset, SIZE / sizeof(int), GA_INT, GA_SUM, 0, SIZE - sizeof(int), GA_VALUE_ERROR) TEST_ALL_REDUCE_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, GA_SUM, 0, 0, - GA_UNSUPPORTED_ERROR) + GA_XLARGE_ERROR) #define TEST_REDUCE_SCATTER(systype, gatype, mpitype, coloptype, epsilon, \ print) \ @@ -392,7 +392,7 @@ TEST_REDUCE_SCATTER_FAIL(src_offset, outcount, GA_INT, GA_SUM, TEST_REDUCE_SCATTER_FAIL(dest_offset, outcount, GA_INT, GA_SUM, 0, SIZE / comm_ndev - sizeof(int), GA_VALUE_ERROR) TEST_REDUCE_SCATTER_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, GA_SUM, 0, 0, - GA_UNSUPPORTED_ERROR) + GA_XLARGE_ERROR) #define TEST_BROADCAST(systype, gatype, mpitype, epsilon, print) \ START_TEST(test_gpucomm_broadcast_##gatype) { \ @@ -459,7 +459,7 @@ TEST_BROADCAST_FAIL(datatype, SIZE / sizeof(int), -1, 0, GA_INVALID_ERROR) TEST_BROADCAST_FAIL(src_offset, SIZE / sizeof(int), GA_INT, 
SIZE - sizeof(int), GA_VALUE_ERROR) TEST_BROADCAST_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, 0, - GA_UNSUPPORTED_ERROR) + GA_XLARGE_ERROR) #define TEST_ALL_GATHER(systype, gatype, mpitype, epsilon, print) \ START_TEST(test_gpucomm_all_gather_##gatype) { \ @@ -533,7 +533,7 @@ TEST_ALL_GATHER_FAIL(src_offset, incount, GA_INT, TEST_ALL_GATHER_FAIL(dest_offset, incount, GA_INT, 0, SIZE - sizeof(int), GA_VALUE_ERROR) TEST_ALL_GATHER_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, 0, 0, - GA_UNSUPPORTED_ERROR) + GA_XLARGE_ERROR) Suite* get_suite(void) { Suite* s = suite_create("buffer_collectives_API"); From 69c2c449b018e8088bc54d9810fa9d892dbcd11a Mon Sep 17 00:00:00 2001 From: Pascal Lamblin Date: Tue, 18 Apr 2017 20:18:28 -0400 Subject: [PATCH 271/597] Add back detection of binary case, so we recompile For Theano versions before the fix, so that we do not try to make nvrtc compile binary. --- src/gpuarray_buffer_cuda.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 466d627a32..a9013a711c 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1292,6 +1292,11 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, return NULL; } + if (flags & GA_USE_BINARY) { + error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Binary mode not supported any more"); + return NULL; + } + cuda_enter(ctx); err = cuCtxGetDevice(&dev); From 17b751aaeff7427c57f4a723d2632138b8feb217 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 19 Apr 2017 09:54:06 -0400 Subject: [PATCH 272/597] Have the right error message appear when there is no blas library --- src/gpuarray_buffer_blas.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c index eb087c707f..595ac8d5a4 100644 --- a/src/gpuarray_buffer_blas.c +++ b/src/gpuarray_buffer_blas.c @@ -4,7 +4,7 @@ int gpublas_setup(gpucontext *ctx) { if (ctx->blas_ops == NULL) - return 
GA_UNSUPPORTED_ERROR; + return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Missing Blas library"); return ctx->blas_ops->setup(ctx); } From 4a0381c99da110e071a15905305cd57e59dee8a8 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 19 Apr 2017 09:54:21 -0400 Subject: [PATCH 273/597] Remove debuging line --- src/gpuarray_buffer_cuda.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index a9013a711c..7adfa9a40b 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -525,7 +525,6 @@ static cuda_context *do_init(CUdevice dev, int flags, error *e) { CHKFAIL(e, "cuCtxPushCurrent", NULL); res = cuda_make_ctx(ctx, flags); if (res == NULL) { - fprintf(stderr, "res failed\n"); cuDevicePrimaryCtxRelease(dev); if (e != global_err) error_set(e, global_err->code, global_err->msg); From bb248745f0ae91a3781dedd164fbc96d8c93d73e Mon Sep 17 00:00:00 2001 From: Christos Tsirigotis Date: Thu, 20 Apr 2017 18:23:06 +0300 Subject: [PATCH 274/597] Convert GA_FLOAT16 to ncclHalf in cuda collectives --- src/gpuarray_collectives_cuda_nccl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/gpuarray_collectives_cuda_nccl.c b/src/gpuarray_collectives_cuda_nccl.c index dc5a901ab7..bd5cb3963b 100644 --- a/src/gpuarray_collectives_cuda_nccl.c +++ b/src/gpuarray_collectives_cuda_nccl.c @@ -175,11 +175,14 @@ static inline ncclDataType_t convert_data_type(int typecode) { switch (typecode) { case GA_BYTE: return ncclChar; case GA_INT: return ncclInt; - case GA_HALF: return ncclHalf; case GA_FLOAT: return ncclFloat; case GA_DOUBLE: return ncclDouble; case GA_LONG: return ncclInt64; case GA_ULONG: return ncclUint64; + #ifdef CUDA_HAS_HALF + case GA_HALF: return ncclHalf; + case GA_FLOAT16: return ncclHalf; + #endif } return nccl_NUM_TYPES; } From 0e361e738ff859806956cd976bb0f7cddcf24b27 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Thu, 20 Apr 2017 12:20:38 -0400 Subject: [PATCH 275/597] Try 
to fix compilation errors on Windows/Python 2.7/64 bits. --- .gitignore | 1 + src/cache/disk.c | 4 ++-- src/gpuarray_elemwise.c | 16 ++++++++-------- src/loaders/dyn_load.c | 6 +++--- src/util/error.c | 2 +- src/util/error.h | 7 +++++++ 6 files changed, 22 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index 164503e2ae..ec7eecef3f 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ lib .*.sw[po] *~ *.pyc +*.pyd *.pyo *.egg-info MANIFEST diff --git a/src/cache/disk.c b/src/cache/disk.c index 7297dade4d..6fda751b30 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -335,7 +335,7 @@ static int find_entry(disk_cache *c, const cache_key_t key, b.l = vl; *_v = c->vread(&b); if (*_v == NULL) - goto error; + goto error_find_entry; } if (_k) *_k = k; @@ -345,7 +345,7 @@ static int find_entry(disk_cache *c, const cache_key_t key, strb_clear(&b); return 1; } - error: + error_find_entry: if (k) c->c.kfree(k); b.s = ts; diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index 1d93e5a155..840c190473 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -387,11 +387,11 @@ static int call_basic(GpuElemwise *ge, void **args, size_t n, unsigned int nd, } err = GpuKernel_setarg(k, p++, &n); - if (err != GA_NO_ERROR) goto error; + if (err != GA_NO_ERROR) goto error_call_basic; for (i = 0; i < nd; i++) { err = GpuKernel_setarg(k, p++, &dims[i]); - if (err != GA_NO_ERROR) goto error; + if (err != GA_NO_ERROR) goto error_call_basic; } /* l is the number of arrays to date */ @@ -400,25 +400,25 @@ static int call_basic(GpuElemwise *ge, void **args, size_t n, unsigned int nd, if (is_array(ge->args[j])) { GpuArray *v = (GpuArray *)args[j]; err = GpuKernel_setarg(k, p++, v->data); - if (err != GA_NO_ERROR) goto error; + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k, p++, &v->offset); - if (err != GA_NO_ERROR) goto error; + if (err != GA_NO_ERROR) goto error_call_basic; for (i = 0; i < nd; i++) { err = 
GpuKernel_setarg(k, p++, &strs[l][i]); - if (err != GA_NO_ERROR) goto error; + if (err != GA_NO_ERROR) goto error_call_basic; } l++; } else { err = GpuKernel_setarg(k, p++, args[j]); - if (err != GA_NO_ERROR) goto error; + if (err != GA_NO_ERROR) goto error_call_basic; } } err = GpuKernel_sched(k, n, &gs, &ls); - if (err != GA_NO_ERROR) goto error; + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_call(k, 1, &gs, &ls, 0, NULL); - error: + error_call_basic: return err; } diff --git a/src/loaders/dyn_load.c b/src/loaders/dyn_load.c index 6fedaa8520..2ea2f331d7 100644 --- a/src/loaders/dyn_load.c +++ b/src/loaders/dyn_load.c @@ -28,7 +28,7 @@ void *ga_func_ptr(void *h, const char *name, error *e) { /* Should be windows */ #include -static inline void error_win(error *e) { +static inline void error_win(const char* name, error *e) { char msgbuf[512]; DWORD err = GetLastError(); DWORD len = FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM| @@ -43,14 +43,14 @@ static inline void error_win(error *e) { void *ga_load_library(const char *name, error *e) { void *res = LoadLibrary(name); if (res == NULL) - error_win(e); + error_win(name, e); return res; } void *ga_func_ptr(void *h, const char *name, error *e) { void *res = (void *)GetProcAddress(h, name); if (res == NULL) - error_win(e); + error_win(name, e); return res; } diff --git a/src/util/error.c b/src/util/error.c index 420a9a6924..19ce184363 100644 --- a/src/util/error.c +++ b/src/util/error.c @@ -5,7 +5,7 @@ #include "private_config.h" #include "util/error.h" -static error _global_err = {}; +static error _global_err = {{0}, 0}; error *global_err = &_global_err; int error_alloc(error **_e) { diff --git a/src/util/error.h b/src/util/error.h index fc1ecb1663..b7a50fc6a8 100644 --- a/src/util/error.h +++ b/src/util/error.h @@ -6,6 +6,13 @@ #include +/* MSVC 2008 does not support "inline". 
*/ +#ifdef _MSC_VER +#ifndef inline +#define inline __inline +#endif +#endif + /* 1024 - 4 for the int that goes after */ #define ERROR_MSGBUF_LEN 1020 From 7e62305c7969bcdbafdddce104a4a3e1fc8b156d Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 20 Apr 2017 14:33:02 -0400 Subject: [PATCH 276/597] Tell that people can use conda to install libgpuarray and pygpu. --- doc/installation.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/doc/installation.rst b/doc/installation.rst index a9e0bb5ffd..e348ed0fa2 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -13,6 +13,17 @@ It should also work on any decently recent OS not listed here. If you get an error during the build on your favorite OS, please report it and we will attempt to fix it. +Conda +----- + +The easiest way to install libgpuarray is with conda:: + + conda install pygpu + +This will also install the libgpuarray package automatically. + +This should work on Linux, Mac OS and Windows. + Build Requirements ------------------ From 9328590222b2f2760f51f561a9e9d294cfc81a5e Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 20 Apr 2017 16:37:24 -0400 Subject: [PATCH 277/597] Changes for release 0.6.5 --- doc/conf.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index cee40b416e..0ff1a1b1a8 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -51,7 +51,7 @@ # The short X.Y version. version = '0.6' # The full version, including alpha/beta/rc tags. -release = '0.6.4' +release = '0.6.5' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/setup.py b/setup.py index d05fa2da9f..dd059fcb16 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ MAJOR = 0 MINOR = 6 -PATCH = 4 +PATCH = 5 SUFFIX = '' # include the '.' 
FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX) From aedaf8592ba9001d2eed5454d54cb14acd81e682 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 20 Apr 2017 17:00:02 -0400 Subject: [PATCH 278/597] update the how to release notes --- release.txt | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/release.txt b/release.txt index 7daf509762..72e850cccc 100644 --- a/release.txt +++ b/release.txt @@ -3,6 +3,11 @@ Release process: - Update the version in setup.py - Update the version in doc/conf.py - Commit the changes with message "Changes for release X.Y.Z" -- Push to master -- Add a release on github with a tag in the form of 'vX.Y.X' + git commit -m "Changes for release X.Y.Z" +- Make a git tag + git tag vX.Y.Z +- Push to master the commit and the tag + git push --tags central master +- Add a release on github with a tag in the form of 'vX.Y.Z' + https://github.com/Theano/libgpuarray/releases/new - Make note of the major changes since the last release From 8214595b24a22959ecdaddefac0d05fb7c9bdce0 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 21 Feb 2017 15:59:43 -0500 Subject: [PATCH 279/597] Remove deprecated options and set EXTRACT_ALL so that we can have links in XML. --- doc/Doxyfile | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/doc/Doxyfile b/doc/Doxyfile index 33e1ad541a..06b6dd84d4 100644 --- a/doc/Doxyfile +++ b/doc/Doxyfile @@ -329,22 +329,6 @@ INLINE_SIMPLE_STRUCTS = NO TYPEDEF_HIDES_STRUCT = YES -# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to -# determine which symbols to keep in memory and which to flush to disk. -# When the cache is full, less often used symbols will be written to disk. -# For small to medium size projects (<1000 input files) the default value is -# probably good enough. 
For larger projects a too small cache size can cause -# doxygen to be busy swapping symbols to and from disk most of the time -# causing a significant performance penalty. -# If the system has enough physical memory increasing the cache will improve the -# performance by keeping more symbols in memory. Note that the value works on -# a logarithmic scale so increasing the size by one will roughly double the -# memory usage. The cache size is given by this formula: -# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, -# corresponding to a cache size of 2^16 = 65536 symbols. - -SYMBOL_CACHE_SIZE = 0 - # Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be # set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given # their name and scope. Since this can be an expensive process and often the @@ -365,7 +349,7 @@ LOOKUP_CACHE_SIZE = 0 # Private class members and static file members will be hidden unless # the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES -EXTRACT_ALL = NO +EXTRACT_ALL = YES # If the EXTRACT_PRIVATE tag is set to YES all private members of a class # will be included in the documentation. @@ -1471,18 +1455,6 @@ GENERATE_XML = YES XML_OUTPUT = xml -# The XML_SCHEMA tag can be used to specify an XML schema, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_SCHEMA = - -# The XML_DTD tag can be used to specify an XML DTD, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_DTD = - # If the XML_PROGRAMLISTING tag is set to YES Doxygen will # dump the program listings (including syntax highlighting # and cross-referencing information) to the XML output. Note that From f1c4e0eece009794d0dd2f7182dacc5f6b3af434 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 21 Feb 2017 16:11:29 -0500 Subject: [PATCH 280/597] Make documentation formatting in buffer_collectives.h less fugly. 
--- src/gpuarray/array.h | 2 +- src/gpuarray/buffer.h | 2 + src/gpuarray/buffer_collectives.h | 221 ++++++++++++++++-------------- 3 files changed, 120 insertions(+), 105 deletions(-) diff --git a/src/gpuarray/array.h b/src/gpuarray/array.h index 271b8a1d7a..a7aa899559 100644 --- a/src/gpuarray/array.h +++ b/src/gpuarray/array.h @@ -537,7 +537,7 @@ GPUARRAY_PUBLIC int GpuArray_copy(GpuArray *res, const GpuArray *a, * Source and target arrays must be contiguous. This restriction may * be lifted in the future. * - * \param r result array + * \param res result array * \param a array to transfer * * \return GA_NO_ERROR if the operation was succesful. diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index 0a4c921808..df00096cd1 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -422,6 +422,8 @@ GPUARRAY_PUBLIC gpucontext *gpudata_context(gpudata *b); * \param strings table of string pointers * \param lengths (optional) length for each string in the table * \param fname name of the kernel function (as defined in the code) + * \param numargs number of kernel arguments + * \param typecodes the type of each argument * \param flags flags for compilation (see #ga_usefl) * \param ret error return pointer * \param err_str returns pointer to debug message from GPU backend diff --git a/src/gpuarray/buffer_collectives.h b/src/gpuarray/buffer_collectives.h index a7b10b3d19..5b0de2e405 100644 --- a/src/gpuarray/buffer_collectives.h +++ b/src/gpuarray/buffer_collectives.h @@ -11,14 +11,12 @@ extern "C" { } #endif // CONFUSE_EMACS -/******************************************************************************* -* Multi-gpu collectives buffer interface * -*******************************************************************************/ +/***************************************************************************** +* Multi-gpu collectives buffer interface * +******************************************************************************/ /** * Multi-gpu 
communicator structure. - * - * \note The contents are private. */ struct _gpucomm; @@ -45,95 +43,108 @@ typedef struct _gpucommCliqueId { /** * \brief Create a new gpu communicator instance. - * \param comm [gpucomm**] pointer to get a new gpu communicator - * \param ctx [gpucontext*] gpu context in which `comm` will be used (contains - * device - * information) - * \param comm_id [gpucommCliqueId] id unique to communicators consisting a - * world - * \param ndev [int] number of communicators/devices participating in the world - * \param rank [int] user-defined rank, from 0 to `ndev`-1, of `comm` in the + * + * \param comm pointer to get a new gpu communicator + * \param ctx gpu context in which `comm` will be used (contains + * device information) + * \param comm_id id unique to communicators consisting a world + * \param ndev number of communicators/devices participating in the world + * \param rank user-defined rank, from 0 to `ndev`-1, of `comm` in the * world - * \note `rank` is defined to be unique for each new `comm` participating in the - * same - * world. - * \note Must be called in parallel by all separate new `comm`, which will - * consist a - * new world (failing will lead to deadlock). + * + * \note `rank` is defined to be unique for each new `comm` + * participating in the same world. + * + * \note Must be called in parallel by all separate new `comm`, which + * will consist a new world (failing will lead to deadlock). + * * \return int error code, \ref GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_new(gpucomm** comm, gpucontext* ctx, gpucommCliqueId comm_id, int ndev, int rank); /** - * \brief Destroy a gpu communicator instance. - * \param comm [gpucomm*] gpu communicator to be destroyed - * \return void + * Destroy a gpu communicator instance. + * + * \param comm gpu communicator to be destroyed */ GPUARRAY_PUBLIC void gpucomm_free(gpucomm* comm); /** - * \brief Returns nice error message concerning \ref GA_COMM_ERROR. 
- * \param ctx [gpucontext*] gpu context in which communicator was used + * Returns nice error message concerning \ref GA_COMM_ERROR. + * + * \param ctx gpu context in which communicator was used + * * \return const char* useful backend error message */ GPUARRAY_PUBLIC const char* gpucomm_error(gpucontext* ctx); /** - * \brief Returns gpu context in which `comm` is used. - * \param comm [gpucomm*] gpu communicator + * Returns gpu context in which `comm` is used. + * + * \param comm gpu communicator + * * \return gpucontext* gpu context */ GPUARRAY_PUBLIC gpucontext* gpucomm_context(gpucomm* comm); /** - * \brief Creates a unique `comm_id` to be shared in a world of communicators. - * \param ctx [gpucontext*] gpu context - * \param comm_id [gpucommCliqueId*] pointer to instance containing id + * Creates a unique `comm_id` to be shared in a world of communicators. + * + * \param ctx gpu context + * \param comm_id pointer to instance containing id + * * \note Id is guaranteed to be unique across callers in a single host. + * * \return int error code, \ref GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_gen_clique_id(gpucontext* ctx, gpucommCliqueId* comm_id); /** - * \brief Returns total number of device/communicators participating in `comm`'s - * world. - * \param comm [gpucomm*] gpu communicator - * \param gpucount [int*] pointer to number of gpus in `comm`'s world + * Returns total number of device/communicators participating in + * `comm`'s world. + * + * \param comm gpu communicator + * \param gpucount pointer to number of gpus in `comm`'s world + * * \return int error code, \ref GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_get_count(gpucomm* comm, int* gpucount); /** - * \brief Returns rank of `comm` inside its world as defined by user upon + * Returns rank of `comm` inside its world as defined by user upon * creation. 
- * \param comm [gpucomm*] gpu communicator - * \param rank [int*] pointer to `comm`'s rank + * + * \param comm gpu communicator + * \param rank pointer to `comm`'s rank + * * \return int error code, \ref GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_get_rank(gpucomm* comm, int* rank); /** - * \brief Reduce collective operation for ranks in a communicator world [buffer - * level]. - * \param src [gpudata*] data in device's buffer to be reduced - * \param offsrc [size_t] memory offset after which data is saved in buffer + * Reduce collective operation for ranks in a communicator world + * [buffer level]. + * + * \param src data in device's buffer to be reduced + * \param offsrc memory offset after which data is saved in buffer * `src` - * \param dest [gpudata*] data in device's buffer to collect result - * \param offdest [size_t] memory offset after which data will be saved in - * buffer - * `dest` - * \param count [size_t] number of elements to be reduced in each array - * \param typecode [int] code for elements' data type, see \ref enum + * \param dest data in device's buffer to collect result + * \param offdest memory offset after which data will be saved in + * buffer `dest` + * \param count number of elements to be reduced in each array + * \param typecode code for elements' data type, see \ref enum * GPUARRAY_TYPES - * \param opcode [int] reduce operation code, see \ref enum _gpucomm_reduce_ops - * \param root [int] rank in `comm` which will collect result - * \param comm [gpucomm*] gpu communicator - * \note Non root ranks can call this, using a NULL `dest`. In this case, - * `offdest` - * will not be used. + * \param opcode reduce operation code, see \ref enum _gpucomm_reduce_ops + * \param root rank in `comm` which will collect result + * \param comm gpu communicator + * + * \note Non root ranks can call this, using a NULL `dest`. In this + * case, `offdest` will not be used. + * * \note Must be called separately for each rank in `comm`. 
+ * * \return int error code, \ref GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_reduce(gpudata* src, size_t offsrc, gpudata* dest, @@ -141,27 +152,27 @@ GPUARRAY_PUBLIC int gpucomm_reduce(gpudata* src, size_t offsrc, gpudata* dest, int opcode, int root, gpucomm* comm); /** - * \brief AllReduce collective operation for ranks in a communicator world - * [buffer - * level]. + * AllReduce collective operation for ranks in a communicator world + * [buffer level]. * - * Reduces data pointed by `src` using op operation and leaves identical copies - * of - * result in data pointed by `dest` on each rank of `comm`. + * Reduces data pointed by `src` using op operation and leaves + * identical copies of result in data pointed by `dest` on each rank + * of `comm`. * - * \param src [gpudata*] data in device's buffer to be reduced - * \param offsrc [size_t] memory offset after which data is saved in buffer + * \param src data in device's buffer to be reduced + * \param offsrc memory offset after which data is saved in buffer * `src` - * \param dest [gpudata*] data in device's buffer to collect result - * \param offdest [size_t] memory offset after which data will be saved in - * buffer - * `dest` - * \param count [size_t] number of elements to be reduced in each array - * \param typecode [int] code for elements' data type, see \ref enum + * \param dest data in device's buffer to collect result + * \param offdest memory offset after which data will be saved in + * buffer `dest` + * \param count number of elements to be reduced in each array + * \param typecode code for elements' data type, see \ref enum * GPUARRAY_TYPES - * \param opcode [int] reduce operation code, see \ref enum _gpucomm_reduce_ops - * \param comm [gpucomm*] gpu communicator + * \param opcode reduce operation code, see \ref enum _gpucomm_reduce_ops + * \param comm gpu communicator + * * \note Must be called separately for each rank in `comm`. 
+ * * \return int error code, \ref GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_all_reduce(gpudata* src, size_t offsrc, @@ -170,27 +181,27 @@ GPUARRAY_PUBLIC int gpucomm_all_reduce(gpudata* src, size_t offsrc, gpucomm* comm); /** - * \brief ReduceScatter collective operation for ranks in a communicator world - * [buffer level]. + * ReduceScatter collective operation for ranks in a communicator + * world [buffer level]. * - * Reduces data pointed by `src` using `opcode` operation and leaves reduced - * result - * scattered over data pointed by `dest` in the user-defined rank order in - * `comm`. + * Reduces data pointed by `src` using `opcode` operation and leaves + * reduced result scattered over data pointed by `dest` in the + * user-defined rank order in `comm`. * - * \param src [gpudata*] data in device's buffer to be reduced - * \param offsrc [size_t] memory offset after which data is saved in buffer + * \param src data in device's buffer to be reduced + * \param offsrc memory offset after which data is saved in buffer * `src` - * \param dest [gpudata*] data in device's buffer to collect scattered result - * \param offdest [size_t] memory offset after which data will be saved in - * buffer - * `dest` - * \param count [size_t] number of elements to be contained in result `dest` - * \param typecode [int] code for elements' data type, see \ref enum + * \param dest data in device's buffer to collect scattered result + * \param offdest memory offset after which data will be saved in + * buffer `dest` + * \param count number of elements to be contained in result `dest` + * \param typecode code for elements' data type, see \ref enum * GPUARRAY_TYPES - * \param opcode [int] reduce operation code, see \ref enum _gpucomm_reduce_ops - * \param comm [gpucomm*] gpu communicator + * \param opcode reduce operation code, see \ref enum _gpucomm_reduce_ops + * \param comm gpu communicator + * * \note Must be called separately for each rank in `comm`. 
+ * * \return int error code, \ref GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_reduce_scatter(gpudata* src, size_t offsrc, @@ -199,20 +210,21 @@ GPUARRAY_PUBLIC int gpucomm_reduce_scatter(gpudata* src, size_t offsrc, int opcode, gpucomm* comm); /** - * \brief Broadcast collective operation for ranks in a communicator world - * [buffer - * level]. + * Broadcast collective operation for ranks in a communicator world + * [buffer level]. * * Copies data pointed by `array` to all ranks in `comm`. * - * \param array [gpudata*] data in device's buffer to get copied or be received - * \param offset [size_t] memory offset after which data in `array` begin - * \param count [size_t] number of elements to be contained in `array` - * \param typecode [int] code for elements' data type, see \ref enum + * \param array data in device's buffer to get copied or be received + * \param offset memory offset after which data in `array` begin + * \param count number of elements to be contained in `array` + * \param typecode code for elements' data type, see \ref enum * GPUARRAY_TYPES - * \param root [int] rank in `comm` which broadcasts its array - * \param comm [gpucomm*] gpu communicator + * \param root rank in `comm` which broadcasts its array + * \param comm gpu communicator + * * \note Must be called separately for each rank in `comm`. + * * \return int error code, \ref GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_broadcast(gpudata* array, size_t offset, @@ -220,22 +232,23 @@ GPUARRAY_PUBLIC int gpucomm_broadcast(gpudata* array, size_t offset, gpucomm* comm); /** - * \brief AllGather collective operation for ranks in a communicator world. + * AllGather collective operation for ranks in a communicator world. * * Each rank receives all data pointed by `src` of every rank in the - * user-defined - * rank order in `comm`. 
- * - * \param src [gpudata*] data in device's buffer to be gathered - * \param offsrc [size_t] memory offset after which data in `src` begin - * \param dest [gpudata*] data in device's buffer to gather from all ranks - * \param offdest [size_t] memory offset after which data in `dest` begin - * \param count [size_t] number of elements to be gathered from each rank in + * user-defined rank order in `comm`. + * + * \param src data in device's buffer to be gathered + * \param offsrc memory offset after which data in `src` begin + * \param dest data in device's buffer to gather from all ranks + * \param offdest memory offset after which data in `dest` begin + * \param count number of elements to be gathered from each rank in * `src` - * \param typecode [int] code for elements' data type, see \ref enum + * \param typecode code for elements' data type, see \ref enum * GPUARRAY_TYPES - * \param comm [gpucomm*] gpu communicator + * \param comm gpu communicator + * * \note Must be called separately for each rank in `comm`. 
+ * * \return int error code, \ref GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_all_gather(gpudata* src, size_t offsrc, From b4f748a7a6c4dbe8293e3ead9c787cdafd376399 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 21 Feb 2017 16:17:16 -0500 Subject: [PATCH 281/597] Make doc formatting less fugly in collectives.h --- src/gpuarray/buffer_collectives.h | 21 +++--- src/gpuarray/collectives.h | 104 +++++++++++++++++------------- 2 files changed, 66 insertions(+), 59 deletions(-) diff --git a/src/gpuarray/buffer_collectives.h b/src/gpuarray/buffer_collectives.h index 5b0de2e405..ff825a3209 100644 --- a/src/gpuarray/buffer_collectives.h +++ b/src/gpuarray/buffer_collectives.h @@ -134,9 +134,8 @@ GPUARRAY_PUBLIC int gpucomm_get_rank(gpucomm* comm, int* rank); * \param offdest memory offset after which data will be saved in * buffer `dest` * \param count number of elements to be reduced in each array - * \param typecode code for elements' data type, see \ref enum - * GPUARRAY_TYPES - * \param opcode reduce operation code, see \ref enum _gpucomm_reduce_ops + * \param typecode code for elements' data type, see \ref GPUARRAY_TYPES + * \param opcode reduce operation code, see \ref _gpucomm_reduce_ops * \param root rank in `comm` which will collect result * \param comm gpu communicator * @@ -166,9 +165,8 @@ GPUARRAY_PUBLIC int gpucomm_reduce(gpudata* src, size_t offsrc, gpudata* dest, * \param offdest memory offset after which data will be saved in * buffer `dest` * \param count number of elements to be reduced in each array - * \param typecode code for elements' data type, see \ref enum - * GPUARRAY_TYPES - * \param opcode reduce operation code, see \ref enum _gpucomm_reduce_ops + * \param typecode code for elements' data type, see \ref GPUARRAY_TYPES + * \param opcode reduce operation code, see \ref _gpucomm_reduce_ops * \param comm gpu communicator * * \note Must be called separately for each rank in `comm`. 
@@ -195,9 +193,8 @@ GPUARRAY_PUBLIC int gpucomm_all_reduce(gpudata* src, size_t offsrc, * \param offdest memory offset after which data will be saved in * buffer `dest` * \param count number of elements to be contained in result `dest` - * \param typecode code for elements' data type, see \ref enum - * GPUARRAY_TYPES - * \param opcode reduce operation code, see \ref enum _gpucomm_reduce_ops + * \param typecode code for elements' data type, see \ref GPUARRAY_TYPES + * \param opcode reduce operation code, see \ref _gpucomm_reduce_ops * \param comm gpu communicator * * \note Must be called separately for each rank in `comm`. @@ -218,8 +215,7 @@ GPUARRAY_PUBLIC int gpucomm_reduce_scatter(gpudata* src, size_t offsrc, * \param array data in device's buffer to get copied or be received * \param offset memory offset after which data in `array` begin * \param count number of elements to be contained in `array` - * \param typecode code for elements' data type, see \ref enum - * GPUARRAY_TYPES + * \param typecode code for elements' data type, see \ref GPUARRAY_TYPES * \param root rank in `comm` which broadcasts its array * \param comm gpu communicator * @@ -243,8 +239,7 @@ GPUARRAY_PUBLIC int gpucomm_broadcast(gpudata* array, size_t offset, * \param offdest memory offset after which data in `dest` begin * \param count number of elements to be gathered from each rank in * `src` - * \param typecode code for elements' data type, see \ref enum - * GPUARRAY_TYPES + * \param typecode code for elements' data type, see \ref GPUARRAY_TYPES * \param comm gpu communicator * * \note Must be called separately for each rank in `comm`. 
diff --git a/src/gpuarray/collectives.h b/src/gpuarray/collectives.h index e9410c4c92..fcc7e65919 100644 --- a/src/gpuarray/collectives.h +++ b/src/gpuarray/collectives.h @@ -12,102 +12,114 @@ extern "C" { } #endif // CONFUSE_EMACS -/******************************************************************************* -* Multi-gpu collectives interface * -*******************************************************************************/ +/***************************************************************************** +* Multi-gpu collectives interface * +******************************************************************************/ /** - * \brief Reduce collective operation for non root participant ranks in a + * Reduce collective operation for non root participant ranks in a * communicator world. - * \param src [const GpuArray*] array to be reduced - * \param opcode [int] reduce operation code, see \ref enum _gpucomm_reduce_ops - * \param root [int] rank in `comm` which will collect result - * \param comm [gpucomm*] gpu communicator - * \note Root rank of reduce operation must call \ref GpuArray_reduce. + * + * \param src array to be reduced + * \param opcode reduce operation code, see \ref _gpucomm_reduce_ops + * \param root rank in `comm` which will collect result + * \param comm gpu communicator + * + * \note Root rank of reduce operation must call GpuArray_reduce(). * \note Must be called separately for each rank in `comm`, except root rank. + * * \return int error code, \ref GA_NO_ERROR if success */ GPUARRAY_PUBLIC int GpuArray_reduce_from(const GpuArray* src, int opcode, int root, gpucomm* comm); /** - * \brief Reduce collective operation for ranks in a communicator world. 
- * \param src [const GpuArray*] array to be reduced - * \param dest [GpuArray*] array to collect reduce operation result - * \param opcode [int] reduce operation code, see \ref enum _gpucomm_reduce_ops - * \param root [int] rank in `comm` which will collect result - * \param comm [gpucomm*] gpu communicator + * Reduce collective operation for ranks in a communicator world. + * + * \param src array to be reduced + * \param dest array to collect reduce operation result + * \param opcode reduce operation code, see \ref _gpucomm_reduce_ops + * \param root rank in `comm` which will collect result + * \param comm gpu communicator + * * \note Can be used by root and non root ranks alike. + * * \note Non root ranks can call this, using a NULL `dest`. - * \note Must be called separately for each rank in `comm` (non root can call - * \ref - * GpuArray_reduce_from instead). + * + * \note Must be called separately for each rank in `comm` (non root + * can call GpuArray_reduce_from() instead). + * * \return int error code, \ref GA_NO_ERROR if success */ GPUARRAY_PUBLIC int GpuArray_reduce(const GpuArray* src, GpuArray* dest, int opcode, int root, gpucomm* comm); /** - * \brief AllReduce collective operation for ranks in a communicator world. + * AllReduce collective operation for ranks in a communicator world. * - * Reduces `src` using op operation and leaves identical copies of result in - * `dest` - * on each rank of `comm`. + * Reduces `src` using op operation and leaves identical copies of + * result in `dest` on each rank of `comm`. 
+ * + * \param src array to be reduced + * \param dest array to collect reduce operation result + * \param opcode reduce operation code, see \ref _gpucomm_reduce_ops + * \param comm gpu communicator * - * \param src [const GpuArray*] array to be reduced - * \param dest [GpuArray*] array to collect reduce operation result - * \param opcode [int] reduce operation code, see \ref enum _gpucomm_reduce_ops - * \param comm [gpucomm*] gpu communicator * \note Must be called separately for each rank in `comm`. + * * \return int error code, \ref GA_NO_ERROR if success */ GPUARRAY_PUBLIC int GpuArray_all_reduce(const GpuArray* src, GpuArray* dest, int opcode, gpucomm* comm); /** - * \brief ReduceScatter collective operation for ranks in a communicator world. + * ReduceScatter collective operation for ranks in a communicator world. * - * Reduces data in `src` using `opcode` operation and leaves reduced result - * scattered - * over `dest` in the user-defined rank order in `comm`. + * Reduces data in `src` using `opcode` operation and leaves reduced + * result scattered over `dest` in the user-defined rank order in + * `comm`. + * + * \param src array to be reduced + * \param dest array to collect reduce operation scattered result + * \param opcode reduce operation code, see \ref _gpucomm_reduce_ops + * \param comm gpu communicator * - * \param src [const GpuArray*] array to be reduced - * \param dest [GpuArray*] array to collect reduce operation scattered result - * \param opcode [int] reduce operation code, see \ref enum _gpucomm_reduce_ops - * \param comm [gpucomm*] gpu communicator * \note Must be called separately for each rank in `comm`. + * * \return int error code, \ref GA_NO_ERROR if success */ GPUARRAY_PUBLIC int GpuArray_reduce_scatter(const GpuArray* src, GpuArray* dest, int opcode, gpucomm* comm); /** - * \brief Broadcast collective operation for ranks in a communicator world. + * Broadcast collective operation for ranks in a communicator world. 
* * Copies `array` to all ranks in `comm`. * - * \param array [GpuArray*] array to be broadcasted, if root rank, else to - * receive - * \param root [int] rank in `comm` which broadcasts its array - * \param comm [gpucomm*] gpu communicator + * \param array array to be broadcasted, if root rank, else to receive + * \param root rank in `comm` which broadcasts its array + * \param comm gpu communicator + * * \note Must be called separately for each rank in `comm`. + * * \return int error code, \ref GA_NO_ERROR if success */ GPUARRAY_PUBLIC int GpuArray_broadcast(GpuArray* array, int root, gpucomm* comm); /** - * \brief AllGather collective operation for ranks in a communicator world. + * AllGather collective operation for ranks in a communicator world. * - * Each rank receives all `src` arrays from every rank in the user-defined rank - * order - * in `comm`. + * Each rank receives all `src` arrays from every rank in the + * user-defined rank order in `comm`. * - * \param src [const GpuArray*] array to be gathered - * \param dest [GpuArray*] array to receive all gathered arrays from ranks in + * \param src array to be gathered + * \param dest array to receive all gathered arrays from ranks in * `comm` - * \param comm [gpucomm*] gpu communicator + * \param comm gpu communicator + * * \note Must be called separately for each rank in `comm`. + * * \return int error code, \ref GA_NO_ERROR if success */ GPUARRAY_PUBLIC int GpuArray_all_gather(const GpuArray* src, GpuArray* dest, From 0a6813d9bdfaa18057432d2e9cb4f267ebf27211 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 21 Feb 2017 16:20:57 -0500 Subject: [PATCH 282/597] Adjust doc to match code. 
--- src/gpuarray/kernel.h | 4 +++- src/util/integerfactoring.h | 3 --- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/gpuarray/kernel.h b/src/gpuarray/kernel.h index f88d74ffc6..1f42c3ed64 100644 --- a/src/gpuarray/kernel.h +++ b/src/gpuarray/kernel.h @@ -41,6 +41,8 @@ typedef struct _GpuKernel { * \param strs C array of source code strings * \param lens C array with the size of each string or NULL * \param name name of the kernel function + * \param argcount number of kernel arguments + * \param types typecode for each argument * \param flags kernel use flags (see \ref ga_usefl) * \param err_str (if not NULL) location to write GPU-backend provided debug info * @@ -100,7 +102,7 @@ GPUARRAY_PUBLIC int GpuKernel_sched(GpuKernel *k, size_t n, * \param n dimensionality of the grid/blocks * \param gs sizes of launch grid * \param ls sizes of launch blocks - * \param amount of dynamic shared memory to allocate + * \param shared amount of dynamic shared memory to allocate * \param args table of pointers to arguments */ GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, unsigned int n, diff --git a/src/util/integerfactoring.h b/src/util/integerfactoring.h index 4611ea4c87..0ca4c14f68 100644 --- a/src/util/integerfactoring.h +++ b/src/util/integerfactoring.h @@ -244,9 +244,6 @@ void gaIFLappend(strb *sb, const ga_factor_list* fl); * @param [in,out] factBS The block size for dimensions 0..n-1, as a factor list. * @param [in,out] factGS The grid size for dimensions 0..n-1, as a factor list. * @param [in,out] factCS The chunk size for dimensions 0..n-1, as a factor list. - * @param [in,out] bs The block size for dimensions 0..n-1, as an integer. - * @param [in,out] gs The grid size for dimensions 0..n-1, as an integer. - * @param [in,out] cs The chunk size for dimensions 0..n-1, as an integer.
*/ void gaIFLSchedule(const int n, From c3583249eddf4211b982f7a4115fc3b2a24fc08c Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 21 Feb 2017 16:34:06 -0500 Subject: [PATCH 283/597] Rework the C api docs so it's not all bunched on one page. --- doc/Doxyfile | 3 ++- doc/c_api.rst | 7 +++++-- doc/c_api/file/abi__version_8h.rst | 4 ++++ doc/c_api/file/array_8h.rst | 4 ++++ doc/c_api/file/blas_8h.rst | 4 ++++ doc/c_api/file/buffer_8h.rst | 4 ++++ doc/c_api/file/buffer__blas_8h.rst | 4 ++++ doc/c_api/file/buffer__collectives_8h.rst | 4 ++++ doc/c_api/file/cache_8h.rst | 4 ++++ doc/c_api/file/collectives_8h.rst | 4 ++++ doc/c_api/file/config_8h.rst | 4 ++++ doc/c_api/file/dyn__load_8h.rst | 4 ++++ doc/c_api/file/elemwise_8h.rst | 4 ++++ doc/c_api/file/error_8h.rst | 4 ++++ doc/c_api/file/ext__cuda_8h.rst | 4 ++++ doc/c_api/file/extension_8h.rst | 4 ++++ doc/c_api/file/integerfactoring_8h.rst | 4 ++++ doc/c_api/file/kernel_8h.rst | 4 ++++ doc/c_api/file/libclblas_8h.rst | 4 ++++ doc/c_api/file/libclblast_8h.rst | 4 ++++ doc/c_api/file/libcublas_8h.rst | 4 ++++ doc/c_api/file/libcuda_8h.rst | 4 ++++ doc/c_api/file/libnccl_8h.rst | 4 ++++ doc/c_api/file/libnvrtc_8h.rst | 4 ++++ doc/c_api/file/libopencl_8h.rst | 4 ++++ doc/c_api/file/private_8h.rst | 4 ++++ doc/c_api/file/private__config_8h.rst | 4 ++++ doc/c_api/file/private__cuda_8h.rst | 4 ++++ doc/c_api/file/private__opencl_8h.rst | 4 ++++ doc/c_api/file/strb_8h.rst | 4 ++++ doc/c_api/file/types_8h.rst | 4 ++++ doc/c_api/file/util_8h.rst | 4 ++++ doc/c_api/file/xxhash_8h.rst | 4 ++++ doc/c_api/filelist.rst | 7 +++++++ doc/c_api/group/group__aflags.rst | 4 ++++ doc/c_api/group/group__alloc__flags.rst | 4 ++++ doc/c_api/group/group__context__flags.rst | 4 ++++ doc/c_api/group/group__eflags.rst | 4 ++++ doc/c_api/group/group__elem__call__flags.rst | 4 ++++ doc/c_api/group/group__elem__flags.rst | 4 ++++ doc/c_api/group/group__props.rst | 4 ++++ doc/c_api/grouplist.rst | 7 +++++++ doc/c_api/struct/struct__cache.rst 
| 4 ++++ doc/c_api/struct/struct__gpudata.rst | 4 ++++ doc/c_api/struct/struct__gpukernel.rst | 4 ++++ doc/c_api/struct/struct_c_uipc_mem_handle.rst | 4 ++++ doc/c_api/struct/struct_gpu_array.rst | 4 ++++ doc/c_api/struct/struct_gpu_array_ipc_mem_handle.rst | 4 ++++ doc/c_api/struct/struct_gpu_kernel.rst | 4 ++++ doc/c_api/struct/struct_x_x_h32__state__t.rst | 4 ++++ doc/c_api/struct/structcl__ctx.rst | 4 ++++ doc/c_api/struct/structcuda__context.rst | 4 ++++ doc/c_api/struct/structga__factor__list__.rst | 4 ++++ doc/c_api/struct/structgpuarray__type.rst | 4 ++++ doc/c_api/struct/structgpucomm_clique_id.rst | 4 ++++ doc/c_api/struct/structgpuelemwise__arg.rst | 4 ++++ doc/c_api/struct/structnccl_unique_id.rst | 4 ++++ doc/c_api/struct/structstrb.rst | 4 ++++ doc/c_api/structlist.rst | 7 +++++++ 59 files changed, 244 insertions(+), 3 deletions(-) create mode 100644 doc/c_api/file/abi__version_8h.rst create mode 100644 doc/c_api/file/array_8h.rst create mode 100644 doc/c_api/file/blas_8h.rst create mode 100644 doc/c_api/file/buffer_8h.rst create mode 100644 doc/c_api/file/buffer__blas_8h.rst create mode 100644 doc/c_api/file/buffer__collectives_8h.rst create mode 100644 doc/c_api/file/cache_8h.rst create mode 100644 doc/c_api/file/collectives_8h.rst create mode 100644 doc/c_api/file/config_8h.rst create mode 100644 doc/c_api/file/dyn__load_8h.rst create mode 100644 doc/c_api/file/elemwise_8h.rst create mode 100644 doc/c_api/file/error_8h.rst create mode 100644 doc/c_api/file/ext__cuda_8h.rst create mode 100644 doc/c_api/file/extension_8h.rst create mode 100644 doc/c_api/file/integerfactoring_8h.rst create mode 100644 doc/c_api/file/kernel_8h.rst create mode 100644 doc/c_api/file/libclblas_8h.rst create mode 100644 doc/c_api/file/libclblast_8h.rst create mode 100644 doc/c_api/file/libcublas_8h.rst create mode 100644 doc/c_api/file/libcuda_8h.rst create mode 100644 doc/c_api/file/libnccl_8h.rst create mode 100644 doc/c_api/file/libnvrtc_8h.rst create mode 100644 
doc/c_api/file/libopencl_8h.rst create mode 100644 doc/c_api/file/private_8h.rst create mode 100644 doc/c_api/file/private__config_8h.rst create mode 100644 doc/c_api/file/private__cuda_8h.rst create mode 100644 doc/c_api/file/private__opencl_8h.rst create mode 100644 doc/c_api/file/strb_8h.rst create mode 100644 doc/c_api/file/types_8h.rst create mode 100644 doc/c_api/file/util_8h.rst create mode 100644 doc/c_api/file/xxhash_8h.rst create mode 100644 doc/c_api/filelist.rst create mode 100644 doc/c_api/group/group__aflags.rst create mode 100644 doc/c_api/group/group__alloc__flags.rst create mode 100644 doc/c_api/group/group__context__flags.rst create mode 100644 doc/c_api/group/group__eflags.rst create mode 100644 doc/c_api/group/group__elem__call__flags.rst create mode 100644 doc/c_api/group/group__elem__flags.rst create mode 100644 doc/c_api/group/group__props.rst create mode 100644 doc/c_api/grouplist.rst create mode 100644 doc/c_api/struct/struct__cache.rst create mode 100644 doc/c_api/struct/struct__gpudata.rst create mode 100644 doc/c_api/struct/struct__gpukernel.rst create mode 100644 doc/c_api/struct/struct_c_uipc_mem_handle.rst create mode 100644 doc/c_api/struct/struct_gpu_array.rst create mode 100644 doc/c_api/struct/struct_gpu_array_ipc_mem_handle.rst create mode 100644 doc/c_api/struct/struct_gpu_kernel.rst create mode 100644 doc/c_api/struct/struct_x_x_h32__state__t.rst create mode 100644 doc/c_api/struct/structcl__ctx.rst create mode 100644 doc/c_api/struct/structcuda__context.rst create mode 100644 doc/c_api/struct/structga__factor__list__.rst create mode 100644 doc/c_api/struct/structgpuarray__type.rst create mode 100644 doc/c_api/struct/structgpucomm_clique_id.rst create mode 100644 doc/c_api/struct/structgpuelemwise__arg.rst create mode 100644 doc/c_api/struct/structnccl_unique_id.rst create mode 100644 doc/c_api/struct/structstrb.rst create mode 100644 doc/c_api/structlist.rst diff --git a/doc/Doxyfile b/doc/Doxyfile index 06b6dd84d4..ab5409d7a7 
100644 --- a/doc/Doxyfile +++ b/doc/Doxyfile @@ -684,7 +684,8 @@ RECURSIVE = YES # Note that relative paths are relative to the directory from which doxygen is # run. -EXCLUDE = ../src/gpuarray/wincompat ../src/gpuarray/compat.h +EXCLUDE = ../src/gpuarray/wincompat \ + ../src/gpuarray/compat.h # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded diff --git a/doc/c_api.rst b/doc/c_api.rst index cf3185f9a2..eb7921f14c 100644 --- a/doc/c_api.rst +++ b/doc/c_api.rst @@ -1,5 +1,8 @@ C library reference =================== -.. doxygenindex:: - :project: gpuarray +.. toctree:: + + c_api/grouplist + c_api/structlist + c_api/filelist diff --git a/doc/c_api/file/abi__version_8h.rst b/doc/c_api/file/abi__version_8h.rst new file mode 100644 index 0000000000..8961e38851 --- /dev/null +++ b/doc/c_api/file/abi__version_8h.rst @@ -0,0 +1,4 @@ +File abi_version.h +================== + +.. doxygenfile:: abi_version.h diff --git a/doc/c_api/file/array_8h.rst b/doc/c_api/file/array_8h.rst new file mode 100644 index 0000000000..5bc914a3ce --- /dev/null +++ b/doc/c_api/file/array_8h.rst @@ -0,0 +1,4 @@ +File array.h +============ + +.. doxygenfile:: array.h diff --git a/doc/c_api/file/blas_8h.rst b/doc/c_api/file/blas_8h.rst new file mode 100644 index 0000000000..a006abdd1e --- /dev/null +++ b/doc/c_api/file/blas_8h.rst @@ -0,0 +1,4 @@ +File blas.h +=========== + +.. doxygenfile:: blas.h diff --git a/doc/c_api/file/buffer_8h.rst b/doc/c_api/file/buffer_8h.rst new file mode 100644 index 0000000000..845af0606e --- /dev/null +++ b/doc/c_api/file/buffer_8h.rst @@ -0,0 +1,4 @@ +File buffer.h +============= + +.. doxygenfile:: buffer.h diff --git a/doc/c_api/file/buffer__blas_8h.rst b/doc/c_api/file/buffer__blas_8h.rst new file mode 100644 index 0000000000..cd654103d9 --- /dev/null +++ b/doc/c_api/file/buffer__blas_8h.rst @@ -0,0 +1,4 @@ +File buffer_blas.h +================== + +.. 
doxygenfile:: buffer_blas.h diff --git a/doc/c_api/file/buffer__collectives_8h.rst b/doc/c_api/file/buffer__collectives_8h.rst new file mode 100644 index 0000000000..053c71bab6 --- /dev/null +++ b/doc/c_api/file/buffer__collectives_8h.rst @@ -0,0 +1,4 @@ +File buffer_collectives.h +========================= + +.. doxygenfile:: buffer_collectives.h diff --git a/doc/c_api/file/cache_8h.rst b/doc/c_api/file/cache_8h.rst new file mode 100644 index 0000000000..2007da1420 --- /dev/null +++ b/doc/c_api/file/cache_8h.rst @@ -0,0 +1,4 @@ +File cache.h +============ + +.. doxygenfile:: cache.h diff --git a/doc/c_api/file/collectives_8h.rst b/doc/c_api/file/collectives_8h.rst new file mode 100644 index 0000000000..8e65a4367a --- /dev/null +++ b/doc/c_api/file/collectives_8h.rst @@ -0,0 +1,4 @@ +File collectives.h +================== + +.. doxygenfile:: collectives.h diff --git a/doc/c_api/file/config_8h.rst b/doc/c_api/file/config_8h.rst new file mode 100644 index 0000000000..24efd0ead4 --- /dev/null +++ b/doc/c_api/file/config_8h.rst @@ -0,0 +1,4 @@ +File config.h +============= + +.. doxygenfile:: config.h diff --git a/doc/c_api/file/dyn__load_8h.rst b/doc/c_api/file/dyn__load_8h.rst new file mode 100644 index 0000000000..a62f8a5ed2 --- /dev/null +++ b/doc/c_api/file/dyn__load_8h.rst @@ -0,0 +1,4 @@ +File dyn_load.h +=============== + +.. doxygenfile:: dyn_load.h diff --git a/doc/c_api/file/elemwise_8h.rst b/doc/c_api/file/elemwise_8h.rst new file mode 100644 index 0000000000..bb00feb808 --- /dev/null +++ b/doc/c_api/file/elemwise_8h.rst @@ -0,0 +1,4 @@ +File elemwise.h +=============== + +.. doxygenfile:: elemwise.h diff --git a/doc/c_api/file/error_8h.rst b/doc/c_api/file/error_8h.rst new file mode 100644 index 0000000000..c155101143 --- /dev/null +++ b/doc/c_api/file/error_8h.rst @@ -0,0 +1,4 @@ +File error.h +============ + +.. 
doxygenfile:: error.h diff --git a/doc/c_api/file/ext__cuda_8h.rst b/doc/c_api/file/ext__cuda_8h.rst new file mode 100644 index 0000000000..2ea42ec8a9 --- /dev/null +++ b/doc/c_api/file/ext__cuda_8h.rst @@ -0,0 +1,4 @@ +File ext_cuda.h +=============== + +.. doxygenfile:: ext_cuda.h diff --git a/doc/c_api/file/extension_8h.rst b/doc/c_api/file/extension_8h.rst new file mode 100644 index 0000000000..dcfed38cee --- /dev/null +++ b/doc/c_api/file/extension_8h.rst @@ -0,0 +1,4 @@ +File extension.h +================ + +.. doxygenfile:: extension.h diff --git a/doc/c_api/file/integerfactoring_8h.rst b/doc/c_api/file/integerfactoring_8h.rst new file mode 100644 index 0000000000..14aa37162c --- /dev/null +++ b/doc/c_api/file/integerfactoring_8h.rst @@ -0,0 +1,4 @@ +File integerfactoring.h +======================= + +.. doxygenfile:: integerfactoring.h diff --git a/doc/c_api/file/kernel_8h.rst b/doc/c_api/file/kernel_8h.rst new file mode 100644 index 0000000000..e5e0842696 --- /dev/null +++ b/doc/c_api/file/kernel_8h.rst @@ -0,0 +1,4 @@ +File kernel.h +============= + +.. doxygenfile:: kernel.h diff --git a/doc/c_api/file/libclblas_8h.rst b/doc/c_api/file/libclblas_8h.rst new file mode 100644 index 0000000000..e66d2043cc --- /dev/null +++ b/doc/c_api/file/libclblas_8h.rst @@ -0,0 +1,4 @@ +File libclblas.h +================ + +.. doxygenfile:: libclblas.h diff --git a/doc/c_api/file/libclblast_8h.rst b/doc/c_api/file/libclblast_8h.rst new file mode 100644 index 0000000000..6bb0971391 --- /dev/null +++ b/doc/c_api/file/libclblast_8h.rst @@ -0,0 +1,4 @@ +File libclblast.h +================= + +.. doxygenfile:: libclblast.h diff --git a/doc/c_api/file/libcublas_8h.rst b/doc/c_api/file/libcublas_8h.rst new file mode 100644 index 0000000000..47546c7598 --- /dev/null +++ b/doc/c_api/file/libcublas_8h.rst @@ -0,0 +1,4 @@ +File libcublas.h +================ + +.. 
doxygenfile:: libcublas.h diff --git a/doc/c_api/file/libcuda_8h.rst b/doc/c_api/file/libcuda_8h.rst new file mode 100644 index 0000000000..fbc0f3219a --- /dev/null +++ b/doc/c_api/file/libcuda_8h.rst @@ -0,0 +1,4 @@ +File libcuda.h +============== + +.. doxygenfile:: libcuda.h diff --git a/doc/c_api/file/libnccl_8h.rst b/doc/c_api/file/libnccl_8h.rst new file mode 100644 index 0000000000..1cb8111268 --- /dev/null +++ b/doc/c_api/file/libnccl_8h.rst @@ -0,0 +1,4 @@ +File libnccl.h +============== + +.. doxygenfile:: libnccl.h diff --git a/doc/c_api/file/libnvrtc_8h.rst b/doc/c_api/file/libnvrtc_8h.rst new file mode 100644 index 0000000000..7949afc243 --- /dev/null +++ b/doc/c_api/file/libnvrtc_8h.rst @@ -0,0 +1,4 @@ +File libnvrtc.h +=============== + +.. doxygenfile:: libnvrtc.h diff --git a/doc/c_api/file/libopencl_8h.rst b/doc/c_api/file/libopencl_8h.rst new file mode 100644 index 0000000000..a2a1b8e786 --- /dev/null +++ b/doc/c_api/file/libopencl_8h.rst @@ -0,0 +1,4 @@ +File libopencl.h +================ + +.. doxygenfile:: libopencl.h diff --git a/doc/c_api/file/private_8h.rst b/doc/c_api/file/private_8h.rst new file mode 100644 index 0000000000..9d6e0c0a03 --- /dev/null +++ b/doc/c_api/file/private_8h.rst @@ -0,0 +1,4 @@ +File private.h +============== + +.. doxygenfile:: private.h diff --git a/doc/c_api/file/private__config_8h.rst b/doc/c_api/file/private__config_8h.rst new file mode 100644 index 0000000000..3fdbb71246 --- /dev/null +++ b/doc/c_api/file/private__config_8h.rst @@ -0,0 +1,4 @@ +File private_config.h +===================== + +.. doxygenfile:: private_config.h diff --git a/doc/c_api/file/private__cuda_8h.rst b/doc/c_api/file/private__cuda_8h.rst new file mode 100644 index 0000000000..4ca763829b --- /dev/null +++ b/doc/c_api/file/private__cuda_8h.rst @@ -0,0 +1,4 @@ +File private_cuda.h +=================== + +.. 
doxygenfile:: private_cuda.h diff --git a/doc/c_api/file/private__opencl_8h.rst b/doc/c_api/file/private__opencl_8h.rst new file mode 100644 index 0000000000..6e71d1a67a --- /dev/null +++ b/doc/c_api/file/private__opencl_8h.rst @@ -0,0 +1,4 @@ +File private_opencl.h +===================== + +.. doxygenfile:: private_opencl.h diff --git a/doc/c_api/file/strb_8h.rst b/doc/c_api/file/strb_8h.rst new file mode 100644 index 0000000000..a87df4558b --- /dev/null +++ b/doc/c_api/file/strb_8h.rst @@ -0,0 +1,4 @@ +File strb.h +=========== + +.. doxygenfile:: strb.h diff --git a/doc/c_api/file/types_8h.rst b/doc/c_api/file/types_8h.rst new file mode 100644 index 0000000000..b7b6027f14 --- /dev/null +++ b/doc/c_api/file/types_8h.rst @@ -0,0 +1,4 @@ +File types.h +============ + +.. doxygenfile:: types.h diff --git a/doc/c_api/file/util_8h.rst b/doc/c_api/file/util_8h.rst new file mode 100644 index 0000000000..470b783b3d --- /dev/null +++ b/doc/c_api/file/util_8h.rst @@ -0,0 +1,4 @@ +File util.h +=========== + +.. doxygenfile:: util.h diff --git a/doc/c_api/file/xxhash_8h.rst b/doc/c_api/file/xxhash_8h.rst new file mode 100644 index 0000000000..9f69b14389 --- /dev/null +++ b/doc/c_api/file/xxhash_8h.rst @@ -0,0 +1,4 @@ +File xxhash.h +============= + +.. doxygenfile:: xxhash.h diff --git a/doc/c_api/filelist.rst b/doc/c_api/filelist.rst new file mode 100644 index 0000000000..78a5f5378b --- /dev/null +++ b/doc/c_api/filelist.rst @@ -0,0 +1,7 @@ +File list +========= + +.. toctree:: + :glob: + + file/* diff --git a/doc/c_api/group/group__aflags.rst b/doc/c_api/group/group__aflags.rst new file mode 100644 index 0000000000..36b85bf448 --- /dev/null +++ b/doc/c_api/group/group__aflags.rst @@ -0,0 +1,4 @@ +Group aflags +============ + +.. 
doxygengroup:: aflags diff --git a/doc/c_api/group/group__alloc__flags.rst b/doc/c_api/group/group__alloc__flags.rst new file mode 100644 index 0000000000..efdeb11288 --- /dev/null +++ b/doc/c_api/group/group__alloc__flags.rst @@ -0,0 +1,4 @@ +Group alloc_flags +================= + +.. doxygengroup:: alloc_flags diff --git a/doc/c_api/group/group__context__flags.rst b/doc/c_api/group/group__context__flags.rst new file mode 100644 index 0000000000..cc8073b5ad --- /dev/null +++ b/doc/c_api/group/group__context__flags.rst @@ -0,0 +1,4 @@ +Group context_flags +=================== + +.. doxygengroup:: context_flags diff --git a/doc/c_api/group/group__eflags.rst b/doc/c_api/group/group__eflags.rst new file mode 100644 index 0000000000..dd5b6e7495 --- /dev/null +++ b/doc/c_api/group/group__eflags.rst @@ -0,0 +1,4 @@ +Group eflags +============ + +.. doxygengroup:: eflags diff --git a/doc/c_api/group/group__elem__call__flags.rst b/doc/c_api/group/group__elem__call__flags.rst new file mode 100644 index 0000000000..82193e07bc --- /dev/null +++ b/doc/c_api/group/group__elem__call__flags.rst @@ -0,0 +1,4 @@ +Group elem_call_flags +===================== + +.. doxygengroup:: elem_call_flags diff --git a/doc/c_api/group/group__elem__flags.rst b/doc/c_api/group/group__elem__flags.rst new file mode 100644 index 0000000000..38323267a3 --- /dev/null +++ b/doc/c_api/group/group__elem__flags.rst @@ -0,0 +1,4 @@ +Group elem_flags +================ + +.. doxygengroup:: elem_flags diff --git a/doc/c_api/group/group__props.rst b/doc/c_api/group/group__props.rst new file mode 100644 index 0000000000..90a6db8859 --- /dev/null +++ b/doc/c_api/group/group__props.rst @@ -0,0 +1,4 @@ +Group props +=========== + +.. doxygengroup:: props diff --git a/doc/c_api/grouplist.rst b/doc/c_api/grouplist.rst new file mode 100644 index 0000000000..f63ef2179a --- /dev/null +++ b/doc/c_api/grouplist.rst @@ -0,0 +1,7 @@ +Group list +========== + +.. 
toctree:: + :glob: + + group/* diff --git a/doc/c_api/struct/struct__cache.rst b/doc/c_api/struct/struct__cache.rst new file mode 100644 index 0000000000..a320304595 --- /dev/null +++ b/doc/c_api/struct/struct__cache.rst @@ -0,0 +1,4 @@ +Struct _cache +============= + +.. doxygenstruct:: _cache diff --git a/doc/c_api/struct/struct__gpudata.rst b/doc/c_api/struct/struct__gpudata.rst new file mode 100644 index 0000000000..26f8cada5c --- /dev/null +++ b/doc/c_api/struct/struct__gpudata.rst @@ -0,0 +1,4 @@ +Struct _gpudata +=============== + +.. doxygenstruct:: _gpudata diff --git a/doc/c_api/struct/struct__gpukernel.rst b/doc/c_api/struct/struct__gpukernel.rst new file mode 100644 index 0000000000..e0da283eb5 --- /dev/null +++ b/doc/c_api/struct/struct__gpukernel.rst @@ -0,0 +1,4 @@ +Struct _gpukernel +================= + +.. doxygenstruct:: _gpukernel diff --git a/doc/c_api/struct/struct_c_uipc_mem_handle.rst b/doc/c_api/struct/struct_c_uipc_mem_handle.rst new file mode 100644 index 0000000000..85861c3f2a --- /dev/null +++ b/doc/c_api/struct/struct_c_uipc_mem_handle.rst @@ -0,0 +1,4 @@ +Struct CUipcMemHandle +===================== + +.. doxygenstruct:: CUipcMemHandle diff --git a/doc/c_api/struct/struct_gpu_array.rst b/doc/c_api/struct/struct_gpu_array.rst new file mode 100644 index 0000000000..f78a96907a --- /dev/null +++ b/doc/c_api/struct/struct_gpu_array.rst @@ -0,0 +1,4 @@ +Struct GpuArray +=============== + +.. doxygenstruct:: GpuArray diff --git a/doc/c_api/struct/struct_gpu_array_ipc_mem_handle.rst b/doc/c_api/struct/struct_gpu_array_ipc_mem_handle.rst new file mode 100644 index 0000000000..d33163598d --- /dev/null +++ b/doc/c_api/struct/struct_gpu_array_ipc_mem_handle.rst @@ -0,0 +1,4 @@ +Struct GpuArrayIpcMemHandle +=========================== + +.. 
doxygenstruct:: GpuArrayIpcMemHandle diff --git a/doc/c_api/struct/struct_gpu_kernel.rst b/doc/c_api/struct/struct_gpu_kernel.rst new file mode 100644 index 0000000000..67163dce93 --- /dev/null +++ b/doc/c_api/struct/struct_gpu_kernel.rst @@ -0,0 +1,4 @@ +Struct GpuKernel +================ + +.. doxygenstruct:: GpuKernel diff --git a/doc/c_api/struct/struct_x_x_h32__state__t.rst b/doc/c_api/struct/struct_x_x_h32__state__t.rst new file mode 100644 index 0000000000..ffa20d5c37 --- /dev/null +++ b/doc/c_api/struct/struct_x_x_h32__state__t.rst @@ -0,0 +1,4 @@ +Struct XXH32_state_t +==================== + +.. doxygenstruct:: XXH32_state_t diff --git a/doc/c_api/struct/structcl__ctx.rst b/doc/c_api/struct/structcl__ctx.rst new file mode 100644 index 0000000000..0134ea831a --- /dev/null +++ b/doc/c_api/struct/structcl__ctx.rst @@ -0,0 +1,4 @@ +Struct cl_ctx +============= + +.. doxygenstruct:: cl_ctx diff --git a/doc/c_api/struct/structcuda__context.rst b/doc/c_api/struct/structcuda__context.rst new file mode 100644 index 0000000000..7287747c61 --- /dev/null +++ b/doc/c_api/struct/structcuda__context.rst @@ -0,0 +1,4 @@ +Struct cuda_context +=================== + +.. doxygenstruct:: cuda_context diff --git a/doc/c_api/struct/structga__factor__list__.rst b/doc/c_api/struct/structga__factor__list__.rst new file mode 100644 index 0000000000..12b6ad9ad2 --- /dev/null +++ b/doc/c_api/struct/structga__factor__list__.rst @@ -0,0 +1,4 @@ +Struct ga_factor_list_ +====================== + +.. doxygenstruct:: ga_factor_list_ diff --git a/doc/c_api/struct/structgpuarray__type.rst b/doc/c_api/struct/structgpuarray__type.rst new file mode 100644 index 0000000000..3398c760b4 --- /dev/null +++ b/doc/c_api/struct/structgpuarray__type.rst @@ -0,0 +1,4 @@ +Struct gpuarray_type +==================== + +.. 
doxygenstruct:: gpuarray_type diff --git a/doc/c_api/struct/structgpucomm_clique_id.rst b/doc/c_api/struct/structgpucomm_clique_id.rst new file mode 100644 index 0000000000..890630f3db --- /dev/null +++ b/doc/c_api/struct/structgpucomm_clique_id.rst @@ -0,0 +1,4 @@ +Struct gpucommCliqueId +====================== + +.. doxygenstruct:: gpucommCliqueId diff --git a/doc/c_api/struct/structgpuelemwise__arg.rst b/doc/c_api/struct/structgpuelemwise__arg.rst new file mode 100644 index 0000000000..0266b89f52 --- /dev/null +++ b/doc/c_api/struct/structgpuelemwise__arg.rst @@ -0,0 +1,4 @@ +Struct gpuelemwise_arg +====================== + +.. doxygenstruct:: gpuelemwise_arg diff --git a/doc/c_api/struct/structnccl_unique_id.rst b/doc/c_api/struct/structnccl_unique_id.rst new file mode 100644 index 0000000000..4367a36a22 --- /dev/null +++ b/doc/c_api/struct/structnccl_unique_id.rst @@ -0,0 +1,4 @@ +Struct ncclUniqueId +=================== + +.. doxygenstruct:: ncclUniqueId diff --git a/doc/c_api/struct/structstrb.rst b/doc/c_api/struct/structstrb.rst new file mode 100644 index 0000000000..25d37a4bb0 --- /dev/null +++ b/doc/c_api/struct/structstrb.rst @@ -0,0 +1,4 @@ +Struct strb +=========== + +.. doxygenstruct:: strb diff --git a/doc/c_api/structlist.rst b/doc/c_api/structlist.rst new file mode 100644 index 0000000000..fc53f2fc04 --- /dev/null +++ b/doc/c_api/structlist.rst @@ -0,0 +1,7 @@ +Struct list +=========== + +.. 
toctree:: + :glob: + + struct/* From 4f326ad1ba5d4496c000a2461c9d53b0489eb381 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 21 Feb 2017 16:56:46 -0500 Subject: [PATCH 284/597] Hide documentation on internal stuff --- doc/Doxyfile | 3 ++- doc/c_api/file/private__config_8h.rst | 4 ---- src/loaders/libclblas.h | 6 ++++++ src/loaders/libclblast.h | 8 ++++++++ src/loaders/libcublas.h | 8 ++++++++ src/loaders/libcuda.h | 8 ++++++++ src/loaders/libnccl.h | 8 ++++++++ src/loaders/libnvrtc.h | 8 ++++++++ src/loaders/libopencl.h | 8 ++++++++ 9 files changed, 56 insertions(+), 5 deletions(-) delete mode 100644 doc/c_api/file/private__config_8h.rst diff --git a/doc/Doxyfile b/doc/Doxyfile index ab5409d7a7..f257e90bf0 100644 --- a/doc/Doxyfile +++ b/doc/Doxyfile @@ -685,7 +685,8 @@ RECURSIVE = YES # run. EXCLUDE = ../src/gpuarray/wincompat \ - ../src/gpuarray/compat.h + ../src/gpuarray/compat.h \ + ../src/private_config.h # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded diff --git a/doc/c_api/file/private__config_8h.rst b/doc/c_api/file/private__config_8h.rst deleted file mode 100644 index 3fdbb71246..0000000000 --- a/doc/c_api/file/private__config_8h.rst +++ /dev/null @@ -1,4 +0,0 @@ -File private_config.h -===================== - -.. 
doxygenfile:: private_config.h diff --git a/src/loaders/libclblas.h b/src/loaders/libclblas.h index ccdee19983..b30409bd81 100644 --- a/src/loaders/libclblas.h +++ b/src/loaders/libclblas.h @@ -4,6 +4,7 @@ #include "util/error.h" #include "libopencl.h" +/** @cond NEVER */ typedef enum clblasOrder_ { clblasRowMajor, clblasColumnMajor @@ -38,8 +39,11 @@ typedef enum clblasStatus_ { clblasInsufficientMemVecY, } clblasStatus; +/** @endcond */ + int load_libclblas(error *); +/** @cond NEVER */ #define DEF_PROC(ret, name, args) typedef ret t##name args #include "libclblas.fn" @@ -52,4 +56,6 @@ int load_libclblas(error *); #undef DEF_PROC +/** @endcond */ + #endif diff --git a/src/loaders/libclblast.h b/src/loaders/libclblast.h index 0608250631..07ea817260 100644 --- a/src/loaders/libclblast.h +++ b/src/loaders/libclblast.h @@ -4,6 +4,8 @@ #include "util/error.h" #include "libopencl.h" +/** @cond NEVER */ + typedef enum Layout_ { kRowMajor = 101, kColMajor = 102 @@ -46,8 +48,12 @@ typedef enum CLBLastStatusCode_ { CLBlastUnexpectedError = -2039, } CLBlastStatusCode; +/** @endcond */ + int load_libclblast(error *); +/** @cond NEVER */ + #define DEF_PROC(ret, name, args) typedef ret t##name args #include "libclblast.fn" @@ -60,4 +66,6 @@ int load_libclblast(error *); #undef DEF_PROC +/** @endcond */ + #endif diff --git a/src/loaders/libcublas.h b/src/loaders/libcublas.h index 83a6c8030f..a0cf9e5084 100644 --- a/src/loaders/libcublas.h +++ b/src/loaders/libcublas.h @@ -3,6 +3,8 @@ #include "util/error.h" +/** @cond NEVER */ + #ifdef _WIN32 #define CUBLASWINAPI __stdcall #else @@ -60,8 +62,12 @@ typedef enum { typedef struct cublasContext *cublasHandle_t; +/** @endcond */ + int load_libcublas(int major, int minor, error *e); +/** @cond NEVER */ + #define DEF_PROC(name, args) typedef cublasStatus_t CUBLASWINAPI t##name args #define DEF_PROC_V2(name, args) DEF_PROC(name, args) #define DEF_PROC_OPT(name, args) DEF_PROC(name, args) @@ -82,4 +88,6 @@ int load_libcublas(int major, 
int minor, error *e); #undef DEF_PROC_V2 #undef DEF_PROC +/** @endcond */ + #endif diff --git a/src/loaders/libcuda.h b/src/loaders/libcuda.h index 54b0f51c61..c8fc7ca968 100644 --- a/src/loaders/libcuda.h +++ b/src/loaders/libcuda.h @@ -3,6 +3,8 @@ #include "util/error.h" +/** @cond NEVER */ + #ifdef _WIN32 #define CUDAAPI __stdcall #else @@ -41,8 +43,12 @@ typedef struct CUipcMemHandle_st { char reserved[CU_IPC_HANDLE_SIZE]; } CUipcMemHandle; +/** @endcond */ + int load_libcuda(error *); +/** @cond NEVER */ + #define DEF_PROC(name, args) typedef CUresult CUDAAPI t##name args #define DEF_PROC_V2(name, args) DEF_PROC(name, args) @@ -219,4 +225,6 @@ enum CUjitInputType_enum { CU_JIT_NUM_INPUT_TYPES }; +/** @endcond */ + #endif diff --git a/src/loaders/libnccl.h b/src/loaders/libnccl.h index 8efb694b42..0139878c8f 100644 --- a/src/loaders/libnccl.h +++ b/src/loaders/libnccl.h @@ -3,6 +3,8 @@ #include "util/error.h" +/** @cond NEVER */ + typedef struct CUstream_st *cudaStream_t; typedef struct ncclComm* ncclComm_t; @@ -27,8 +29,12 @@ typedef enum { ncclChar = 0, ncclUint64 = 6, nccl_NUM_TYPES = 7 } ncclDataType_t; +/** @endcond */ + int load_libnccl(error *e); +/** @cond NEVER */ + #define DEF_PROC(ret, name, args) typedef ret t##name args #include "libnccl.fn" @@ -41,4 +47,6 @@ int load_libnccl(error *e); #undef DEF_PROC +/** @endcond */ + #endif diff --git a/src/loaders/libnvrtc.h b/src/loaders/libnvrtc.h index 5018830b4e..4dc2f3ec67 100644 --- a/src/loaders/libnvrtc.h +++ b/src/loaders/libnvrtc.h @@ -3,14 +3,20 @@ #include "util/error.h" +/** @cond NEVER */ + typedef enum { NVRTC_SUCCESS = 0, } nvrtcResult; typedef struct _nvrtcProgram *nvrtcProgram; +/** @endcond */ + int load_libnvrtc(int major, int minor, error *e); +/** @cond NEVER */ + #define DEF_PROC(rt, name, args) typedef rt t##name args #include "libnvrtc.fn" @@ -23,4 +29,6 @@ int load_libnvrtc(int major, int minor, error *e); #undef DEF_PROC +/** @endcond */ + #endif diff --git a/src/loaders/libopencl.h
b/src/loaders/libopencl.h index f2a5727cf6..9ed6f513d8 100644 --- a/src/loaders/libopencl.h +++ b/src/loaders/libopencl.h @@ -3,6 +3,8 @@ #include "util/error.h" +/** @cond NEVER */ + #if defined(_WIN32) #define CL_API_CALL __stdcall #define CL_CALLBACK __stdcall @@ -56,8 +58,12 @@ typedef cl_uint cl_program_build_info; typedef cl_uint cl_kernel_info; typedef cl_uint cl_kernel_work_group_info; +/** @endcond */ + int load_libopencl(error *); +/** @cond NEVER */ + #define DEF_PROC(ret, name, args) typedef ret CL_API_CALL t##name args #include "libopencl.fn" @@ -326,4 +332,6 @@ int load_libopencl(error *); #define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4 #define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5 +/** @endcond */ + #endif From 5a32c43149408dacb1cc8d344470240042927047 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 21 Feb 2017 17:03:03 -0500 Subject: [PATCH 285/597] Remove GA_POINTER, it won't work with anything anyway. --- src/gen_types.py | 1 - src/gpuarray/types.h | 1 - src/gpuarray_buffer_opencl.c | 2 -- 3 files changed, 4 deletions(-) diff --git a/src/gen_types.py b/src/gen_types.py index 3e14c9a4f2..0e87fe23f3 100644 --- a/src/gen_types.py +++ b/src/gen_types.py @@ -165,7 +165,6 @@ def add_type(name, sz): * List of all built-in types. */ enum GPUARRAY_TYPES { - GA_POINTER = -2, GA_BUFFER = -1, % for i, v in sorted(TYPEMAP.items()): GA_${v[1].upper()} = ${i}, diff --git a/src/gpuarray/types.h b/src/gpuarray/types.h index afd0df16e4..2fac29bb37 100644 --- a/src/gpuarray/types.h +++ b/src/gpuarray/types.h @@ -43,7 +43,6 @@ typedef struct _gpuarray_type { * List of all built-in types. 
*/ enum GPUARRAY_TYPES { - GA_POINTER = -2, GA_BUFFER = -1, GA_BOOL = 0, GA_BYTE = 1, diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 812d796920..e1491eeffa 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -1017,8 +1017,6 @@ static int cl_setkernelarg(gpukernel *k, unsigned int i, void *a) { cl_ulong temp; cl_long stemp; switch (k->types[i]) { - case GA_POINTER: - return error_set(ctx->err, GA_DEVSUP_ERROR, "Cannot set raw pointers as kernel arguments"); case GA_BUFFER: btmp = (gpudata *)a; CL_CHECK(ctx->err, clSetKernelArg(k->k, i, sizeof(cl_mem), &btmp->buf)); From 787db0eb087d029ea37a6317773c69887b589e7e Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 21 Feb 2017 17:08:57 -0500 Subject: [PATCH 286/597] Hide the definitions in ext_cuda as they aren't the "main" ones. --- src/gpuarray/ext_cuda.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gpuarray/ext_cuda.h b/src/gpuarray/ext_cuda.h index 689d0aa60a..4b6377fa2b 100644 --- a/src/gpuarray/ext_cuda.h +++ b/src/gpuarray/ext_cuda.h @@ -11,6 +11,7 @@ extern "C" { #endif +/** @cond NEVER */ static void (*cuda_enter)(gpucontext *); static void (*cuda_exit)(gpucontext *); static gpucontext *(*cuda_make_ctx)(CUcontext, int); @@ -22,6 +23,7 @@ static int (*cuda_record)(gpudata *, int); static CUipcMemHandle (*cuda_get_ipc_handle)(gpudata *d); static gpudata *(*cuda_open_ipc_handle)(gpucontext *c, CUipcMemHandle h, size_t sz); +/** @endcond */ static void setup_ext_cuda(void) { // The casts are necessary to reassure C++ compilers From 103c2a018e05ec26bddb7cfb94ab6466baff0a3c Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 21 Feb 2017 17:49:22 -0500 Subject: [PATCH 287/597] Get rid of part of the duplicate IDs. 
--- doc/c_api/group/group__aflags.rst | 1 + doc/c_api/group/group__alloc__flags.rst | 1 + doc/c_api/group/group__context__flags.rst | 1 + doc/c_api/group/group__eflags.rst | 1 + doc/c_api/group/group__elem__call__flags.rst | 1 + doc/c_api/group/group__elem__flags.rst | 1 + doc/c_api/group/group__props.rst | 1 + 7 files changed, 7 insertions(+) diff --git a/doc/c_api/group/group__aflags.rst b/doc/c_api/group/group__aflags.rst index 36b85bf448..352d021f77 100644 --- a/doc/c_api/group/group__aflags.rst +++ b/doc/c_api/group/group__aflags.rst @@ -2,3 +2,4 @@ Group aflags ============ .. doxygengroup:: aflags + :no-link: diff --git a/doc/c_api/group/group__alloc__flags.rst b/doc/c_api/group/group__alloc__flags.rst index efdeb11288..bdfb08cd9f 100644 --- a/doc/c_api/group/group__alloc__flags.rst +++ b/doc/c_api/group/group__alloc__flags.rst @@ -2,3 +2,4 @@ Group alloc_flags ================= .. doxygengroup:: alloc_flags + :no-link: diff --git a/doc/c_api/group/group__context__flags.rst b/doc/c_api/group/group__context__flags.rst index cc8073b5ad..7833253c3f 100644 --- a/doc/c_api/group/group__context__flags.rst +++ b/doc/c_api/group/group__context__flags.rst @@ -2,3 +2,4 @@ Group context_flags =================== .. doxygengroup:: context_flags + :no-link: diff --git a/doc/c_api/group/group__eflags.rst b/doc/c_api/group/group__eflags.rst index dd5b6e7495..4e7ae16da3 100644 --- a/doc/c_api/group/group__eflags.rst +++ b/doc/c_api/group/group__eflags.rst @@ -2,3 +2,4 @@ Group eflags ============ .. doxygengroup:: eflags + :no-link: diff --git a/doc/c_api/group/group__elem__call__flags.rst b/doc/c_api/group/group__elem__call__flags.rst index 82193e07bc..8aece59927 100644 --- a/doc/c_api/group/group__elem__call__flags.rst +++ b/doc/c_api/group/group__elem__call__flags.rst @@ -2,3 +2,4 @@ Group elem_call_flags ===================== .. 
doxygengroup:: elem_call_flags + :no-link: diff --git a/doc/c_api/group/group__elem__flags.rst b/doc/c_api/group/group__elem__flags.rst index 38323267a3..9134988bd3 100644 --- a/doc/c_api/group/group__elem__flags.rst +++ b/doc/c_api/group/group__elem__flags.rst @@ -2,3 +2,4 @@ Group elem_flags ================ .. doxygengroup:: elem_flags + :no-link: diff --git a/doc/c_api/group/group__props.rst b/doc/c_api/group/group__props.rst index 90a6db8859..cf950efe80 100644 --- a/doc/c_api/group/group__props.rst +++ b/doc/c_api/group/group__props.rst @@ -2,3 +2,4 @@ Group props =========== .. doxygengroup:: props + :no-link: From 6cbf1e33bee1f1b18ec8493ea6793715e073c754 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 21 Feb 2017 17:53:11 -0500 Subject: [PATCH 288/597] Struct listing is useless and the source of duplicate IDs --- doc/c_api.rst | 1 - doc/c_api/struct/struct__cache.rst | 4 ---- doc/c_api/struct/struct__gpudata.rst | 4 ---- doc/c_api/struct/struct__gpukernel.rst | 4 ---- doc/c_api/struct/struct_c_uipc_mem_handle.rst | 4 ---- doc/c_api/struct/struct_gpu_array.rst | 4 ---- doc/c_api/struct/struct_gpu_array_ipc_mem_handle.rst | 4 ---- doc/c_api/struct/struct_gpu_kernel.rst | 4 ---- doc/c_api/struct/struct_x_x_h32__state__t.rst | 4 ---- doc/c_api/struct/structcl__ctx.rst | 4 ---- doc/c_api/struct/structcuda__context.rst | 4 ---- doc/c_api/struct/structga__factor__list__.rst | 4 ---- doc/c_api/struct/structgpuarray__type.rst | 4 ---- doc/c_api/struct/structgpucomm_clique_id.rst | 4 ---- doc/c_api/struct/structgpuelemwise__arg.rst | 4 ---- doc/c_api/struct/structnccl_unique_id.rst | 4 ---- doc/c_api/struct/structstrb.rst | 4 ---- doc/c_api/structlist.rst | 7 ------- 18 files changed, 72 deletions(-) delete mode 100644 doc/c_api/struct/struct__cache.rst delete mode 100644 doc/c_api/struct/struct__gpudata.rst delete mode 100644 doc/c_api/struct/struct__gpukernel.rst delete mode 100644 doc/c_api/struct/struct_c_uipc_mem_handle.rst delete mode 100644 
doc/c_api/struct/struct_gpu_array.rst delete mode 100644 doc/c_api/struct/struct_gpu_array_ipc_mem_handle.rst delete mode 100644 doc/c_api/struct/struct_gpu_kernel.rst delete mode 100644 doc/c_api/struct/struct_x_x_h32__state__t.rst delete mode 100644 doc/c_api/struct/structcl__ctx.rst delete mode 100644 doc/c_api/struct/structcuda__context.rst delete mode 100644 doc/c_api/struct/structga__factor__list__.rst delete mode 100644 doc/c_api/struct/structgpuarray__type.rst delete mode 100644 doc/c_api/struct/structgpucomm_clique_id.rst delete mode 100644 doc/c_api/struct/structgpuelemwise__arg.rst delete mode 100644 doc/c_api/struct/structnccl_unique_id.rst delete mode 100644 doc/c_api/struct/structstrb.rst delete mode 100644 doc/c_api/structlist.rst diff --git a/doc/c_api.rst b/doc/c_api.rst index eb7921f14c..d7bbadd5b6 100644 --- a/doc/c_api.rst +++ b/doc/c_api.rst @@ -4,5 +4,4 @@ C library reference .. toctree:: c_api/grouplist - c_api/structlist c_api/filelist diff --git a/doc/c_api/struct/struct__cache.rst b/doc/c_api/struct/struct__cache.rst deleted file mode 100644 index a320304595..0000000000 --- a/doc/c_api/struct/struct__cache.rst +++ /dev/null @@ -1,4 +0,0 @@ -Struct _cache -============= - -.. doxygenstruct:: _cache diff --git a/doc/c_api/struct/struct__gpudata.rst b/doc/c_api/struct/struct__gpudata.rst deleted file mode 100644 index 26f8cada5c..0000000000 --- a/doc/c_api/struct/struct__gpudata.rst +++ /dev/null @@ -1,4 +0,0 @@ -Struct _gpudata -=============== - -.. doxygenstruct:: _gpudata diff --git a/doc/c_api/struct/struct__gpukernel.rst b/doc/c_api/struct/struct__gpukernel.rst deleted file mode 100644 index e0da283eb5..0000000000 --- a/doc/c_api/struct/struct__gpukernel.rst +++ /dev/null @@ -1,4 +0,0 @@ -Struct _gpukernel -================= - -.. 
doxygenstruct:: _gpukernel diff --git a/doc/c_api/struct/struct_c_uipc_mem_handle.rst b/doc/c_api/struct/struct_c_uipc_mem_handle.rst deleted file mode 100644 index 85861c3f2a..0000000000 --- a/doc/c_api/struct/struct_c_uipc_mem_handle.rst +++ /dev/null @@ -1,4 +0,0 @@ -Struct CUipcMemHandle -===================== - -.. doxygenstruct:: CUipcMemHandle diff --git a/doc/c_api/struct/struct_gpu_array.rst b/doc/c_api/struct/struct_gpu_array.rst deleted file mode 100644 index f78a96907a..0000000000 --- a/doc/c_api/struct/struct_gpu_array.rst +++ /dev/null @@ -1,4 +0,0 @@ -Struct GpuArray -=============== - -.. doxygenstruct:: GpuArray diff --git a/doc/c_api/struct/struct_gpu_array_ipc_mem_handle.rst b/doc/c_api/struct/struct_gpu_array_ipc_mem_handle.rst deleted file mode 100644 index d33163598d..0000000000 --- a/doc/c_api/struct/struct_gpu_array_ipc_mem_handle.rst +++ /dev/null @@ -1,4 +0,0 @@ -Struct GpuArrayIpcMemHandle -=========================== - -.. doxygenstruct:: GpuArrayIpcMemHandle diff --git a/doc/c_api/struct/struct_gpu_kernel.rst b/doc/c_api/struct/struct_gpu_kernel.rst deleted file mode 100644 index 67163dce93..0000000000 --- a/doc/c_api/struct/struct_gpu_kernel.rst +++ /dev/null @@ -1,4 +0,0 @@ -Struct GpuKernel -================ - -.. doxygenstruct:: GpuKernel diff --git a/doc/c_api/struct/struct_x_x_h32__state__t.rst b/doc/c_api/struct/struct_x_x_h32__state__t.rst deleted file mode 100644 index ffa20d5c37..0000000000 --- a/doc/c_api/struct/struct_x_x_h32__state__t.rst +++ /dev/null @@ -1,4 +0,0 @@ -Struct XXH32_state_t -==================== - -.. doxygenstruct:: XXH32_state_t diff --git a/doc/c_api/struct/structcl__ctx.rst b/doc/c_api/struct/structcl__ctx.rst deleted file mode 100644 index 0134ea831a..0000000000 --- a/doc/c_api/struct/structcl__ctx.rst +++ /dev/null @@ -1,4 +0,0 @@ -Struct cl_ctx -============= - -.. 
doxygenstruct:: cl_ctx diff --git a/doc/c_api/struct/structcuda__context.rst b/doc/c_api/struct/structcuda__context.rst deleted file mode 100644 index 7287747c61..0000000000 --- a/doc/c_api/struct/structcuda__context.rst +++ /dev/null @@ -1,4 +0,0 @@ -Struct cuda_context -=================== - -.. doxygenstruct:: cuda_context diff --git a/doc/c_api/struct/structga__factor__list__.rst b/doc/c_api/struct/structga__factor__list__.rst deleted file mode 100644 index 12b6ad9ad2..0000000000 --- a/doc/c_api/struct/structga__factor__list__.rst +++ /dev/null @@ -1,4 +0,0 @@ -Struct ga_factor_list_ -====================== - -.. doxygenstruct:: ga_factor_list_ diff --git a/doc/c_api/struct/structgpuarray__type.rst b/doc/c_api/struct/structgpuarray__type.rst deleted file mode 100644 index 3398c760b4..0000000000 --- a/doc/c_api/struct/structgpuarray__type.rst +++ /dev/null @@ -1,4 +0,0 @@ -Struct gpuarray_type -==================== - -.. doxygenstruct:: gpuarray_type diff --git a/doc/c_api/struct/structgpucomm_clique_id.rst b/doc/c_api/struct/structgpucomm_clique_id.rst deleted file mode 100644 index 890630f3db..0000000000 --- a/doc/c_api/struct/structgpucomm_clique_id.rst +++ /dev/null @@ -1,4 +0,0 @@ -Struct gpucommCliqueId -====================== - -.. doxygenstruct:: gpucommCliqueId diff --git a/doc/c_api/struct/structgpuelemwise__arg.rst b/doc/c_api/struct/structgpuelemwise__arg.rst deleted file mode 100644 index 0266b89f52..0000000000 --- a/doc/c_api/struct/structgpuelemwise__arg.rst +++ /dev/null @@ -1,4 +0,0 @@ -Struct gpuelemwise_arg -====================== - -.. doxygenstruct:: gpuelemwise_arg diff --git a/doc/c_api/struct/structnccl_unique_id.rst b/doc/c_api/struct/structnccl_unique_id.rst deleted file mode 100644 index 4367a36a22..0000000000 --- a/doc/c_api/struct/structnccl_unique_id.rst +++ /dev/null @@ -1,4 +0,0 @@ -Struct ncclUniqueId -=================== - -.. 
doxygenstruct:: ncclUniqueId diff --git a/doc/c_api/struct/structstrb.rst b/doc/c_api/struct/structstrb.rst deleted file mode 100644 index 25d37a4bb0..0000000000 --- a/doc/c_api/struct/structstrb.rst +++ /dev/null @@ -1,4 +0,0 @@ -Struct strb -=========== - -.. doxygenstruct:: strb diff --git a/doc/c_api/structlist.rst b/doc/c_api/structlist.rst deleted file mode 100644 index fc53f2fc04..0000000000 --- a/doc/c_api/structlist.rst +++ /dev/null @@ -1,7 +0,0 @@ -Struct list -=========== - -.. toctree:: - :glob: - - struct/* From d20f98f825b938960f07c9bb94ba8fe6d96bdd73 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 21 Feb 2017 17:56:21 -0500 Subject: [PATCH 289/597] Hide one copy of all the debug stuff. --- src/private_opencl.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/private_opencl.h b/src/private_opencl.h index a0d2620917..e47c5fc3c6 100644 --- a/src/private_opencl.h +++ b/src/private_opencl.h @@ -5,6 +5,7 @@ #include "loaders/libopencl.h" +/** @cond NEVER */ #ifdef DEBUG #include @@ -29,6 +30,7 @@ #define ASSERT_KER(k) #define CLEAR(o) #endif +/** @endcond */ const char *cl_error_string(cl_int); From bda07f583553436203f1699374a49f28cde4acc6 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 21 Feb 2017 17:58:48 -0500 Subject: [PATCH 290/597] Hide the static asserts since it confuses doxygen. --- src/private_cuda.h | 2 ++ src/private_opencl.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/private_cuda.h b/src/private_cuda.h index 5e6ba7a46e..8903f8a5a8 100644 --- a/src/private_cuda.h +++ b/src/private_cuda.h @@ -80,8 +80,10 @@ typedef struct _cuda_context { unsigned char minor; } cuda_context; +/** @cond NEVER */ STATIC_ASSERT(sizeof(cuda_context) <= sizeof(gpucontext), sizeof_struct_gpucontext_cuda); +/** @endcond */ /* * About freeblocks. 
diff --git a/src/private_opencl.h b/src/private_opencl.h index e47c5fc3c6..b3aed92a25 100644 --- a/src/private_opencl.h +++ b/src/private_opencl.h @@ -74,7 +74,9 @@ typedef struct _cl_ctx { char *preamble; } cl_ctx; +/** @cond NEVER */ STATIC_ASSERT(sizeof(cl_ctx) <= sizeof(gpucontext), sizeof_struct_gpucontext_cl); +/** @endcond */ struct _gpudata { cl_mem buf; From 56c67551611e9c18e19de29dd4ae56610db7eeb7 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 21 Feb 2017 18:02:24 -0500 Subject: [PATCH 291/597] Fix list in install docs. --- doc/installation.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/installation.rst b/doc/installation.rst index a9e0bb5ffd..aac4ff716c 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -33,12 +33,14 @@ of this available, but you won't be able to use associated functionality. * For CUDA: + - CUDA (cuda_) version 7.0 or more, with the appropriate driver - (optional) NCCL (nccl_) for the collectives interface * For OpenCL: - - OpenCL version 1.1 or more - - (optional) clBLAS (_clblas) or CLBlast (_clblast) for blas functionality + + - OpenCL version 1.1 or more + - (optional) clBLAS (clblas_) or CLBlast (clblast_) for blas functionality Download -------- From 4a65f8db432b8de452b7f30f369470173e519cbf Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 21 Feb 2017 18:11:43 -0500 Subject: [PATCH 292/597] Rework the python api doc structure. 
--- doc/index.rst | 2 +- doc/pyapi.rst | 20 ------------ doc/pyapi/pygpu.rst | 74 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 21 deletions(-) delete mode 100644 doc/pyapi.rst create mode 100644 doc/pyapi/pygpu.rst diff --git a/doc/index.rst b/doc/index.rst index 79496a86cd..86524aec29 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -13,7 +13,7 @@ Contents: why installation - pyapi + pyapi/pygpu c_api Indices and tables diff --git a/doc/pyapi.rst b/doc/pyapi.rst deleted file mode 100644 index 8e3f5b3c44..0000000000 --- a/doc/pyapi.rst +++ /dev/null @@ -1,20 +0,0 @@ -Python module reference -======================= - -.. automodule:: pygpu - :members: - - .. automodule:: pygpu.gpuarray - :members: - - .. automodule:: pygpu.elemwise - :members: ElemwiseKernel - - .. automodule:: pygpu.reduction - :members: ReductionKernel - - .. automodule:: pygpu._array - :members: - - .. automodule:: pygpu.collectives - :members: diff --git a/doc/pyapi/pygpu.rst b/doc/pyapi/pygpu.rst new file mode 100644 index 0000000000..5097e8ea3e --- /dev/null +++ b/doc/pyapi/pygpu.rst @@ -0,0 +1,74 @@ +pygpu package +============= + +pygpu.gpuarray module +--------------------- + +.. automodule:: pygpu.gpuarray + :members: + :undoc-members: + :show-inheritance: + +pygpu.elemwise module +--------------------- + +.. automodule:: pygpu.elemwise + :members: + :undoc-members: + :show-inheritance: + +pygpu.operations module +----------------------- + +.. automodule:: pygpu.operations + :members: + :undoc-members: + :show-inheritance: + +pygpu.reduction module +---------------------- + +.. automodule:: pygpu.reduction + :members: + :undoc-members: + :show-inheritance: + +pygpu.blas module +----------------- + +.. automodule:: pygpu.blas + :members: + :undoc-members: + :show-inheritance: + +pygpu.collectives module +------------------------ + +.. 
automodule:: pygpu.collectives + :members: + :undoc-members: + :show-inheritance: + +pygpu.dtypes module +------------------- + +.. automodule:: pygpu.dtypes + :members: + :undoc-members: + :show-inheritance: + +pygpu.tools module +------------------ + +.. automodule:: pygpu.tools + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: pygpu + :members: + :undoc-members: + :show-inheritance: From daddee9e697e00e79dcbab3b4977da4b2cc79301 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 14 Feb 2017 17:09:29 -0500 Subject: [PATCH 293/597] Switch theme and fix some minor problems. --- doc/_static/fix_rtd.css | 11 ++++ doc/_static/version_switch.js | 104 ++++++++++++++++++++++++++++++++++ doc/_templates/layout.html | 39 +++++++++++++ doc/conf.py | 73 +++++++++++++++++++----- doc/installation.rst | 7 ++- pygpu/collectives.pyx | 48 ++++++++++------ 6 files changed, 250 insertions(+), 32 deletions(-) create mode 100644 doc/_static/fix_rtd.css create mode 100644 doc/_static/version_switch.js create mode 100644 doc/_templates/layout.html diff --git a/doc/_static/fix_rtd.css b/doc/_static/fix_rtd.css new file mode 100644 index 0000000000..7ba86db2ad --- /dev/null +++ b/doc/_static/fix_rtd.css @@ -0,0 +1,11 @@ +/* work around https://github.com/snide/sphinx_rtd_theme/issues/149 */ +.rst-content table.field-list .field-body { + padding-top: 8px; +} +.rst-versions-up { + cursor: pointer; + display: inline; +} +.wy-side-nav-search>div.version { + color: white; +} \ No newline at end of file diff --git a/doc/_static/version_switch.js b/doc/_static/version_switch.js new file mode 100644 index 0000000000..6c5c338381 --- /dev/null +++ b/doc/_static/version_switch.js @@ -0,0 +1,104 @@ +// Create version selector for documentation top bar. +(function() { + + var url = window.location.href; + var base_dir = 'libgpuarray'; // directory containing doc + // Default theano version: release and development. 
+ var versions_dir = {"release": "libgpuarray", "dev": "libgpuarray_versions/dev"}; + + // If doc is run localy + if (url.startsWith('file')) { + base_dir = 'html'; + versions_dir = {"local":"html", "test":"test"}; + } + + var root_url = url.substring(0, url.search('/' + base_dir)) + '/'; + + // Regular expression to find theano version directory in URL. + var version_regex = new RegExp("\\/" + base_dir + "(_versions\\/)?([_a-zA-Z.0-9]*)\\/"); + + // Get current version + var current_version = url.match(version_regex)[0] + current_version = current_version.substring(1, current_version.length - 1) + + // Add current version in case versions.json is unavailable + if (current_version != "libgpuarray" && current_version != "html") { + ver = current_version.replace("libgpuarray_versions/", "") + versions_dir[ver] = current_version + } + + function build_vswitch() { + // Build HTML string for version selector, based on ReadTheDocs theme's versions.html + + var vlabel = current_version.replace("libgpuarray_versions/", ""); + if (vlabel == 'theano') { + vlabel = 'release'; + } + var vswitch = ['
']; + vswitch.push(''); + vswitch.push(''); + vswitch.push('v: ', vlabel, ' '); + vswitch.push(''); + vswitch.push(''); + + vswitch.push('
'); + + vswitch.push('
'); + vswitch.push('
Versions
'); + for (var version in versions_dir) { + var new_url = url.replace(url.match(version_regex)[0], '/' + versions_dir[version] + '/'); + vswitch.push('
', version, '
'); + } + vswitch.push('
'); + + vswitch.push('
'); + vswitch.push('
Downloads
'); + var pdf_url = root_url + current_version + "/libgpuarray.pdf" + vswitch.push('
', 'PDF', '
'); + vswitch.push('
'); + + vswitch.push('
'); + vswitch.push('
On GitHub
'); + var git_master = "https://github.com/Theano/libgpuarray" + vswitch.push('
', 'Fork me', '
'); + vswitch.push('
'); + + vswitch.push('
'); + vswitch.push('
'); + return vswitch.join(''); + } + + function build_vswitch_up() { + // Build HTML string for version selector, based on ReadTheDocs theme's versions.html + + var vlabel = current_version.replace("libgpuarray_versions/", ""); + if (vlabel == 'libgpuarray') { + vlabel = 'release'; + } + var vswitch = ['
']; + vswitch.push(''); + vswitch.push(vlabel); + vswitch.push(''); + vswitch.push(''); + vswitch.push('
'); + return vswitch.join(''); + } + +// Create HTML for version switcher and assign to placeholder in layout.html. + $(document).ready(function() { + // Build default switcher + $('.version_switcher_placeholder').html(build_vswitch()); + $('.version_switcher_placeholder_up').html(build_vswitch_up()); + + // Check server for other doc versions and update switcher. + if (url.startsWith('http')) { + $.getJSON(root_url + 'libgpuarray_versions/versions.json', function(data){ + $.each(data, function(version, dir) { + versions_dir[version] = dir; + }); + $('.version_switcher_placeholder').html(build_vswitch()); + $('.version_switcher_placeholder_up').html(build_vswitch_up()); + }); + } + }); +})(); diff --git a/doc/_templates/layout.html b/doc/_templates/layout.html new file mode 100644 index 0000000000..2f40797e03 --- /dev/null +++ b/doc/_templates/layout.html @@ -0,0 +1,39 @@ +{% extends "!layout.html" %} + +{% block footer %} +{{ super() }} + + + + + + +{% endblock %} diff --git a/doc/conf.py b/doc/conf.py index 0ff1a1b1a8..0a63642eb5 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -25,8 +25,16 @@ # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.todo', - 'sphinx.ext.ifconfig', 'breathe'] +extensions = ['sphinx.ext.autodoc', + 'sphinx.ext.doctest', + 'sphinx.ext.todo', + 'sphinx.ext.napoleon', +# 'sphinx.ext.linkcode', + 'breathe'] + +todo_include_todos = True +napoleon_google_docstring = False +napoleon_include_special_with_doc = False # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -42,7 +50,7 @@ # General information about the project. 
project = u'gpuarray' -copyright = u'2012, Arnaud Bergeron' +copyright = u'2012--2017, Arnaud Bergeron' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -61,21 +69,21 @@ # non-false value, then it is used: #today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build'] +exclude_dirs = ['_build', 'scripts'] # The reST default role (used for this markup: `text`) to use for all documents. #default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. @@ -87,12 +95,20 @@ # A list of ignored prefixes for module index sorting. #modindex_common_prefix = [] - # -- Options for HTML output --------------------------------------------------- -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -html_theme = 'default' + +if os.environ.get('READTHEDOCS') != 'True': + try: + import sphinx_rtd_theme + except ImportError: + pass + else: + html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + html_theme = 'sphinx_rtd_theme' + +def setup(app): + app.add_stylesheet('fix_rtd.css') # Theme options are theme-specific and customize the look and feel of a theme # further. 
For a list of options available for each theme, see the @@ -125,11 +141,11 @@ # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +html_use_smartypants = True # Custom sidebar templates, maps document names to template names. #html_sidebars = {} @@ -168,6 +184,37 @@ htmlhelp_basename = 'gpuarraydoc' +# Options for the linkcode extension +# ---------------------------------- +# Resolve function +# This function is used to populate the (source) links in the API + +# XXX: This is broken for now since it doesn't work for cython modules +def linkcode_resolve(domain, info): + def find_source(): + obj = sys.modules[info['module']] + for part in info['fullname'].split('.'): + obj = getattr(obj, part) + import inspect + import os + fn = inspect.getsourcefile(obj) + fn = os.path.relpath(fn, start=os.path.dirname(pygpu.__file__)) + source, lineno = inspect.getsourcelines(obj) + return fn, lineno, lineno + len(source) - 1 + + if domain != 'py' or not info['module']: + return None + try: + filename = 'libgpuarray/pygpu/%s#L%d-L%d' % find_source() + except Exception: + filename = info['module'].replace('.', '/') + '.py' + import subprocess + tag = subprocess.Popen(['git', 'rev-parse', 'HEAD'], + stdout=subprocess.PIPE, + universal_newlines=True).communicate()[0][:-1] + return "https://github.com/Theano/libgpuarray/blob/%s/%s" % (tag, filename) + + # -- Options for LaTeX output -------------------------------------------------- latex_elements = { diff --git a/doc/installation.rst b/doc/installation.rst index aac4ff716c..6db72879c8 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -28,8 +28,8 @@ Run Requirements ---------------- No matter what was available at build time, this library comes with 
-dynamic loaders for the following library. You don't need to have any -of this available, but you won't be able to use associated +dynamic loaders for the following libraries. You don't need to have +any of this available, but you won't be able to use associated functionality. * For CUDA: @@ -137,7 +137,8 @@ due to a restriction of the new SIP feature about loading libraries. It appears that on some versions, /usr/local is not in the default compiler paths so you might need to add ``-L /usr/local/lib -I -/usr/local/include`` to the ``setup.py build`` command. +/usr/local/include`` to the ``setup.py build`` command or export the +paths like for a custom path install. Windows-specific instructions diff --git a/pygpu/collectives.pyx b/pygpu/collectives.pyx index d41d4cebeb..0a6263ec09 100644 --- a/pygpu/collectives.pyx +++ b/pygpu/collectives.pyx @@ -16,7 +16,9 @@ from pygpu.gpuarray import GpuArrayException COMM_ID_BYTES = GA_COMM_ID_BYTES cdef class GpuCommCliqueId: - """Represents a unique id shared among :ref:`GpuComm` communicators which + """GpuCommCliqueId(context=None, comm_id=None) + + Represents a unique id shared among :ref:`GpuComm` communicators which participate in a multi-gpu clique. Parameters @@ -115,7 +117,9 @@ cdef class GpuCommCliqueId: cdef class GpuComm: - """Represents a communicator which participates in a multi-gpu clique. + """GpuComm(cid, ndev, rank) + + Represents a communicator which participates in a multi-gpu clique. It is used to invoke collective operations to gpus inside its clique. @@ -156,8 +160,11 @@ cdef class GpuComm: comm_get_rank(self, &gpurank) return gpurank - def reduce(self, GpuArray src not None, op, GpuArray dest=None, int root=-1): - """Reduce collective operation for ranks in a communicator world. + def reduce(self, GpuArray src not None, op, GpuArray dest=None, + int root=-1): + """reduce(self, src, op, dest=None, root=-1) + + Reduce collective operation for ranks in a communicator world. 
Parameters ---------- @@ -172,10 +179,11 @@ cdef class GpuComm: Notes ----- - * `root` is necessary when invoking from a non-root rank. Root caller - does not need to provide `root` argument. - * Not providing `dest` argument for a root caller will result in creating - a new compatible :ref:`GpuArray` and returning result in it. + * `root` is necessary when invoking from a non-root rank. Root + caller does not need to provide `root` argument. + * Not providing `dest` argument for a root caller will result + in creating a new compatible :ref:`GpuArray` and returning + result in it. """ cdef int srank @@ -193,7 +201,9 @@ cdef class GpuComm: comm_reduce(self, src, dest, to_reduce_opcode(op), root) def all_reduce(self, GpuArray src not None, op, GpuArray dest=None): - """AllReduce collective operation for ranks in a communicator world. + """all_reduce(self, src, op, dest=None) + + AllReduce collective operation for ranks in a communicator world. Parameters ---------- @@ -207,7 +217,7 @@ cdef class GpuComm: Notes ----- * Not providing `dest` argument for a caller will result in creating - a new compatible :ref:`GpuArray` and returning result in it. + a new compatible :ref:`GpuArray` and returning result in it. """ if dest is None: @@ -215,7 +225,9 @@ cdef class GpuComm: comm_all_reduce(self, src, dest, to_reduce_opcode(op)) def reduce_scatter(self, GpuArray src not None, op, GpuArray dest=None): - """ReduceScatter collective operation for ranks in a communicator world. + """reduce_scatter(self, src, op, dest=None) + + ReduceScatter collective operation for ranks in a communicator world. Parameters ---------- @@ -229,7 +241,7 @@ cdef class GpuComm: Notes ----- * Not providing `dest` argument for a caller will result in creating - a new compatible :ref:`GpuArray` and returning result in it. + a new compatible :ref:`GpuArray` and returning result in it. 
""" if dest is None: @@ -237,7 +249,9 @@ cdef class GpuComm: comm_reduce_scatter(self, src, dest, to_reduce_opcode(op)) def broadcast(self, GpuArray array not None, int root=-1): - """Broadcast collective operation for ranks in a communicator world. + """broadcast(self, array, root=-1) + + Broadcast collective operation for ranks in a communicator world. Parameters ---------- @@ -249,7 +263,7 @@ cdef class GpuComm: Notes ----- * `root` is necessary when invoking from a non-root rank. Root caller - does not need to provide `root` argument. + does not need to provide `root` argument. """ if root == -1: @@ -258,7 +272,9 @@ cdef class GpuComm: def all_gather(self, GpuArray src not None, GpuArray dest=None, unsigned int nd_up=1): - """AllGather collective operation for ranks in a communicator world. + """all_gather(self, src, dest=None, nd_up=1) + + AllGather collective operation for ranks in a communicator world. Parameters ---------- @@ -274,7 +290,7 @@ cdef class GpuComm: Notes ----- * Providing `nd_up` == 0 means that gathered arrays will be appended to - the dimension with the largest stride. + the dimension with the largest stride. """ if dest is None: From 3b6d472d357fa09935178d6302beca9b375e4ab6 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 22 Feb 2017 13:37:18 -0500 Subject: [PATCH 294/597] Fix up the last C documentation warts and stop generating doxygen html output. --- doc/Doxyfile | 2 +- src/gpuarray/buffer.h | 27 ++++---- src/gpuarray/buffer_collectives.h | 101 +++++++++++++++--------------- src/gpuarray/collectives.h | 23 ++++--- 4 files changed, 78 insertions(+), 75 deletions(-) diff --git a/doc/Doxyfile b/doc/Doxyfile index f257e90bf0..2de3d2331a 100644 --- a/doc/Doxyfile +++ b/doc/Doxyfile @@ -865,7 +865,7 @@ IGNORE_PREFIX = # If the GENERATE_HTML tag is set to YES (the default) Doxygen will # generate HTML output. -GENERATE_HTML = YES +GENERATE_HTML = NO # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. 
# If a relative path is entered the value of OUTPUT_DIRECTORY will be diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index df00096cd1..be314b3e8e 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -44,23 +44,28 @@ struct _gpukernel; typedef struct _gpukernel gpukernel; /** - * \brief Gets information about the number of available platforms for the + * Gets information about the number of available platforms for the * backend specified in `name`. - * \param name [const char*] the backend name - * \param platcount [unsigned int*] will contain number of compatible platforms in host - * \return int GA_NO_ERROR, if success + * + * \param name the backend name + * \param platcount will contain number of compatible + * platforms in host + * + * \return #GA_NO_ERROR, if success */ GPUARRAY_PUBLIC int gpu_get_platform_count(const char* name, unsigned int* platcount); /** - * \brief Gets information about the number of compatible devices on a specific - * host's `platform` for the backend specified in `name`. - * \param name [const char*] the backend name - * \param platform [unsigned int] number for a platform in host - * \param devcount [unsigned int*] will contain number of compatible devices in - * `platform` - * \return int GA_NO_ERROR, if success + * Gets information about the number of compatible devices on a + * specific host's `platform` for the backend specified in `name`. 
+ * + * \param name the backend name + * \param platform number for a platform in host + * \param devcount will contain number of compatible devices in + * `platform` + * + * \return #GA_NO_ERROR, if success */ GPUARRAY_PUBLIC int gpu_get_device_count(const char* name, unsigned int platform, diff --git a/src/gpuarray/buffer_collectives.h b/src/gpuarray/buffer_collectives.h index ff825a3209..f5dfef35d8 100644 --- a/src/gpuarray/buffer_collectives.h +++ b/src/gpuarray/buffer_collectives.h @@ -22,10 +22,12 @@ struct _gpucomm; typedef struct _gpucomm gpucomm; -/** - * Enum for reduce ops of gpucomm +/* + * \enum _gpucomm_reduce_ops + * + * \brief Reduction operations */ -enum _gpucomm_reduce_ops { +enum gpucomm_reduce_ops { GA_SUM = 0, //!< to sum (elemwise) arrays across ranks GA_PROD = 1, //!< to multiply (elemwise) arrays across ranks GA_MAX = 2, //!< to find max (elemwise) of arrays across ranks @@ -42,23 +44,21 @@ typedef struct _gpucommCliqueId { } gpucommCliqueId; /** - * \brief Create a new gpu communicator instance. + * Create a new gpu communicator instance. + * + * This must be called in parallel by all participants in the same + * world. The call will block until all participants have joined in. + * The world is defined by a shared comm_id. * * \param comm pointer to get a new gpu communicator - * \param ctx gpu context in which `comm` will be used (contains - * device information) + * \param ctx gpu context in which `comm` will be used + * (contains device information) * \param comm_id id unique to communicators consisting a world * \param ndev number of communicators/devices participating in the world - * \param rank user-defined rank, from 0 to `ndev`-1, of `comm` in the - * world - * - * \note `rank` is defined to be unique for each new `comm` - * participating in the same world. + * \param rank user-defined rank, from 0 to `ndev`-1. Must be unique + * for the world. 
* - * \note Must be called in parallel by all separate new `comm`, which - * will consist a new world (failing will lead to deadlock). - * - * \return int error code, \ref GA_NO_ERROR if success + * \returns error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_new(gpucomm** comm, gpucontext* ctx, gpucommCliqueId comm_id, int ndev, int rank); @@ -75,7 +75,7 @@ GPUARRAY_PUBLIC void gpucomm_free(gpucomm* comm); * * \param ctx gpu context in which communicator was used * - * \return const char* useful backend error message + * \returns useful backend error message */ GPUARRAY_PUBLIC const char* gpucomm_error(gpucontext* ctx); @@ -84,42 +84,41 @@ GPUARRAY_PUBLIC const char* gpucomm_error(gpucontext* ctx); * * \param comm gpu communicator * - * \return gpucontext* gpu context + * \returns gpu context */ GPUARRAY_PUBLIC gpucontext* gpucomm_context(gpucomm* comm); /** - * Creates a unique `comm_id` to be shared in a world of communicators. + * Creates a unique `comm_id`. + * + * The id is guarenteed to be unique in the same host, but not + * necessarily across hosts. * * \param ctx gpu context * \param comm_id pointer to instance containing id * - * \note Id is guaranteed to be unique across callers in a single host. - * - * \return int error code, \ref GA_NO_ERROR if success + * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_gen_clique_id(gpucontext* ctx, gpucommCliqueId* comm_id); /** - * Returns total number of device/communicators participating in - * `comm`'s world. + * Returns total number of devices participating in `comm`'s world. 
* * \param comm gpu communicator - * \param gpucount pointer to number of gpus in `comm`'s world + * \param devcount pointer to store the number of devices * - * \return int error code, \ref GA_NO_ERROR if success + * \return error code or #GA_NO_ERROR if success */ -GPUARRAY_PUBLIC int gpucomm_get_count(gpucomm* comm, int* gpucount); +GPUARRAY_PUBLIC int gpucomm_get_count(gpucomm* comm, int* devcount); /** - * Returns rank of `comm` inside its world as defined by user upon - * creation. + * Returns the rank of `comm` inside its world. * * \param comm gpu communicator - * \param rank pointer to `comm`'s rank + * \param rank pointer to store the rank * - * \return int error code, \ref GA_NO_ERROR if success + * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_get_rank(gpucomm* comm, int* rank); @@ -129,22 +128,22 @@ GPUARRAY_PUBLIC int gpucomm_get_rank(gpucomm* comm, int* rank); * * \param src data in device's buffer to be reduced * \param offsrc memory offset after which data is saved in buffer - * `src` + * `src` * \param dest data in device's buffer to collect result * \param offdest memory offset after which data will be saved in - * buffer `dest` + * buffer `dest` * \param count number of elements to be reduced in each array - * \param typecode code for elements' data type, see \ref GPUARRAY_TYPES - * \param opcode reduce operation code, see \ref _gpucomm_reduce_ops + * \param typecode elements' data type + * \param opcode reduce operation code * \param root rank in `comm` which will collect result * \param comm gpu communicator * * \note Non root ranks can call this, using a NULL `dest`. In this - * case, `offdest` will not be used. + * case, `offdest` will not be used. * * \note Must be called separately for each rank in `comm`. 
* - * \return int error code, \ref GA_NO_ERROR if success + * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_reduce(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest, size_t count, int typecode, @@ -160,18 +159,18 @@ GPUARRAY_PUBLIC int gpucomm_reduce(gpudata* src, size_t offsrc, gpudata* dest, * * \param src data in device's buffer to be reduced * \param offsrc memory offset after which data is saved in buffer - * `src` + * `src` * \param dest data in device's buffer to collect result * \param offdest memory offset after which data will be saved in - * buffer `dest` + * buffer `dest` * \param count number of elements to be reduced in each array - * \param typecode code for elements' data type, see \ref GPUARRAY_TYPES - * \param opcode reduce operation code, see \ref _gpucomm_reduce_ops + * \param typecode elements' data type + * \param opcode reduce operation code (see #gpucomm_reduce_ops) * \param comm gpu communicator * * \note Must be called separately for each rank in `comm`. 
* - * \return int error code, \ref GA_NO_ERROR if success + * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_all_reduce(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest, @@ -188,18 +187,18 @@ GPUARRAY_PUBLIC int gpucomm_all_reduce(gpudata* src, size_t offsrc, * * \param src data in device's buffer to be reduced * \param offsrc memory offset after which data is saved in buffer - * `src` + * `src` * \param dest data in device's buffer to collect scattered result * \param offdest memory offset after which data will be saved in - * buffer `dest` + * buffer `dest` * \param count number of elements to be contained in result `dest` - * \param typecode code for elements' data type, see \ref GPUARRAY_TYPES - * \param opcode reduce operation code, see \ref _gpucomm_reduce_ops + * \param typecode elements' data type + * \param opcode reduce operation code (see #gpucomm_reduce_ops) * \param comm gpu communicator * * \note Must be called separately for each rank in `comm`. * - * \return int error code, \ref GA_NO_ERROR if success + * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_reduce_scatter(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest, @@ -215,13 +214,13 @@ GPUARRAY_PUBLIC int gpucomm_reduce_scatter(gpudata* src, size_t offsrc, * \param array data in device's buffer to get copied or be received * \param offset memory offset after which data in `array` begin * \param count number of elements to be contained in `array` - * \param typecode code for elements' data type, see \ref GPUARRAY_TYPES + * \param typecode elements' data type * \param root rank in `comm` which broadcasts its array * \param comm gpu communicator * * \note Must be called separately for each rank in `comm`. 
* - * \return int error code, \ref GA_NO_ERROR if success + * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_broadcast(gpudata* array, size_t offset, size_t count, int typecode, int root, @@ -238,13 +237,13 @@ GPUARRAY_PUBLIC int gpucomm_broadcast(gpudata* array, size_t offset, * \param dest data in device's buffer to gather from all ranks * \param offdest memory offset after which data in `dest` begin * \param count number of elements to be gathered from each rank in - * `src` - * \param typecode code for elements' data type, see \ref GPUARRAY_TYPES + * `src` + * \param typecode elements' data type * \param comm gpu communicator * * \note Must be called separately for each rank in `comm`. * - * \return int error code, \ref GA_NO_ERROR if success + * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int gpucomm_all_gather(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest, diff --git a/src/gpuarray/collectives.h b/src/gpuarray/collectives.h index fcc7e65919..e1b776b68f 100644 --- a/src/gpuarray/collectives.h +++ b/src/gpuarray/collectives.h @@ -21,14 +21,14 @@ extern "C" { * communicator world. * * \param src array to be reduced - * \param opcode reduce operation code, see \ref _gpucomm_reduce_ops + * \param opcode reduce operation code, see #gpucomm_reduce_ops * \param root rank in `comm` which will collect result * \param comm gpu communicator * * \note Root rank of reduce operation must call GpuArray_reduce(). * \note Must be called separately for each rank in `comm`, except root rank. 
* - * \return int error code, \ref GA_NO_ERROR if success + * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int GpuArray_reduce_from(const GpuArray* src, int opcode, int root, gpucomm* comm); @@ -38,18 +38,17 @@ GPUARRAY_PUBLIC int GpuArray_reduce_from(const GpuArray* src, int opcode, * * \param src array to be reduced * \param dest array to collect reduce operation result - * \param opcode reduce operation code, see \ref _gpucomm_reduce_ops + * \param opcode reduce operation code, see #gpucomm_reduce_ops * \param root rank in `comm` which will collect result * \param comm gpu communicator * * \note Can be used by root and non root ranks alike. * * \note Non root ranks can call this, using a NULL `dest`. - * * \note Must be called separately for each rank in `comm` (non root - * can call GpuArray_reduce_from() instead). + * can call GpuArray_reduce_from() instead). * - * \return int error code, \ref GA_NO_ERROR if success + * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int GpuArray_reduce(const GpuArray* src, GpuArray* dest, int opcode, int root, gpucomm* comm); @@ -62,12 +61,12 @@ GPUARRAY_PUBLIC int GpuArray_reduce(const GpuArray* src, GpuArray* dest, * * \param src array to be reduced * \param dest array to collect reduce operation result - * \param opcode reduce operation code, see \ref _gpucomm_reduce_ops + * \param opcode reduce operation code, see #gpucomm_reduce_ops * \param comm gpu communicator * * \note Must be called separately for each rank in `comm`. 
* - * \return int error code, \ref GA_NO_ERROR if success + * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int GpuArray_all_reduce(const GpuArray* src, GpuArray* dest, int opcode, gpucomm* comm); @@ -81,12 +80,12 @@ GPUARRAY_PUBLIC int GpuArray_all_reduce(const GpuArray* src, GpuArray* dest, * * \param src array to be reduced * \param dest array to collect reduce operation scattered result - * \param opcode reduce operation code, see \ref _gpucomm_reduce_ops + * \param opcode reduce operation code, see #gpucomm_reduce_ops * \param comm gpu communicator * * \note Must be called separately for each rank in `comm`. * - * \return int error code, \ref GA_NO_ERROR if success + * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int GpuArray_reduce_scatter(const GpuArray* src, GpuArray* dest, int opcode, gpucomm* comm); @@ -102,7 +101,7 @@ GPUARRAY_PUBLIC int GpuArray_reduce_scatter(const GpuArray* src, GpuArray* dest, * * \note Must be called separately for each rank in `comm`. * - * \return int error code, \ref GA_NO_ERROR if success + * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int GpuArray_broadcast(GpuArray* array, int root, gpucomm* comm); @@ -120,7 +119,7 @@ GPUARRAY_PUBLIC int GpuArray_broadcast(GpuArray* array, int root, * * \note Must be called separately for each rank in `comm`. * - * \return int error code, \ref GA_NO_ERROR if success + * \return error code or #GA_NO_ERROR if success */ GPUARRAY_PUBLIC int GpuArray_all_gather(const GpuArray* src, GpuArray* dest, gpucomm* comm); From 1edddabf3001bf1822896dacb6a8801fa0cc2917 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 22 Feb 2017 13:45:40 -0500 Subject: [PATCH 295/597] Add function signatures for blas.pyx. 
--- pygpu/blas.pyx | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pygpu/blas.pyx b/pygpu/blas.pyx index 14d90c0f76..8965f43420 100644 --- a/pygpu/blas.pyx +++ b/pygpu/blas.pyx @@ -54,6 +54,8 @@ cdef api int pygpu_blas_rger(double alpha, GpuArray X, GpuArray Y, GpuArray A, def dot(GpuArray X, GpuArray Y, GpuArray Z=None, overwrite_z=False): + """dot(X, Y, Z=None, overwrite_z=False) + """ if Z is None: Z = pygpu_empty(0, NULL, X.typecode, GA_ANY_ORDER, X.context, None) overwrite_z = True @@ -65,6 +67,8 @@ def dot(GpuArray X, GpuArray Y, GpuArray Z=None, overwrite_z=False): def gemv(double alpha, GpuArray A, GpuArray X, double beta=0.0, GpuArray Y=None, trans_a=False, overwrite_y=False): + """gemv(alpha, A, X, beta=0.0, Y=None, trans_a=False, overwrite_y=False) + """ cdef cb_transpose transA cdef size_t Yshp @@ -93,6 +97,8 @@ def gemv(double alpha, GpuArray A, GpuArray X, double beta=0.0, def gemm(double alpha, GpuArray A, GpuArray B, double beta, GpuArray C=None, trans_a=False, trans_b=False, overwrite_c=False): + """gemm(alpha, A, B, beta, C=None, trans_a=False, trans_b=False, overwrite_c=False) + """ cdef cb_transpose transA cdef cb_transpose transB cdef size_t[2] Cshp @@ -132,6 +138,8 @@ def gemm(double alpha, GpuArray A, GpuArray B, double beta, GpuArray C=None, def ger(double alpha, GpuArray X, GpuArray Y, GpuArray A=None, overwrite_a=False): + """ger(alpha, X, Y, A=None, overwrite_a=False) + """ cdef size_t[2] Ashp if A is None: From b31c87f553549f961c81e096eca6ac880bec0948 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 22 Feb 2017 14:39:55 -0500 Subject: [PATCH 296/597] Don't show base classes in the docs. --- doc/pyapi/pygpu.rst | 9 --------- 1 file changed, 9 deletions(-) diff --git a/doc/pyapi/pygpu.rst b/doc/pyapi/pygpu.rst index 5097e8ea3e..2e30064900 100644 --- a/doc/pyapi/pygpu.rst +++ b/doc/pyapi/pygpu.rst @@ -7,7 +7,6 @@ pygpu.gpuarray module .. 
automodule:: pygpu.gpuarray :members: :undoc-members: - :show-inheritance: pygpu.elemwise module --------------------- @@ -15,7 +14,6 @@ pygpu.elemwise module .. automodule:: pygpu.elemwise :members: :undoc-members: - :show-inheritance: pygpu.operations module ----------------------- @@ -23,7 +21,6 @@ pygpu.operations module .. automodule:: pygpu.operations :members: :undoc-members: - :show-inheritance: pygpu.reduction module ---------------------- @@ -31,7 +28,6 @@ pygpu.reduction module .. automodule:: pygpu.reduction :members: :undoc-members: - :show-inheritance: pygpu.blas module ----------------- @@ -39,7 +35,6 @@ pygpu.blas module .. automodule:: pygpu.blas :members: :undoc-members: - :show-inheritance: pygpu.collectives module ------------------------ @@ -47,7 +42,6 @@ pygpu.collectives module .. automodule:: pygpu.collectives :members: :undoc-members: - :show-inheritance: pygpu.dtypes module ------------------- @@ -55,7 +49,6 @@ pygpu.dtypes module .. automodule:: pygpu.dtypes :members: :undoc-members: - :show-inheritance: pygpu.tools module ------------------ @@ -63,7 +56,6 @@ pygpu.tools module .. automodule:: pygpu.tools :members: :undoc-members: - :show-inheritance: Module contents --------------- @@ -71,4 +63,3 @@ Module contents .. automodule:: pygpu :members: :undoc-members: - :show-inheritance: From f4949a98a0713a430044807c1fa465e0ba5ced80 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 22 Feb 2017 15:54:32 -0500 Subject: [PATCH 297/597] Fix references in argument types. 
--- pygpu/collectives.pyx | 60 +++++++++++++------------ pygpu/dtypes.py | 38 ++++------------ pygpu/gpuarray.pyx | 100 ++++++++++++++++++++++++++++++------------ pygpu/tools.py | 2 +- 4 files changed, 114 insertions(+), 86 deletions(-) diff --git a/pygpu/collectives.pyx b/pygpu/collectives.pyx index 0a6263ec09..302c99cd79 100644 --- a/pygpu/collectives.pyx +++ b/pygpu/collectives.pyx @@ -23,9 +23,9 @@ cdef class GpuCommCliqueId: Parameters ---------- - context: :ref:`GpuContext`, optional - Reference to which gpu this `GpuCommCliqueId` object belongs. - comm_id: bytes-like, optional + context: GpuContext + Reference to which gpu this GpuCommCliqueId object belongs. + comm_id: bytes Existing unique id to be passed in this object. """ @@ -125,13 +125,13 @@ cdef class GpuComm: Parameters ---------- - cid: :ref:`GpuCommCliqueId` + cid: GpuCommCliqueId Unique id shared among participating communicators. ndev: int Number of communicators inside the clique. rank: int - User-defined rank of this communicator inside the clique. It influences - order of collective operations. + User-defined rank of this communicator inside the clique. It + influences order of collective operations. """ def __dealloc__(self): @@ -162,17 +162,18 @@ cdef class GpuComm: def reduce(self, GpuArray src not None, op, GpuArray dest=None, int root=-1): - """reduce(self, src, op, dest=None, root=-1) + """ + reduce(self, src, op, dest=None, root=-1) Reduce collective operation for ranks in a communicator world. Parameters ---------- - src: :ref:`GpuArray` + src: GpuArray Array to be reduced. - op: string + op: str Key indicating operation type. - dest: :ref:`GpuArray`, optional + dest: GpuArray Array to collecti reduce operation result. root: int Rank in `GpuComm` which will collect result. 
@@ -201,17 +202,18 @@ cdef class GpuComm: comm_reduce(self, src, dest, to_reduce_opcode(op), root) def all_reduce(self, GpuArray src not None, op, GpuArray dest=None): - """all_reduce(self, src, op, dest=None) + """ + all_reduce(self, src, op, dest=None) AllReduce collective operation for ranks in a communicator world. Parameters ---------- - src: :ref:`GpuArray` + src: GpuArray Array to be reduced. - op: string + op: str Key indicating operation type. - dest: :ref:`GpuArray`, optional + dest: GpuArray Array to collect reduce operation result. Notes @@ -225,17 +227,18 @@ cdef class GpuComm: comm_all_reduce(self, src, dest, to_reduce_opcode(op)) def reduce_scatter(self, GpuArray src not None, op, GpuArray dest=None): - """reduce_scatter(self, src, op, dest=None) + """ + reduce_scatter(self, src, op, dest=None) ReduceScatter collective operation for ranks in a communicator world. Parameters ---------- - src: :ref:`GpuArray` + src: GpuArray Array to be reduced. - op: string + op: str Key indicating operation type. - dest: :ref:`GpuArray`, optional + dest: GpuArray Array to collect reduce operation scattered result. Notes @@ -249,13 +252,14 @@ cdef class GpuComm: comm_reduce_scatter(self, src, dest, to_reduce_opcode(op)) def broadcast(self, GpuArray array not None, int root=-1): - """broadcast(self, array, root=-1) + """ + broadcast(self, array, root=-1) Broadcast collective operation for ranks in a communicator world. Parameters ---------- - array: :ref:`GpuArray` + array: GpuArray Array to be reduced. root: int Rank in `GpuComm` which broadcasts its `array`. @@ -272,20 +276,22 @@ cdef class GpuComm: def all_gather(self, GpuArray src not None, GpuArray dest=None, unsigned int nd_up=1): - """all_gather(self, src, dest=None, nd_up=1) + """ + all_gather(self, src, dest=None, nd_up=1) AllGather collective operation for ranks in a communicator world. Parameters ---------- - src: :ref:`GpuArray` + src: GpuArray Array to be gathered. 
- dest: :ref:`GpuArray`, optional + dest: GpuArray Array to receive all gathered arrays from ranks in `GpuComm`. - nd_up: unsigned int - Used when creating result array. Indicates how many extra dimensions - user wants result to have. Default is 1, which means that the result - will store each rank's gathered array in one extra new dimension. + nd_up: int + Used when creating result array. Indicates how many extra + dimensions user wants result to have. Default is 1, which + means that the result will store each rank's gathered + array in one extra new dimension. Notes ----- diff --git a/pygpu/dtypes.py b/pygpu/dtypes.py index 1acfdad959..f10cfa5606 100644 --- a/pygpu/dtypes.py +++ b/pygpu/dtypes.py @@ -67,7 +67,7 @@ def register_dtype(dtype, c_names): NAME_TO_DTYPE[nm] = dtype -def _fill_dtype_registry(respect_windows): +def _fill_dtype_registry(): from sys import platform register_dtype(np.bool, ["ga_bool", "bool"]) @@ -80,28 +80,8 @@ def _fill_dtype_registry(respect_windows): register_dtype(np.int64, ["ga_long"]) register_dtype(np.uint64, ["ga_ulong"]) - is_64_bit = tuple.__itemsize__ * 8 == 64 - if is_64_bit: - if 'win32' in platform and respect_windows: - i64_name = "long long" - else: - i64_name = "long" - register_dtype(np.int64, [i64_name, "%s int" % i64_name, - "signed %s int" % i64_name, - "%s signed int" % i64_name]) - register_dtype(np.uint64, ["unsigned %s" % i64_name, - "unsigned %s int" % i64_name, - "%s unsigned int" % i64_name]) - - # According to this uintp may not have the same hash as uint32: - # http://projects.scipy.org/numpy/ticket/2017 - # Failing tests tell me this is the case for intp too. 
- if is_64_bit: - register_dtype(np.intp, ["ga_long"]) - register_dtype(np.uintp, ["ga_ulong"]) - else: - register_dtype(np.intp, ["ga_int"]) - register_dtype(np.uintp, ["ga_uint"]) + register_dtype(np.intp, ["ga_ssize"]) + register_dtype(np.uintp, ["ga_size"]) register_dtype(np.float32, ["ga_float", "float"]) register_dtype(np.float64, ["ga_double", "double"]) @@ -111,21 +91,19 @@ def _fill_dtype_registry(respect_windows): # {{{ dtype -> ctype -def dtype_to_ctype(dtype, with_fp_tex_hack=False): +def dtype_to_ctype(dtype): """ Return the C type that corresponds to `dtype`. - :param dtype: a numpy dtype + Parameters + ---------- + dtype: data type + a numpy dtype """ if dtype is None: raise ValueError("dtype may not be None") dtype = np.dtype(dtype) - if with_fp_tex_hack: - if dtype == np.float32: - return "fp_tex_float" - elif dtype == np.float64: - return "fp_tex_double" return gpuarray.dtype_to_ctype(dtype) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 8190016d12..04cb32dabc 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -10,10 +10,14 @@ from cpython cimport Py_INCREF, PyNumber_Index from cpython.object cimport Py_EQ, Py_NE def api_version(): + """api_version() + """ # (library version, module version) return (GPUARRAY_API_VERSION, 0) def abi_version(): + """abi_version() + """ major_version = GPUARRAY_ABI_VERSION / 1000 minor_version = GPUARRAY_ABI_VERSION % 1000 return (major_version, minor_version) @@ -526,7 +530,10 @@ cdef bint pygpu_GpuArray_Check(object o): return isinstance(o, GpuArray) def count_platforms(kind): - """Return number of host's platforms compatible with `kind`. + """ + count_platforms(kind) + + Return number of host's platforms compatible with `kind`. """ cdef unsigned int platcount cdef int err @@ -536,7 +543,10 @@ def count_platforms(kind): return platcount def count_devices(kind, unsigned int platform): - """Returns number of devices in host's `platform` compatible with `kind`. 
+ """ + count_devices(kind, platform) + + Returns number of devices in host's `platform` compatible with `kind`. """ cdef unsigned int devcount cdef int err @@ -886,22 +896,6 @@ def array(proto, dtype=None, copy=True, order=None, unsigned int ndmin=0, Create a GpuArray from existing data - :param obj: data to initialize the result - :type obj: array-like - :param dtype: data type of the result elements - :type dtype: string or numpy.dtype or int - :param copy: return a copy? - :type copy: bool - :param order: memory layout of the result - :type order: string - :param ndmin: minimum number of result dimensions - :type ndmin: unsigned int - :param context: allocation context - :type context: GpuContext - :param cls: result class (must inherit from GpuArray) - :type cls: class - :rtype: GpuArray - This function creates a new GpuArray from the data provided in `obj` except if `obj` is already a GpuArray and all the parameters match its properties and `copy` is False. @@ -911,6 +905,29 @@ def array(proto, dtype=None, copy=True, order=None, unsigned int ndmin=0, This function is similar to :meth:`numpy.array` except that it returns GpuArrays. + + Parameters + ---------- + obj: array-like + data to initialize the result + dtype: string or numpy.dtype or int + data type of the result elements + copy: bool + return a copy? + order: str + memory layout of the result + ndmin: int + minimum number of result dimensions + context: GpuContext + allocation context + cls: class + result class (must inherit from GpuArray) + + Returns + ------- + GpuArray + new array + """ return carray(proto, dtype, copy, order, ndmin, context, cls) @@ -979,11 +996,9 @@ cuda_exit = gpuarray_get_extension("cuda_exit") cdef class GpuContext: """ - Class that holds all the information pertaining to a context. - - .. code-block:: python + GpuContext(kind, devno, flags) - GpuContext(kind, devno, flags) + Class that holds all the information pertaining to a context. 
:param kind: module name for the context :type kind: string @@ -1428,6 +1443,9 @@ cdef int pygpu_transfer(GpuArray res, GpuArray a) except -1: return 0 def _split(GpuArray a, ind, unsigned int axis): + """ + _split(a, ind, axis) + """ cdef list r = [None] * (len(ind) + 1) cdef Py_ssize_t i if not axis < a.ga.nd: @@ -1464,6 +1482,9 @@ cdef GpuArray pygpu_concatenate(const _GpuArray **a, size_t n, def _concatenate(list al, unsigned int axis, int restype, object cls, GpuContext context): + """ + _concatenate(al, axis, restype, cls, context) + """ cdef Py_ssize_t i context = ensure_context(context) cdef const _GpuArray **als = PyMem_Malloc(sizeof(_GpuArray *) * len(al)) @@ -1486,6 +1507,8 @@ cuda_open_ipc_handle = Date: Wed, 22 Feb 2017 16:01:04 -0500 Subject: [PATCH 298/597] Fix references to classes. --- pygpu/collectives.pyx | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pygpu/collectives.pyx b/pygpu/collectives.pyx index 302c99cd79..39603e85d2 100644 --- a/pygpu/collectives.pyx +++ b/pygpu/collectives.pyx @@ -18,7 +18,7 @@ COMM_ID_BYTES = GA_COMM_ID_BYTES cdef class GpuCommCliqueId: """GpuCommCliqueId(context=None, comm_id=None) - Represents a unique id shared among :ref:`GpuComm` communicators which + Represents a unique id shared among :class:`GpuComm` communicators which participate in a multi-gpu clique. Parameters @@ -102,7 +102,7 @@ cdef class GpuCommCliqueId: raise RuntimeError, "Cannot pickle %s object" % self.__class__.__name__ property comm_id: - "Unique clique id to be used by each :ref:`GpuComm` in a group of devices" + "Unique clique id to be used by each :class:`GpuComm` in a group of devices" def __get__(self): cdef bytearray res res = self.c_comm_id.internal[:GA_COMM_ID_BYTES] @@ -174,16 +174,16 @@ cdef class GpuComm: op: str Key indicating operation type. dest: GpuArray - Array to collecti reduce operation result. + Array to collect reduce operation result. root: int - Rank in `GpuComm` which will collect result. 
+ Rank in GpuComm which will collect result. Notes ----- * `root` is necessary when invoking from a non-root rank. Root caller does not need to provide `root` argument. * Not providing `dest` argument for a root caller will result - in creating a new compatible :ref:`GpuArray` and returning + in creating a new compatible :class:`GpuArray` and returning result in it. """ @@ -219,7 +219,7 @@ cdef class GpuComm: Notes ----- * Not providing `dest` argument for a caller will result in creating - a new compatible :ref:`GpuArray` and returning result in it. + a new compatible :class:`GpuArray` and returning result in it. """ if dest is None: @@ -244,7 +244,7 @@ cdef class GpuComm: Notes ----- * Not providing `dest` argument for a caller will result in creating - a new compatible :ref:`GpuArray` and returning result in it. + a new compatible :class:`GpuArray` and returning result in it. """ if dest is None: From 1728db3cb6847a36cfce038e44144e367623f0b8 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 22 Feb 2017 16:12:59 -0500 Subject: [PATCH 299/597] Actually follow the removed false branch. 
--- pygpu/dtypes.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pygpu/dtypes.py b/pygpu/dtypes.py index f10cfa5606..7415f00bb4 100644 --- a/pygpu/dtypes.py +++ b/pygpu/dtypes.py @@ -78,10 +78,11 @@ def _fill_dtype_registry(): register_dtype(np.int32, ["ga_int", "int", "signed int"]) register_dtype(np.uint32, ["ga_uint", "unsigned", "unsigned int"]) - register_dtype(np.int64, ["ga_long"]) - register_dtype(np.uint64, ["ga_ulong"]) - register_dtype(np.intp, ["ga_ssize"]) - register_dtype(np.uintp, ["ga_size"]) + register_dtype(np.int64, ["ga_long", "long int", "signed long int", "long signed int"]) + register_dtype(np.uint64, ["ga_ulong", "unsigned long", "unsigned long int", "long unsigned int"]) + + register_dtype(np.intp, ["ga_ssize", "ssize_t"]) + register_dtype(np.uintp, ["ga_size", "size_t"]) register_dtype(np.float32, ["ga_float", "float"]) register_dtype(np.float64, ["ga_double", "double"]) From a369aad7d0f17028b1a25b7975447c6f86225a64 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 22 Feb 2017 16:53:34 -0500 Subject: [PATCH 300/597] Enforce numpydoc everywhere. --- pygpu/dtypes.py | 13 +- pygpu/gpuarray.pyx | 386 +++++++++++++++++++++++++------------------- pygpu/reduction.py | 5 - pygpu/tests/main.py | 57 +++---- 4 files changed, 261 insertions(+), 200 deletions(-) diff --git a/pygpu/dtypes.py b/pygpu/dtypes.py index 7415f00bb4..7498a3a9c7 100644 --- a/pygpu/dtypes.py +++ b/pygpu/dtypes.py @@ -39,16 +39,19 @@ def register_dtype(dtype, c_names): """ Associate a numpy dtype with its C equivalents. - :param dtype: type to associate - :type dtype: numpy.dtype or string - :param c_names: list of C type names - :type c_names: str or list - Will register `dtype` for use with the gpuarray module. If the c_names argument is a list then the first element of that list is taken as the primary association and will be used for generated C code. The other types will be mapped to the provided dtype when going in the other direction.
+ + Parameters + ---------- + dtype: numpy.dtype or string + type to associate + c_names: str or list + list of C type names + """ if isinstance(c_names, str): c_names = [c_names] diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 04cb32dabc..cd5310450e 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -115,11 +115,13 @@ def register_dtype(np.dtype dtype, cname): This function return the associted internal typecode for the new type. - :param dtype: new type - :type dtype: numpy.dtype - :param cname: C name for the type declarations - :type cname: string - :rtype: int + Parameters + ---------- + dtype: numpy.dtype + new type + cname: str + C name for the type declarations + """ cdef gpuarray_type *t cdef int typecode @@ -177,9 +179,11 @@ cpdef int dtype_to_typecode(dtype) except -1: Get the internal typecode for a type. - :param dtype: type to get the code for - :type dtype: numpy.dtype - :rtype: int + Parameters + ---------- + dtype: numpy.dtype + type to get the code for + """ if isinstance(dtype, int): return dtype @@ -199,9 +203,11 @@ def dtype_to_ctype(dtype): Return the C name for a type. - :param dtype: type to get the name for - :type dtype: numpy.dtype - :rtype: string + Parameters + ---------- + dtype: numpy.dtype + type to get the name for + """ cdef int typecode = dtype_to_typecode(dtype) cdef const gpuarray_type *t = gpuarray_get_type(typecode) @@ -492,10 +498,6 @@ def set_default_context(GpuContext ctx): Set the default context for the module. - :param ctx: default context - :type ctx: GpuContext - :rtype: None - The provided context will be used as a default value for all the other functions in this module which take a context as parameter. Call with `None` to clear the default value. @@ -506,6 +508,12 @@ def set_default_context(GpuContext ctx): This can be helpful to reduce clutter when working with only one context. It is strongly discouraged to use this function when working with multiple contexts at once. 
+ + Parameters + ---------- + ctx: GpuContext + default context + """ global default_context default_context = ctx @@ -581,16 +589,6 @@ def init(dev, sched='default', disable_alloc_cache=False, single_stream=False): Creates a context from a device specifier. - :param dev: device specifier - :type dev: string - :param sched: optimize scheduling for which type of operation - :type sched: {'default', 'single', 'multi'} - :param disable_alloc_cache: disable allocation cache (if any) - :type disable_alloc_cache: bool - :param single_stream: enable single stream mode - :type single_stream: bool - :rtype: GpuContext - Device specifiers are composed of the type string and the device id like so:: @@ -609,6 +607,18 @@ def init(dev, sched='default', disable_alloc_cache=False, single_stream=False): list available platforms and devices. You can experiement with the values, unavaiable ones will just raise an error, and there are no gaps in the valid numbers. + + Parameters + ---------- + dev: str + device specifier + sched: {'default', 'single', 'multi'} + optimize scheduling for which type of operation + disable_alloc_cache: bool + disable allocation cache (if any) + single_stream: bool + enable single stream mode + """ cdef int flags = 0 if sched == 'single': @@ -631,17 +641,19 @@ def zeros(shape, dtype=GA_DOUBLE, order='C', GpuContext context=None, Returns an array of zero-initialized values of the requested shape, type and order. 
- :param shape: number of elements in each dimension - :type shape: iterable of ints - :param dtype: type of the elements - :type dtype: string, numpy.dtype or int - :param order: layout of the data in memory, one of 'A'ny, 'C' or 'F'ortran - :type order: string - :param context: context in which to do the allocation - :type context: GpuContext - :param cls: class of the returned array (must inherit from GpuArray) - :type cls: class - :rtype: array + Parameters + ---------- + shape: iterable of ints + number of elements in each dimension + dtype: str, numpy.dtype or int + type of the elements + order: {'A', 'C', 'F'} + layout of the data in memory, one of 'A'ny, 'C' or 'F'ortran + context: GpuContext + context in which to do the allocation + cls: type + class of the returned array (must inherit from GpuArray) + """ res = empty(shape, dtype=dtype, order=order, context=context, cls=cls) array_memset(res, 0) @@ -705,17 +717,19 @@ def empty(shape, dtype=GA_DOUBLE, order='C', GpuContext context=None, Returns an empty (uninitialized) array of the requested shape, type and order. 
- :param shape: number of elements in each dimension - :type shape: iterable of ints - :param dtype: type of the elements - :type dtype: string, numpy.dtype or int - :param order: layout of the data in memory, one of 'A'ny, 'C' or 'F'ortran - :type order: string - :param context: context in which to do the allocation - :type context: GpuContext - :param cls: class of the returned array (must inherit from GpuArray) - :type cls: class - :rtype: array + Parameters + ---------- + shape: iterable of ints + number of elements in each dimension + dtype: str, numpy.dtype or int + type of the elements + order: {'A', 'C', 'F'} + layout of the data in memory, one of 'A'ny, 'C' or 'F'ortran + context: GpuContext + context in which to do the allocation + cls: type + class of the returned array (must inherit from GpuArray) + """ cdef size_t *cdims cdef unsigned int nd @@ -744,16 +758,6 @@ def asarray(a, dtype=None, order='A', GpuContext context=None): Returns a GpuArray from the data in `a` - :param a: data - :type shape: array-like - :param dtype: type of the elements - :type dtype: string, numpy.dtype or int - :param order: layout of the data in memory, one of 'A'ny, 'C' or 'F'ortran - :type order: string or int - :param context: context in which to do the allocation - :type context: GpuContext - :rtype: GpuArray - If `a` is already a GpuArray and all other parameters match, then the object itself returned. If `a` is an instance of a subclass of GpuArray then a view of the base class will be returned. @@ -761,6 +765,18 @@ def asarray(a, dtype=None, order='A', GpuContext context=None): `context` is optional if `a` is a GpuArray (but must match exactly the context of `a` if specified) and is mandatory otherwise. 
+ + Parameters + ---------- + a: array-like + data + dtype: str, numpy.dtype or int + type of the elements + order: {'A', 'C', 'F'} + layout of the data in memory, one of 'A'ny, 'C' or 'F'ortran + context: GpuContext + context in which to do the allocation + """ return array(a, dtype=dtype, order=order, copy=False, context=context, cls=GpuArray) @@ -771,16 +787,18 @@ def ascontiguousarray(a, dtype=None, GpuContext context=None): Returns a contiguous array in device memory (C order). - :param a: input - :type a: array-like - :param dtype: type of the return array - :type dtype: string, numpy.dtype or int - :param context: context to use for a new array - :type context: GpuContext - :rtype: array - `context` is optional if `a` is a GpuArray (but must match exactly the context of `a` if specified) and is mandatory otherwise. + + Parameters + ---------- + a: array-like + input + dtype: str, numpy.dtype or int + type of the return array + context: GpuContext + context to use for a new array + """ return array(a, order='C', dtype=dtype, ndmin=1, copy=False, context=context) @@ -791,16 +809,18 @@ def asfortranarray(a, dtype=None, GpuArray context=None): Returns a contiguous array in device memory (Fortran order) - :param a: input - :type a: array-like - :param dtype: type of the elements - :type dtype: string, numpy.dtype or int - :param context: context in which to do the allocation - :type context: GpuContext - :rtype: array - `context` is optional if `a` is a GpuArray (but must match exactly the context of `a` if specified) and is mandatory otherwise. 
+ + Parameters + ---------- + a: array-like + input + dtype: str, numpy.dtype or int + type of the elements + context: GpuContext + context in which to do the allocation + """ return array(a, order='F', dtype=dtype, ndmin=1, copy=False, context=context) @@ -820,33 +840,39 @@ def from_gpudata(size_t data, offset, dtype, shape, GpuContext context=None, Build a GpuArray from pre-allocated gpudata - :param data: pointer to a gpudata structure - :type data: int - :param offset: offset to the data location inside the gpudata - :type offset: int - :param dtype: data type of the gpudata elements - :type dtype: numpy.dtype - :param shape: shape to use for the result - :type shape: iterable of ints - :param context: context of the gpudata - :type context: GpuContext - :param strides: strides for the results (C contiguous if not specified) - :type strides: iterable of ints - :param writable: is the data writable? - :type writeable: bool - :param base: base object that keeps gpudata alive - :param cls: view type of the result + Parameters + ---------- + data: int + pointer to a gpudata structure + offset: int + offset to the data location inside the gpudata + dtype: numpy.dtype + data type of the gpudata elements + shape: iterable of ints + shape to use for the result + context: GpuContext + context of the gpudata + strides: iterable of ints + strides for the results (C contiguous if not specified) + writable: bool + is the data writable? + base: object + base object that keeps gpudata alive + cls: type + view type of the result + + Notes + ----- + This function might be deprecated in a later relase since the only + way to create gpudata pointers is through libgpuarray functions + that aren't exposed at the python level. It can be used with the + value of the `gpudata` attribute of an existing GpuArray. .. warning:: This function is intended for advanced use and will crash the interpreter if used improperly. - .. 
note:: - This function might be deprecated in a later relase since the - only way to create gpudata pointers is through libgpuarray - functions that aren't exposed at the python level. It can be - used with the value of the `gpudata` attribute of an existing - GpuArray. + """ cdef size_t *cdims = NULL cdef ssize_t *cstrides = NULL @@ -920,14 +946,9 @@ def array(proto, dtype=None, copy=True, order=None, unsigned int ndmin=0, minimum number of result dimensions context: GpuContext allocation context - cls: class + cls: type result class (must inherit from GpuArray) - Returns - ------- - GpuArray - new array - """ return carray(proto, dtype, copy, order, ndmin, context, cls) @@ -1000,13 +1021,6 @@ cdef class GpuContext: Class that holds all the information pertaining to a context. - :param kind: module name for the context - :type kind: string - :param devno: device number - :type devno: int - :param flags: context flags - :type flags: int - The currently implemented modules (for the `kind` parameter) are "cuda" and "opencl". Which are available depends on the build options for libgpuarray. @@ -1016,6 +1030,16 @@ cdef class GpuContext: one value you must bitwise OR them together. If you want an alternative interface check :meth:`~pygpu.gpuarray.init`. + + Paramters + --------- + kind: str + module name for the context + devno: int + device number + flags: int + context flags + """ def __dealloc__(self): if self.ctx != NULL: @@ -1511,9 +1535,14 @@ def open_ipc_handle(GpuContext c, bytes hpy, size_t l): Open an IPC handle to get a new GpuArray from it. - :param c: context - :param hpy: binary handle data received - :param l: size of the referred memory block + Parameters + ---------- + c: GpuContext + context + hpy: bytes + binary handle data received + l: int + size of the referred memory block """ cdef char *b @@ -1604,11 +1633,16 @@ cdef class GpuArray: to be. It is allowed for this GpuArray and `src` to have different shapes. 
- :param src: source array in host - :type src: np.ndarray + Parameters + ---------- + src: numpy.ndarray + source array in host - :raises ValueError: If this GpuArray is not compatible with `src` or - if it is not well behaved or contiguous. + Raises + ------ + ValueError + If this GpuArray is not compatible with `src` or if it is + not well behaved or contiguous. """ if not self.flags.behaved: @@ -1646,11 +1680,16 @@ cdef class GpuArray: contiguous. It is allowed for this GpuArray and `dst` to have different shapes. - :param dst: destination array in host - :type dst: np.ndarray + Parameters + ---------- + dst: numpy.ndarray + destination array in host - :raises ValueError: If this GpuArray is not compatible with `src` or - if `dst` is not well behaved. + Raises + ------ + ValueError + If this GpuArray is not compatible with `src` or if `dst` + is not well behaved. """ if not np.PyArray_ISBEHAVED(dst): @@ -1729,8 +1768,11 @@ cdef class GpuArray: Return a copy if this array. - :param order: memory layout of the copy - :type order: string + Parameters + ---------- + order: {'C', 'A', 'F'} + memory layout of the copy + """ return pygpu_copy(self, to_ga_order(order)) @@ -1774,10 +1816,14 @@ cdef class GpuArray: Return a view of this array. - :param cls: class of the view (must inherit from GpuArray) - The returned array shares device data with this one and both will reflect changes made to the other. + + Parameters + ---------- + cls: type + class of the view (must inherit from GpuArray) + """ return pygpu_view(self, cls) @@ -1787,18 +1833,21 @@ cdef class GpuArray: Cast the elements of this array to a new type. - :param dtype: type of the elements of the result - :type dtype: string or numpy.dtype or int - :param order: memory layout of the result - :type order: string - :param copy: Always return a copy? - :type copy: bool - This function returns a new array will all elements cast to the supplied `dtype`, but otherwise unchanged. 
If `copy` is False and the type and order match `self` is returned. + + Parameters + ---------- + dtype: str or numpy.dtype or int + type of the elements of the result + order: {'A', 'C', 'F'} + memory layout of the result + copy: bool + Always return a copy? + """ cdef GpuArray res cdef int typecode = dtype_to_typecode(dtype) @@ -2205,45 +2254,14 @@ cdef class GpuKernel: Compile a kernel on the device - :param source: complete kernel source code - :type source: string - :param name: function name of the kernel - :type name: string - :param types: list of argument types - :type types: list or tuple - :param context: device on which the kernel is compiled - :type context: GpuContext - :param cluda: use cluda layer? - :param have_double: ensure working doubles? - :param have_small: ensure types smaller than float will work? - :param have_complex: ensure complex types will work? - :param have_half: ensure half-floats will work? - :param binary: kernel is pre-compiled binary blob? - :param cuda: kernel is cuda code? - :param opencl: kernel is opencl code? - The kernel function is retrieved using the provided `name` which must match what you named your kernel in `source`. You can safely reuse the same name multiple times. - .. note:: - - With the cuda backend, unless you use `cluda=True`, you must - either pass the mangled name of your kernel or declare the - function 'extern "C"', because cuda uses a C++ compiler - unconditionally. - The `have_*` parameter are there to tell libgpuarray that we need the particular type or feature to work for this kernel. If the - request can't be satified a - :class:`~pygpu.gpuarray.UnsupportedException` will be raised in the - constructor. - - .. warning:: - - If you do not set the `have_` flags properly, you will either - get a device-specific error (the good case) or silent - completly bogus data (the bad case). + request can't be satified a :class:`.UnsupportedException` will be + raised in the constructor. 
Once you have the kernel object you can simply call it like so:: @@ -2264,6 +2282,48 @@ cdef class GpuKernel: If you choose to use this interface, make sure to stay within the limits of `k.maxlsize` and `ctx.maxgsize` or the call will fail. + + Parameters + ---------- + source: str + complete kernel source code + name: str + function name of the kernel + types: list or tuple + list of argument types + context: GpuContext + device on which the kernel is compiled + cluda: bool + use cluda layer? + have_double: bool + ensure working doubles? + have_small: bool + ensure types smaller than float will work? + have_complex: bool + ensure complex types will work? + have_half: bool + ensure half-floats will work? + binary: bool + kernel is pre-compiled binary blob? + cuda: bool + kernel is cuda code? + opencl: bool + kernel is opencl code? + + Notes + ----- + With the cuda backend, unless you use `cluda=True`, you must + either pass the mangled name of your kernel or declare the + function 'extern "C"', because cuda uses a C++ compiler + unconditionally. + + .. warning:: + + If you do not set the `have_` flags properly, you will either + get a device-specific error (the good case) or silent + completly bogus data (the bad case). + + """ def __dealloc__(self): cdef unsigned int numargs diff --git a/pygpu/reduction.py b/pygpu/reduction.py index 87716a5551..8053270e1e 100644 --- a/pygpu/reduction.py +++ b/pygpu/reduction.py @@ -130,11 +130,6 @@ def _ceil_log2(x): class ReductionKernel(object): def __init__(self, context, dtype_out, neutral, reduce_expr, redux, map_expr=None, arguments=None, preamble="", init_nd=None): - """ - :param init_nd: used to pre compile the reduction code for - this value of nd and the self.init_local_size value. 
- - """ self.context = context self.neutral = neutral self.redux = tuple(redux) diff --git a/pygpu/tests/main.py b/pygpu/tests/main.py index 6e48005215..ef994e2fac 100644 --- a/pygpu/tests/main.py +++ b/pygpu/tests/main.py @@ -20,12 +20,14 @@ def _test_argv(self, verbose, extra_argv): """ Generate argv for nosetest command - :type verbose: int - :param verbose: Verbosity value for test outputs, in the range 1-10. - Default is 1. + Parameters + ---------- + verbose: int + Verbosity value for test outputs, in the range 1-10. + Default is 1. + extra_argv: list + List with any extra arguments to pass to nosetests. - :type extra_argv: list - :param extra_argv: List with any extra arguments to pass to nosetests. """ #self.package_path = os.path.abspath(self.package_path) argv = [__file__, self.package_path] @@ -79,29 +81,30 @@ def test(self, verbose=1, extra_argv=None, coverage=False, capture=True, """ Run tests for module using nose. - :type verbose: int - :param verbose: Verbosity value for test outputs, in the range 1-10. - Default is 1. + Parameters + ---------- + verbose: int + Verbosity value for test outputs, in the range 1-10. + Default is 1. + extra_argv: list + List with any extra arguments to pass to nosetests. + coverage: bool + If True, report coverage of pygpu code. Default is False. + capture: bool + If True, capture the standard output of the tests, like + nosetests does in command-line. The output of failing + tests will be displayed at the end. Default is True. + knownfailure: bool + If True, tests raising KnownFailureTest will not be + considered Errors nor Failure, but reported as "known + failures" and treated quite like skipped tests. Default + is True. + + Returns + ------- + nose.result.TextTestResult + The result of running the tests - :type extra_argv: list - :param extra_argv: List with any extra arguments to pass to nosetests. - - :type coverage: bool - :param coverage: If True, report coverage of pygpu code. Default is False. 
- - :type capture: bool - :param capture: If True, capture the standard output of the tests, like - nosetests does in command-line. The output of failing - tests will be displayed at the end. Default is True. - - :type knownfailure: bool - :param knownfailure: If True, tests raising KnownFailureTest will - not be considered Errors nor Failure, but reported as - "known failures" and treated quite like skipped tests. - Default is True. - - :returns: Returns the result of running the tests as a - ``nose.result.TextTestResult`` object. """ # cap verbosity at 3 because nose becomes *very* verbose beyond that verbose = min(verbose, 3) From 4a7a03d7d54d91d7d153602830d735c5247f3202 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 22 Feb 2017 17:19:41 -0500 Subject: [PATCH 301/597] Add doc building to travis. --- .travis.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6802500f08..88bfde8eb6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,9 +17,10 @@ addons: packages: - cmake - cmake-data + - doxygen -#before_install: -# - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update && brew install cmake; fi +before_install: + - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update && brew install doxygen; fi # Build with Debug and Release to flush out build problems script: @@ -32,3 +33,4 @@ script: - cd Release - cmake .. -DCMAKE_BUILD_TYPE=Release - make + - cd doc && make html From d2109bd9415fae1255c17c4ab11a1ada33aeab2b Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 28 Feb 2017 18:14:09 -0500 Subject: [PATCH 302/597] Fix typo. 
--- src/gpuarray/kernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray/kernel.h b/src/gpuarray/kernel.h index 1f42c3ed64..2cfc7d12ca 100644 --- a/src/gpuarray/kernel.h +++ b/src/gpuarray/kernel.h @@ -102,7 +102,7 @@ GPUARRAY_PUBLIC int GpuKernel_sched(GpuKernel *k, size_t n, * \param n dimensionality of the grid/blocks * \param gs sizes of launch grid * \param ls sizes of launch blocks - * \param shared of dynamic shared memory to allocate + * \param shared amount of dynamic shared memory to allocate * \param args table of pointers to arguments */ GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, unsigned int n, From a0f8f1f6bbb9ef80efc2cf0bf93196824cf2a596 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 2 Mar 2017 14:07:28 -0500 Subject: [PATCH 303/597] Don't remove the GA_POINTER type to avoid modifying the API. Will remove later. --- src/gen_types.py | 1 + src/gpuarray/types.h | 1 + src/gpuarray_buffer_opencl.c | 2 ++ 3 files changed, 4 insertions(+) diff --git a/src/gen_types.py b/src/gen_types.py index 0e87fe23f3..3e14c9a4f2 100644 --- a/src/gen_types.py +++ b/src/gen_types.py @@ -165,6 +165,7 @@ def add_type(name, sz): * List of all built-in types. */ enum GPUARRAY_TYPES { + GA_POINTER = -2, GA_BUFFER = -1, % for i, v in sorted(TYPEMAP.items()): GA_${v[1].upper()} = ${i}, diff --git a/src/gpuarray/types.h b/src/gpuarray/types.h index 2fac29bb37..afd0df16e4 100644 --- a/src/gpuarray/types.h +++ b/src/gpuarray/types.h @@ -43,6 +43,7 @@ typedef struct _gpuarray_type { * List of all built-in types. 
*/ enum GPUARRAY_TYPES { + GA_POINTER = -2, GA_BUFFER = -1, GA_BOOL = 0, GA_BYTE = 1, diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index e1491eeffa..812d796920 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -1017,6 +1017,8 @@ static int cl_setkernelarg(gpukernel *k, unsigned int i, void *a) { cl_ulong temp; cl_long stemp; switch (k->types[i]) { + case GA_POINTER: + return error_set(ctx->err, GA_DEVSUP_ERROR, "Cannot set raw pointers as kernel arguments"); case GA_BUFFER: btmp = (gpudata *)a; CL_CHECK(ctx->err, clSetKernelArg(k->k, i, sizeof(cl_mem), &btmp->buf)); From 6347c102749eb00fea39156e2a43fc0bd768780a Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 27 Mar 2017 14:58:22 -0400 Subject: [PATCH 304/597] Rework the travis script to make sure we are in the right directory. --- .travis.yml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index 88bfde8eb6..afd34abd85 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,12 +25,7 @@ before_install: # Build with Debug and Release to flush out build problems script: - mkdir Debug - - cd Debug - - cmake .. -DCMAKE_BUILD_TYPE=Debug - - make - - cd .. + - cd Debug && cmake .. -DCMAKE_BUILD_TYPE=Debug && make - mkdir Release - - cd Release - - cmake .. -DCMAKE_BUILD_TYPE=Release - - make + - cd Release && cmake .. 
-DCMAKE_BUILD_TYPE=Release && make - cd doc && make html From f15ba80a7ab98fd2e52e71001e4110398376466c Mon Sep 17 00:00:00 2001 From: Pascal Lamblin Date: Wed, 19 Apr 2017 16:59:29 -0400 Subject: [PATCH 305/597] s/theano/libgpuarray/ --- doc/_static/version_switch.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/_static/version_switch.js b/doc/_static/version_switch.js index 6c5c338381..bd8f391e3f 100644 --- a/doc/_static/version_switch.js +++ b/doc/_static/version_switch.js @@ -3,7 +3,7 @@ var url = window.location.href; var base_dir = 'libgpuarray'; // directory containing doc - // Default theano version: release and development. + // Default libgpuarray version: release and development. var versions_dir = {"release": "libgpuarray", "dev": "libgpuarray_versions/dev"}; // If doc is run localy @@ -14,7 +14,7 @@ var root_url = url.substring(0, url.search('/' + base_dir)) + '/'; - // Regular expression to find theano version directory in URL. + // Regular expression to find libgpuarray version directory in URL. var version_regex = new RegExp("\\/" + base_dir + "(_versions\\/)?([_a-zA-Z.0-9]*)\\/"); // Get current version @@ -31,7 +31,7 @@ // Build HTML string for version selector, based on ReadTheDocs theme's versions.html var vlabel = current_version.replace("libgpuarray_versions/", ""); - if (vlabel == 'theano') { + if (vlabel == 'libgpuarray') { vlabel = 'release'; } var vswitch = ['
']; From 42326185170d4e6aa8399ddce6df0ac3ec785465 Mon Sep 17 00:00:00 2001 From: Pascal Lamblin Date: Thu, 20 Apr 2017 13:25:47 -0400 Subject: [PATCH 306/597] Fix references to renamed enum --- pygpu/collectives.pxd | 2 +- src/gpuarray/buffer_collectives.h | 2 +- src/gpuarray_collectives_cuda_nccl.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pygpu/collectives.pxd b/pygpu/collectives.pxd index dd1f677259..44147febf3 100644 --- a/pygpu/collectives.pxd +++ b/pygpu/collectives.pxd @@ -4,7 +4,7 @@ cdef extern from "gpuarray/buffer_collectives.h": ctypedef struct gpucomm: pass - enum _gpucomm_reduce_ops: + enum gpucomm_reduce_ops: GA_SUM, GA_PROD, GA_MAX, diff --git a/src/gpuarray/buffer_collectives.h b/src/gpuarray/buffer_collectives.h index f5dfef35d8..bad5561814 100644 --- a/src/gpuarray/buffer_collectives.h +++ b/src/gpuarray/buffer_collectives.h @@ -23,7 +23,7 @@ struct _gpucomm; typedef struct _gpucomm gpucomm; /* - * \enum _gpucomm_reduce_ops + * \enum gpucomm_reduce_ops * * \brief Reduction operations */ diff --git a/src/gpuarray_collectives_cuda_nccl.c b/src/gpuarray_collectives_cuda_nccl.c index dc5a901ab7..be12498f97 100644 --- a/src/gpuarray_collectives_cuda_nccl.c +++ b/src/gpuarray_collectives_cuda_nccl.c @@ -149,7 +149,7 @@ static int get_rank(const gpucomm *comm, int *rank) { } /** - * \brief Helper function to try to convert \ref enum _gpucomm_reduce_ops to + * \brief Helper function to try to convert \ref enum gpucomm_reduce_ops to * \ref * ncclRedOp_t. 
* From 4d6b5b816b55d8f96985f3b2a869cf1c0aaddb69 Mon Sep 17 00:00:00 2001 From: Pascal Lamblin Date: Fri, 21 Apr 2017 08:34:12 -0400 Subject: [PATCH 307/597] Fix tyop --- pygpu/gpuarray.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index cd5310450e..d8d0215565 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -1031,8 +1031,8 @@ cdef class GpuContext: If you want an alternative interface check :meth:`~pygpu.gpuarray.init`. - Paramters - --------- + Parameters + ---------- kind: str module name for the context devno: int From 7636b3741d0ef17019c8345e94dea6e88fdc67e8 Mon Sep 17 00:00:00 2001 From: Pascal Lamblin Date: Wed, 19 Apr 2017 23:24:30 -0400 Subject: [PATCH 308/597] Fix installation and build on Travis --- .travis.yml | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index afd34abd85..e1b24010b2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,12 +20,23 @@ addons: - doxygen before_install: + - export PREFIX=$HOME/.local - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update && brew install doxygen; fi + - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export PYTHONUSERBASE=$PREFIX; fi + - pip install --user breathe sphinx sphinx_rtd_theme cython numpy 'mako>=0.7' six + - export PATH=$PATH:$PREFIX/bin + - export CPATH=$CPATH:$PREFIX/include + - export LIBRARY_PATH=$LIBRARY_PATH:$PREFIX/lib + - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PREFIX/lib # Build with Debug and Release to flush out build problems script: - mkdir Debug - - cd Debug && cmake .. -DCMAKE_BUILD_TYPE=Debug && make + - (cd Debug && cmake .. -DCMAKE_BUILD_TYPE=Debug && make) - mkdir Release - - cd Release && cmake .. -DCMAKE_BUILD_TYPE=Release && make - - cd doc && make html + - (cd Release && cmake .. 
-DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$PREFIX && make && make install) + - python setup.py build_ext --inplace + - (cd doc && make html) + +# Do not treat "shell_session_update: command not found" on MacOS as a failure +after_script: set +e From 1dc961f19798364314dd2735297da695d0d99397 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Fri, 21 Apr 2017 21:57:06 -0400 Subject: [PATCH 309/597] Tell about conda-forge --- doc/installation.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/installation.rst b/doc/installation.rst index e348ed0fa2..c388fdb0ba 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -24,6 +24,11 @@ This will also install the libgpuarray package automatically. This should work on Linux, Mac OS and Windows. +This is also available in packages in conda-forge. They could be more +up to date:: + + conda install -c forge pygpu + Build Requirements ------------------ From c52e5a18b2e31888259f16aa4d61f055cc8be872 Mon Sep 17 00:00:00 2001 From: Pascal Lamblin Date: Fri, 21 Apr 2017 22:08:21 -0400 Subject: [PATCH 310/597] disambiguate error.h --- doc/c_api/file/error_8h.rst | 2 +- src/gpuarray/error.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/c_api/file/error_8h.rst b/doc/c_api/file/error_8h.rst index c155101143..5b64637104 100644 --- a/doc/c_api/file/error_8h.rst +++ b/doc/c_api/file/error_8h.rst @@ -1,4 +1,4 @@ File error.h ============ -.. doxygenfile:: error.h +.. doxygenfile:: gpuarray/error.h diff --git a/src/gpuarray/error.h b/src/gpuarray/error.h index af963c1531..52aba986b0 100644 --- a/src/gpuarray/error.h +++ b/src/gpuarray/error.h @@ -1,6 +1,6 @@ #ifndef GPUARRAY_ERROR_H #define GPUARRAY_ERROR_H -/** \file error.h +/** \file gpuarray/error.h * \brief Error functions. 
*/ From 73118f0c22019d2ea4bb75e127855c9ff36f6d85 Mon Sep 17 00:00:00 2001 From: Pascal Lamblin Date: Fri, 21 Apr 2017 22:10:16 -0400 Subject: [PATCH 311/597] Ignore directories generated with doc --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index ec7eecef3f..72c02cdaef 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,8 @@ distribute*tar.gz *.so *.o *.log +doc/_build +doc/_doxybuild pygpu/*.c pygpu/*.h pygpu/version.py From dc2b112d235ccac965e747754de5ee1bab09a59e Mon Sep 17 00:00:00 2001 From: Adam Becker Date: Tue, 25 Apr 2017 13:12:13 +0800 Subject: [PATCH 312/597] add rgemmBatch_3d interface --- pygpu/blas.pyx | 58 ++++++++++++++++++++++++++++++++++++++++ pygpu/tests/test_blas.py | 56 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+) diff --git a/pygpu/blas.pyx b/pygpu/blas.pyx index 8965f43420..96c2cdbc81 100644 --- a/pygpu/blas.pyx +++ b/pygpu/blas.pyx @@ -18,6 +18,9 @@ cdef extern from "gpuarray/blas.h": double beta, _GpuArray *C, int nocopy) int GpuArray_rger(double alpha, _GpuArray *X, _GpuArray *Y, _GpuArray *A, int nocopy) + int GpuArray_rgemmBatch_3d( + cb_transpose transA, cb_transpose transB, double alpha, + _GpuArray *A, _GpuArray *B, double beta, _GpuArray *C, int nocopy) cdef api int pygpu_blas_rdot(GpuArray X, GpuArray Y, GpuArray Z, bint nocopy) except -1: cdef int err @@ -52,6 +55,17 @@ cdef api int pygpu_blas_rger(double alpha, GpuArray X, GpuArray Y, GpuArray A, raise GpuArrayException(GpuArray_error(&X.ga, err), err) return 0 +cdef api int pygpu_blas_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, + double alpha, GpuArray A, GpuArray B, + double beta, GpuArray C, bint nocopy) except -1: + cdef int err + err = GpuArray_rgemmBatch_3d(transA, transB, + alpha, &A.ga, &B.ga, + beta, &C.ga, nocopy) + if err != GA_NO_ERROR: + raise GpuArrayException(GpuArray_error(&A.ga, err), err) + return 0 + def dot(GpuArray X, GpuArray Y, GpuArray Z=None, overwrite_z=False): 
"""dot(X, Y, Z=None, overwrite_z=False) @@ -153,3 +167,47 @@ def ger(double alpha, GpuArray X, GpuArray Y, GpuArray A=None, pygpu_blas_rger(alpha, X, Y, A, 0) return A + +def gemmBatch_3d(double alpha, GpuArray A, GpuArray B, + double beta, GpuArray C=None, + trans_a=False, trans_b=False, overwrite_c=False): + """gemmBatch_3d(alpha, A, B, beta, C=None, trans_a=False, trans_b=False, overwrite_c=False) + """ + cdef cb_transpose transA + cdef cb_transpose transB + cdef size_t[3] Cshp + + if trans_a: + transA = cb_trans + else: + transA = cb_no_trans + if trans_b: + transB = cb_trans + else: + transB = cb_no_trans + + if A.ga.nd != 3: + raise TypeError, "A is not a batch of matrices" + if B.ga.nd != 3: + raise TypeError, "B is not a batch of matrices" + + Cshp[0] = A.ga.dimensions[0] + if transA == cb_no_trans: + Cshp[1] = A.ga.dimensions[1] + else: + Cshp[1] = A.ga.dimensions[2] + if transB == cb_no_trans: + Cshp[2] = B.ga.dimensions[2] + else: + Cshp[2] = B.ga.dimensions[1] + if C is None: + if beta != 0.0: + raise ValueError, "C not provided and beta != 0" + C = pygpu_empty(3, Cshp, A.ga.typecode, GA_ANY_ORDER, A.context, None) + overwrite_c = True + + if not overwrite_c: + C = pygpu_copy(C, GA_ANY_ORDER) + pygpu_blas_rgemmBatch_3d(transA, transB, alpha, A, B, beta, C, 0) + + return C diff --git a/pygpu/tests/test_blas.py b/pygpu/tests/test_blas.py index 8ce7d7aebe..b4ec45bdc2 100644 --- a/pygpu/tests/test_blas.py +++ b/pygpu/tests/test_blas.py @@ -167,3 +167,59 @@ def ger(m, n, dtype, order, sliced_x, sliced_y, init_res, overwrite=False): gr = gblas.ger(1.0, gX, gY, gA, overwrite_a=overwrite) numpy.testing.assert_allclose(cr, numpy.asarray(gr), rtol=1e-6) + +def test_rgemmBatch_3d(): + bools = [False, True] + for b, (m, n, k), order, trans, offseted_o in product( + [1, 17, 31], [(24, 7, 16), (7, 16, 24)], list(product('fc', repeat=3)), + list(product(bools, bools)), bools): + yield rgemmBatch_3d, b, m, n, k, 'float32', order, trans, \ + offseted_o, 1, False, False 
+ for sliced, overwrite, init_res in product( + [1, 2, -1, -2], bools, bools): + yield rgemmBatch_3d, 5, 4, 3, 2, 'float32', ('f', 'f', 'f'), \ + (False, False), False, sliced, overwrite, init_res + yield rgemmBatch_3d, 16, 16, 16, 16, 'float64', ('f', 'f', 'f'), (False, False), \ + False, 1, False, False + for alpha, beta, overwrite in product( + [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools): + yield rgemmBatch_3d, 16, 16, 9, 16, 'float32', ('f', 'f', 'f'), \ + (False, False), False, 1, overwrite, True, alpha, beta + +@guard_devsup +def rgemmBatch_3d(b, m, n, k, dtype, order, trans, offseted_o, sliced, overwrite, + init_res, alpha=1.0, beta=0.0): + if trans[0]: + shpA = (b,k,m) + else: + shpA = (b,m,k) + if trans[1]: + shpB = (b,n,k) + else: + shpB = (b,k,n) + + cA, gA = gen_gpuarray(shpA, dtype, order=order[0], + offseted_outer=offseted_o, + sliced=sliced, ctx=context) + cB, gB = gen_gpuarray(shpB, dtype, order=order[1], + offseted_outer=offseted_o, + sliced=sliced, ctx=context) + if init_res: + cC, gC = gen_gpuarray((b,m,n), dtype, order=order[2], ctx=context) + else: + cC, gC = None, None + + cr = numpy.empty((b,m,n), dtype=dtype) + if dtype == 'float32': + fn_gemm_c = fblas.sgemm + else: + fn_gemm_c = fblas.dgemm + for i in range(b): + cCi = cC if cC is None else cC[i] + cr[i] = fn_gemm_c(alpha, cA[i], cB[i], beta, cCi, trans_a=trans[0], + trans_b=trans[1], overwrite_c=overwrite) + + gr = gblas.gemmBatch_3d(alpha, gA, gB, beta, gC, trans_a=trans[0], + trans_b=trans[1], overwrite_c=overwrite) + + numpy.testing.assert_allclose(cr, numpy.asarray(gr), rtol=1e-5) From ba2f347a42ed60233b9b97fb56c773ece4392fc6 Mon Sep 17 00:00:00 2001 From: Adam Becker Date: Tue, 25 Apr 2017 13:26:06 +0800 Subject: [PATCH 313/597] remove fortran contig test --- pygpu/tests/test_blas.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pygpu/tests/test_blas.py b/pygpu/tests/test_blas.py index b4ec45bdc2..b5f5279aa6 100644 --- a/pygpu/tests/test_blas.py +++ 
b/pygpu/tests/test_blas.py @@ -171,19 +171,19 @@ def ger(m, n, dtype, order, sliced_x, sliced_y, init_res, overwrite=False): def test_rgemmBatch_3d(): bools = [False, True] for b, (m, n, k), order, trans, offseted_o in product( - [1, 17, 31], [(24, 7, 16), (7, 16, 24)], list(product('fc', repeat=3)), + [1, 17, 31], [(24, 7, 16), (7, 16, 24)], list(product('fc', 'fc', 'c')), list(product(bools, bools)), bools): yield rgemmBatch_3d, b, m, n, k, 'float32', order, trans, \ offseted_o, 1, False, False for sliced, overwrite, init_res in product( [1, 2, -1, -2], bools, bools): - yield rgemmBatch_3d, 5, 4, 3, 2, 'float32', ('f', 'f', 'f'), \ + yield rgemmBatch_3d, 5, 4, 3, 2, 'float32', ('f', 'f', 'c'), \ (False, False), False, sliced, overwrite, init_res - yield rgemmBatch_3d, 16, 16, 16, 16, 'float64', ('f', 'f', 'f'), (False, False), \ + yield rgemmBatch_3d, 16, 16, 16, 16, 'float64', ('f', 'f', 'c'), (False, False), \ False, 1, False, False for alpha, beta, overwrite in product( [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools): - yield rgemmBatch_3d, 16, 16, 9, 16, 'float32', ('f', 'f', 'f'), \ + yield rgemmBatch_3d, 16, 16, 9, 16, 'float32', ('f', 'f', 'c'), \ (False, False), False, 1, overwrite, True, alpha, beta @guard_devsup From f44000fc59fed8f6f0e90c6bcab374d57a3c69e3 Mon Sep 17 00:00:00 2001 From: Simon Lefrancois Date: Thu, 27 Apr 2017 14:26:23 -0400 Subject: [PATCH 314/597] remove PDF from doc version switcher --- doc/_static/version_switch.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/_static/version_switch.js b/doc/_static/version_switch.js index bd8f391e3f..e64d77baf9 100644 --- a/doc/_static/version_switch.js +++ b/doc/_static/version_switch.js @@ -53,8 +53,8 @@ vswitch.push('
'); vswitch.push('
Downloads
'); - var pdf_url = root_url + current_version + "/libgpuarray.pdf" - vswitch.push('
', 'PDF', '
'); +// var pdf_url = root_url + current_version + "/libgpuarray.pdf" +// vswitch.push('
', 'PDF', '
'); vswitch.push('
'); vswitch.push('
'); From 62cf1b9e23c7ccc06f087614a48894fa034fac5d Mon Sep 17 00:00:00 2001 From: Pascal Lamblin Date: Thu, 27 Apr 2017 16:02:56 -0400 Subject: [PATCH 315/597] A couple more error messages --- src/gpuarray_array_blas.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index eb22fbcf17..44f063500f 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -3,7 +3,9 @@ #include "gpuarray/buffer_blas.h" #include "gpuarray/types.h" #include "gpuarray/util.h" -#include "gpuarray/error.h" + +#include "private.h" +#include "util/error.h" int GpuArray_rdot(GpuArray *X, GpuArray *Y, GpuArray *Z, int nocopy) { @@ -13,24 +15,28 @@ int GpuArray_rdot(GpuArray *X, GpuArray *Y, GpuArray copyY; GpuArray *Zp = Z; size_t n; - void *ctx; + gpucontext *ctx = gpudata_context(Xp->data); size_t elsize; int err; if (X->typecode != GA_HALF && X->typecode != GA_FLOAT && X->typecode != GA_DOUBLE) - return GA_INVALID_ERROR; + return error_set(ctx->err, GA_INVALID_ERROR, "Data type not supported"); if (X->nd != 1 || Y->nd != 1 || Z->nd != 0 || X->typecode != Y->typecode || X->typecode != Z->typecode) - return GA_VALUE_ERROR; + return error_fmt(ctx->err, GA_VALUE_ERROR, + "Wrong number of dimensions: X->nd = %d (expected 1), Y->nd = %d (expected 1), Z->nd = %d (expected 0)", + X->nd, Y->nd, Z->nd); n = X->dimensions[0]; if (!(X->flags & GA_ALIGNED) || !(Y->flags & GA_ALIGNED) || !(Z->flags & GA_ALIGNED)) return GA_UNALIGNED_ERROR; if (X->dimensions[0] != Y->dimensions[0]) - return GA_VALUE_ERROR; + return error_fmt(ctx->err, GA_VALUE_ERROR, + "Shape mismatch: X->dimensions[0] = %d != Y->dimensions[0] = %d", + X->dimensions[0], Y->dimensions[0]); elsize = gpuarray_get_elsize(X->typecode); if (X->strides[0] < 0) { @@ -49,12 +55,11 @@ int GpuArray_rdot(GpuArray *X, GpuArray *Y, else { err = GpuArray_copy(©Y, Y, GA_ANY_ORDER); if (err != GA_NO_ERROR) - goto cleanup; + goto cleanup; Yp = ©Y; 
} } - ctx = gpudata_context(Xp->data); err = gpublas_setup(ctx); if (err != GA_NO_ERROR) goto cleanup; @@ -138,7 +143,7 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, else { err = GpuArray_copy(©A, A, GA_F_ORDER); if (err != GA_NO_ERROR) - goto cleanup; + goto cleanup; Ap = ©A; } } @@ -148,7 +153,7 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, else { err = GpuArray_copy(©X, X, GA_ANY_ORDER); if (err != GA_NO_ERROR) - goto cleanup; + goto cleanup; Xp = ©X; } } From 83d186842b92c4f868b1819cb4b6f37735787e95 Mon Sep 17 00:00:00 2001 From: Pascal Lamblin Date: Thu, 27 Apr 2017 21:11:34 -0400 Subject: [PATCH 316/597] dump source and error message if compilation fail in DEBUG --- src/gpuarray_buffer_cuda.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 7adfa9a40b..152fc1d3a1 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1103,6 +1103,10 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { if (err != NVRTC_SUCCESS) { nvrtcDestroyProgram(&prog); +#ifdef DEBUG + strb_dump(src, stderr); + strb_dump(log, stderr); +#endif return error_nvrtc(ctx->err, "nvrtcCompileProgram", err); } From 92f9977df2d8c4abd7041a96229c4d0c27909d5b Mon Sep 17 00:00:00 2001 From: Simon Lefrancois Date: Fri, 28 Apr 2017 09:23:11 -0400 Subject: [PATCH 317/597] Remove Downloads title from doc switcher --- doc/_static/version_switch.js | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/_static/version_switch.js b/doc/_static/version_switch.js index e64d77baf9..3c3685a456 100644 --- a/doc/_static/version_switch.js +++ b/doc/_static/version_switch.js @@ -51,11 +51,11 @@ } vswitch.push('
'); - vswitch.push('
'); - vswitch.push('
Downloads
'); -// var pdf_url = root_url + current_version + "/libgpuarray.pdf" -// vswitch.push('
', 'PDF', '
'); - vswitch.push('
'); +// vswitch.push('
'); +// vswitch.push('
Downloads
'); +// var pdf_url = root_url + current_version + "/libgpuarray.pdf" +// vswitch.push('
', 'PDF', '
'); +// vswitch.push('
'); vswitch.push('
'); vswitch.push('
On GitHub
'); From c11b0b3401fcde5abd65c7b7bd758bbffbf41f39 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Tue, 2 May 2017 11:37:27 -0400 Subject: [PATCH 318/597] Remove ifdef as all supported version have it. --- src/gpuarray_collectives_cuda_nccl.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/gpuarray_collectives_cuda_nccl.c b/src/gpuarray_collectives_cuda_nccl.c index bd5cb3963b..44c1297aa7 100644 --- a/src/gpuarray_collectives_cuda_nccl.c +++ b/src/gpuarray_collectives_cuda_nccl.c @@ -179,10 +179,8 @@ static inline ncclDataType_t convert_data_type(int typecode) { case GA_DOUBLE: return ncclDouble; case GA_LONG: return ncclInt64; case GA_ULONG: return ncclUint64; - #ifdef CUDA_HAS_HALF case GA_HALF: return ncclHalf; case GA_FLOAT16: return ncclHalf; - #endif } return nccl_NUM_TYPES; } From fc0a61624da20789f25c9e1fe58e816c1f09b97b Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Tue, 2 May 2017 17:39:56 -0400 Subject: [PATCH 319/597] Update doc --- doc/installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/installation.rst b/doc/installation.rst index 6f66da8313..15c295c8f2 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -27,7 +27,7 @@ This should work on Linux, Mac OS and Windows. This is also available in packages in conda-forge. 
They could be more up to date:: - conda install -c forge pygpu + conda install -c conda-forge pygpu Build Requirements ------------------ From cef292e8a9e26434ef15b0b98f2abac5f405c9c8 Mon Sep 17 00:00:00 2001 From: Adam Becker Date: Wed, 3 May 2017 17:15:21 +0800 Subject: [PATCH 320/597] use unified exception syntax --- pygpu/blas.pyx | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pygpu/blas.pyx b/pygpu/blas.pyx index 96c2cdbc81..ca273ed955 100644 --- a/pygpu/blas.pyx +++ b/pygpu/blas.pyx @@ -92,14 +92,14 @@ def gemv(double alpha, GpuArray A, GpuArray X, double beta=0.0, transA = cb_no_trans if A.ga.nd != 2: - raise TypeError, "A is not a matrix" + raise TypeError("A is not a matrix") if transA == cb_no_trans: Yshp = A.ga.dimensions[0] else: Yshp = A.ga.dimensions[1] if Y is None: if beta != 0.0: - raise ValueError, "Y not provided and beta != 0" + raise ValueError("Y not provided and beta != 0") Y = pygpu_empty(1, &Yshp, A.ga.typecode, GA_ANY_ORDER, A.context, None) overwrite_y = True @@ -127,9 +127,9 @@ def gemm(double alpha, GpuArray A, GpuArray B, double beta, GpuArray C=None, transB = cb_no_trans if A.ga.nd != 2: - raise TypeError, "A is not a matrix" + raise TypeError("A is not a matrix") if B.ga.nd != 2: - raise TypeError, "B is not a matrix" + raise TypeError("B is not a matrix") if transA == cb_no_trans: Cshp[0] = A.ga.dimensions[0] else: @@ -140,7 +140,7 @@ def gemm(double alpha, GpuArray A, GpuArray B, double beta, GpuArray C=None, Cshp[1] = B.ga.dimensions[0] if C is None: if beta != 0.0: - raise ValueError, "C not provided and beta != 0" + raise ValueError("C not provided and beta != 0") C = pygpu_empty(2, Cshp, A.ga.typecode, GA_ANY_ORDER, A.context, None) overwrite_c = True @@ -187,9 +187,9 @@ def gemmBatch_3d(double alpha, GpuArray A, GpuArray B, transB = cb_no_trans if A.ga.nd != 3: - raise TypeError, "A is not a batch of matrices" + raise TypeError("A is not a batch of matrices") if B.ga.nd != 3: - raise 
TypeError, "B is not a batch of matrices" + raise TypeError("B is not a batch of matrices") Cshp[0] = A.ga.dimensions[0] if transA == cb_no_trans: @@ -202,7 +202,7 @@ def gemmBatch_3d(double alpha, GpuArray A, GpuArray B, Cshp[2] = B.ga.dimensions[1] if C is None: if beta != 0.0: - raise ValueError, "C not provided and beta != 0" + raise ValueError("C not provided and beta != 0") C = pygpu_empty(3, Cshp, A.ga.typecode, GA_ANY_ORDER, A.context, None) overwrite_c = True From e43d54389f75313e6f20e509a8d64c569420d097 Mon Sep 17 00:00:00 2001 From: Adam Becker Date: Wed, 3 May 2017 17:16:08 +0800 Subject: [PATCH 321/597] more concise if/else --- pygpu/blas.pyx | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pygpu/blas.pyx b/pygpu/blas.pyx index ca273ed955..67d4c0beb0 100644 --- a/pygpu/blas.pyx +++ b/pygpu/blas.pyx @@ -204,9 +204,7 @@ def gemmBatch_3d(double alpha, GpuArray A, GpuArray B, if beta != 0.0: raise ValueError("C not provided and beta != 0") C = pygpu_empty(3, Cshp, A.ga.typecode, GA_ANY_ORDER, A.context, None) - overwrite_c = True - - if not overwrite_c: + else: C = pygpu_copy(C, GA_ANY_ORDER) pygpu_blas_rgemmBatch_3d(transA, transB, alpha, A, B, beta, C, 0) From e3e27b6b0d15fd3fa82d89c3b807e056c3998596 Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Fri, 5 May 2017 16:02:22 +0200 Subject: [PATCH 322/597] util: Use secure HTTPS URLs in licence header The request to https://www.opensource.org times out, but the domain without *www.* works. 
``` $ curl -I https://opensource.org/licenses/bsd-license.php HTTP/1.1 200 OK ``` --- src/util/xxhash.c | 2 +- src/util/xxhash.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/util/xxhash.c b/src/util/xxhash.c index bd2447ca16..438d69e528 100644 --- a/src/util/xxhash.c +++ b/src/util/xxhash.c @@ -2,7 +2,7 @@ xxHash - Fast Hash algorithm Copyright (C) 2012-2015, Yann Collet -BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) +BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/src/util/xxhash.h b/src/util/xxhash.h index 1d11a095fb..f88ff81369 100644 --- a/src/util/xxhash.h +++ b/src/util/xxhash.h @@ -6,7 +6,7 @@ Header File Copyright (C) 2012-2015, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are From ec443c9db9961adf53eec3024beb13c7d5f5d7f9 Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Fri, 5 May 2017 16:04:03 +0200 Subject: [PATCH 323/597] Use secure HTTPS URLs where possible --- CMakeLists.txt | 2 +- INSTALL | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ddfefab53c..d6a96e7339 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ PROJECT(libgpuarray C) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/CMakeModules/") # -Wall is unbelieveably noisy with Visual Studio: -# http://stackoverflow.com/q/4001736/3257826 +# https://stackoverflow.com/q/4001736/3257826 if(MSVC) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W3") else() diff --git a/INSTALL b/INSTALL index 469d6da210..8b39a672dc 100644 --- a/INSTALL +++ b/INSTALL @@ 
-5,7 +5,7 @@ with a log of the build messages to abergeron@gmail.com. Requirements: - either an OpenCL runtime (with headers) or the CUDA toolkit - - CMake [ http://cmake.org ] (to build) + - CMake [ https://cmake.org ] (to build) Run CMake on the CMakeList.txt file in src/ and build according to your platform. Set CMAKE_INSTALL_PREFIX to your desired path if you @@ -21,4 +21,4 @@ There are instruction for installation in the CMake file which make install on Windows. If you also want the python bindings, run 'python setup.py install' -after building and installing the library which will install pygpu. \ No newline at end of file +after building and installing the library which will install pygpu. From 7777f0cd6334a084699f3c7e4f76cb220403ea83 Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Fri, 5 May 2017 16:06:40 +0200 Subject: [PATCH 324/597] doc/installation: Use secure HTTPS URLs where possible --- doc/installation.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/installation.rst b/doc/installation.rst index 15c295c8f2..880decfa47 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -175,8 +175,8 @@ install step. It is up to you to copy the headers and libraries to an appropriate place. If you don't have Visual Studio installed, you can get the free -Express version from `here `_ in the -downloads section (select the "for Windows" edition). +Express version from `here `_ in the +downloads section (select the "for windows" edition). .. warning:: While you may get the library to compile using cygwin, this is not @@ -226,7 +226,7 @@ you can confirm which device it is running on. only the codename of the architecture the GPU belongs to (e.g. 'Tahiti'). -.. _cmake: http://cmake.org/ +.. _cmake: https://cmake.org/ .. _clblas: https://github.com/clMathLibraries/clBLAS @@ -238,10 +238,10 @@ you can confirm which device it is running on. .. _check: http://check.sourceforge.net/ -.. _python: http://python.org/ +.. 
_python: https://python.org/ .. _cython: http://cython.org/ -.. _nosetests: http://nose.readthedocs.org/en/latest/ +.. _nosetests: https://nose.readthedocs.org/en/latest/ .. _mako: http://www.makotemplates.org/ From a75162ec5df0fdf8ab954623217bd050437d2012 Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Fri, 5 May 2017 16:09:42 +0200 Subject: [PATCH 325/597] doc: Always load Google Analytics script over secure connection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even if the site is not loaded over HTTPS, it’s good practice to load assets over HTTPS if that is possible. --- doc/_templates/layout.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/_templates/layout.html b/doc/_templates/layout.html index 2f40797e03..47d86e61c3 100644 --- a/doc/_templates/layout.html +++ b/doc/_templates/layout.html @@ -9,7 +9,7 @@ (function() { var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true; - ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js'; + ga.src = 'https://ssl.google-analytics.com/ga.js'; var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s); })(); From 322681b9023bd382b2815d23209de38b9ba48153 Mon Sep 17 00:00:00 2001 From: Pascal Lamblin Date: Mon, 15 May 2017 21:13:09 -0400 Subject: [PATCH 326/597] Use "right" ld* with 1D matrices --- src/gpuarray_array_blas.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index 44f063500f..73d3a729aa 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -550,10 +550,14 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph if (cC == 2) { o = cb_fortran; - ldc = Cp->strides[2] / elsize; + ldc = Cp->dimensions[2] > 1 + ? 
Cp->strides[2] / elsize + : Cp->dimensions[1]; } else if (cC == 1) { o = cb_c; - ldc = Cp->strides[1] / elsize; + ldc = Cp->dimensions[1] > 1 + ? Cp->strides[1] / elsize + : Cp->dimensions[2]; } else { err = GA_VALUE_ERROR; goto cleanup; @@ -579,7 +583,9 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph goto cleanup; } if (cB == 2) { - ldb = Bp->strides[2] / elsize; + ldb = Bp->dimensions[2] > 1 + ? Bp->strides[2] / elsize + : Bp->dimensions[1]; if (o == cb_c) { if (transB == cb_no_trans) transB = cb_trans; @@ -587,7 +593,9 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph transB = cb_no_trans; } } else if (cB == 1) { - ldb = Bp->strides[1] / elsize; + ldb = Bp->dimensions[1] > 1 + ? Bp->strides[1] / elsize + : Bp->dimensions[2]; if (o == cb_fortran) { if (transB == cb_no_trans) transB = cb_trans; From 68c99168da3dd9338ced5d668a5e5a9fb64efb99 Mon Sep 17 00:00:00 2001 From: Pascal Lamblin Date: Tue, 16 May 2017 12:21:11 -0400 Subject: [PATCH 327/597] Add another case --- src/gpuarray_array_blas.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index 73d3a729aa..11a4b7d05d 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -563,7 +563,9 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph goto cleanup; } if (cA == 2) { - lda = Ap->strides[2] / elsize; + lda = Ap->dimensions[2] > 1 + ? Ap->strides[2] / elsize + : Ap->dimensions[1]; if (o == cb_c) { if (transA == cb_no_trans) transA = cb_trans; @@ -571,7 +573,9 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph transA = cb_no_trans; } } else if (cA == 1) { - lda = Ap->strides[1] / elsize; + lda = Ap->dimensions[1] > 1 + ? 
Ap->strides[1] / elsize + : Ap->dimensions[2]; if (o == cb_fortran) { if (transA == cb_no_trans) transA = cb_trans; From f7d0df8dab92375e0b67de8b990b58a1f005c8e0 Mon Sep 17 00:00:00 2001 From: Pascal Lamblin Date: Tue, 16 May 2017 20:06:13 -0400 Subject: [PATCH 328/597] formatting (tab to spaces) --- src/gpuarray_array_blas.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index 11a4b7d05d..536ff66a8d 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -254,7 +254,7 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, else { err = GpuArray_copy(©A, A, GA_F_ORDER); if (err != GA_NO_ERROR) - goto cleanup; + goto cleanup; Ap = ©A; } } @@ -264,7 +264,7 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, else { err = GpuArray_copy(©B, B, GA_F_ORDER); if (err != GA_NO_ERROR) - goto cleanup; + goto cleanup; Bp = ©B; } } @@ -388,7 +388,7 @@ int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A, else { err = GpuArray_copy(©X, X, GA_ANY_ORDER); if (err != GA_NO_ERROR) - goto cleanup; + goto cleanup; Xp = ©X; } } @@ -398,7 +398,7 @@ int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A, else { err = GpuArray_copy(©Y, Y, GA_ANY_ORDER); if (err != GA_NO_ERROR) - goto cleanup; + goto cleanup; Yp = ©Y; } } @@ -526,7 +526,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph err = GpuArray_copy(©A, A, GA_C_ORDER); cA = 1; if (err != GA_NO_ERROR) - goto cleanup; + goto cleanup; Ap = ©A; } } @@ -538,7 +538,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph err = GpuArray_copy(©B, B, GA_C_ORDER); cB = 1; if (err != GA_NO_ERROR) - goto cleanup; + goto cleanup; Bp = ©B; } } From e608e17a5e996d6be1ff35e816c28fbe403e6454 Mon Sep 17 00:00:00 2001 From: Pascal Lamblin Date: Tue, 16 May 2017 20:46:22 -0400 Subject: [PATCH 329/597] Force 
downgrade of sphinx to avoid doc build error --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index e1b24010b2..ec24483bee 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,7 +23,7 @@ before_install: - export PREFIX=$HOME/.local - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update && brew install doxygen; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export PYTHONUSERBASE=$PREFIX; fi - - pip install --user breathe sphinx sphinx_rtd_theme cython numpy 'mako>=0.7' six + - pip install --user breathe sphinx==1.5 sphinx_rtd_theme cython numpy 'mako>=0.7' six - export PATH=$PATH:$PREFIX/bin - export CPATH=$CPATH:$PREFIX/include - export LIBRARY_PATH=$LIBRARY_PATH:$PREFIX/lib From f82d5be75497d5e9b4949e9dbabece0a7d954934 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 17 May 2017 09:18:57 -0400 Subject: [PATCH 330/597] Try to fix travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index ec24483bee..8df5a13de5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,7 +23,7 @@ before_install: - export PREFIX=$HOME/.local - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update && brew install doxygen; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export PYTHONUSERBASE=$PREFIX; fi - - pip install --user breathe sphinx==1.5 sphinx_rtd_theme cython numpy 'mako>=0.7' six + - pip install --user breathe sphinx==1.5.1 sphinx_rtd_theme cython numpy 'mako>=0.7' six - export PATH=$PATH:$PREFIX/bin - export CPATH=$CPATH:$PREFIX/include - export LIBRARY_PATH=$LIBRARY_PATH:$PREFIX/lib From 1cc02f562d9c02a4f93fc3d5fe6f9138e449c923 Mon Sep 17 00:00:00 2001 From: Adam Becker Date: Thu, 18 May 2017 22:11:11 +0800 Subject: [PATCH 331/597] add overwrite checking --- pygpu/blas.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygpu/blas.pyx b/pygpu/blas.pyx index 67d4c0beb0..cd778f9a77 100644 --- a/pygpu/blas.pyx +++ b/pygpu/blas.pyx @@ -204,7 
+204,7 @@ def gemmBatch_3d(double alpha, GpuArray A, GpuArray B, if beta != 0.0: raise ValueError("C not provided and beta != 0") C = pygpu_empty(3, Cshp, A.ga.typecode, GA_ANY_ORDER, A.context, None) - else: + elif not overwrite_c: C = pygpu_copy(C, GA_ANY_ORDER) pygpu_blas_rgemmBatch_3d(transA, transB, alpha, A, B, beta, C, 0) From 2effb4b834591d663260389bb48bd2e7692112b6 Mon Sep 17 00:00:00 2001 From: Pascal Lamblin Date: Thu, 18 May 2017 19:09:20 -0400 Subject: [PATCH 332/597] Add error messages in gpuarray_array.c --- src/gpuarray_array.c | 180 ++++++++++++++++++++++++++++--------------- 1 file changed, 116 insertions(+), 64 deletions(-) diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index 069cdd3e02..e361a86240 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -16,6 +16,7 @@ #include "gpuarray/kernel.h" #include "gpuarray/util.h" +#include "util/error.h" #include "util/strb.h" #include "util/xxhash.h" @@ -40,12 +41,12 @@ static uint32_t extcopy_hash(cache_key_t k) { static int ga_extcopy(GpuArray *dst, const GpuArray *src) { struct extcopy_args a, *aa; - gpucontext *ctx = gpudata_context(dst->data); + gpucontext *ctx = GpuArray_context(dst); GpuElemwise *k = NULL; void *args[2]; - if (ctx != gpudata_context(src->data)) - return GA_INVALID_ERROR; + if (ctx != GpuArray_context(src)) + return error_set(ctx->err, GA_INVALID_ERROR, "src and dst context differ"); a.itype = src->typecode; a.otype = dst->typecode; @@ -62,11 +63,12 @@ static int ga_extcopy(GpuArray *dst, const GpuArray *src) { gargs[1].flags = GE_WRITE; k = GpuElemwise_new(ctx, "", "dst = src", 2, gargs, 0, 0); if (k == NULL) - return GA_MISC_ERROR; + return error_set(ctx->err, GA_MISC_ERROR, + "Could not instantiate GpuElemwise copy kernel"); aa = memdup(&a, sizeof(a)); if (aa == NULL) { GpuElemwise_free(k); - return GA_MEMORY_ERROR; + return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); } if (ctx->extcopy_cache == NULL) ctx->extcopy_cache = cache_twoq(4, 8, 8, 2, 
extcopy_eq, extcopy_hash, @@ -74,9 +76,11 @@ static int ga_extcopy(GpuArray *dst, const GpuArray *src) { (cache_freev_fn)GpuElemwise_free, ctx->err); if (ctx->extcopy_cache == NULL) - return GA_MISC_ERROR; + return error_set(ctx->err, GA_MISC_ERROR, + "No context cache"); if (cache_add(ctx->extcopy_cache, aa, k) != 0) - return GA_MISC_ERROR; + return error_set(ctx->err, GA_MISC_ERROR, + "Could not store GpuElemwise copy kernel in context cache"); } args[0] = (void *)src; args[1] = (void *)dst; @@ -105,14 +109,14 @@ int GpuArray_empty(GpuArray *a, gpucontext *ctx, int typecode, ord = GA_C_ORDER; if (ord != GA_C_ORDER && ord != GA_F_ORDER) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "Invalid order"); for (i = 0; i < nd; i++) { size_t d = dims[i]; /* Check for overflow */ if ((d >= MUL_NO_OVERFLOW || size >= MUL_NO_OVERFLOW) && - d > 0 && SIZE_MAX / d < size) - return GA_VALUE_ERROR; + d > 0 && SIZE_MAX / d < size) + return error_set(ctx->err, GA_XLARGE_ERROR, "Total array size greater than addressable space"); size *= d; } @@ -127,7 +131,7 @@ int GpuArray_empty(GpuArray *a, gpucontext *ctx, int typecode, a->flags = GA_BEHAVED; if (a->dimensions == NULL || a->strides == NULL) { GpuArray_clear(a); - return GA_MEMORY_ERROR; + return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); } /* Mult will not overflow since calloc succeded */ memcpy(a->dimensions, dims, sizeof(size_t)*nd); @@ -176,8 +180,9 @@ int GpuArray_zeros(GpuArray *a, gpucontext *ctx, int GpuArray_fromdata(GpuArray *a, gpudata *data, size_t offset, int typecode, unsigned int nd, const size_t *dims, const ssize_t *strides, int writeable) { + gpucontext *ctx = gpudata_context(data); if (gpuarray_get_type(typecode)->typecode != typecode) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "typecode mismatch"); assert(data != NULL); a->data = data; gpudata_retain(a->data); @@ -189,7 +194,7 @@ int GpuArray_fromdata(GpuArray *a, gpudata *data, size_t offset, int 
typecode, a->flags = (writeable ? GA_WRITEABLE : 0); if (a->dimensions == NULL || a->strides == NULL) { GpuArray_clear(a); - return GA_MEMORY_ERROR; + return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); } memcpy(a->dimensions, dims, nd*sizeof(size_t)); memcpy(a->strides, strides, nd*sizeof(ssize_t)); @@ -233,6 +238,7 @@ int GpuArray_copy_from_host(GpuArray *a, gpucontext *ctx, void *buf, } int GpuArray_view(GpuArray *v, const GpuArray *a) { + gpucontext *ctx = GpuArray_context(a); v->data = a->data; gpudata_retain(a->data); v->nd = a->nd; @@ -243,7 +249,7 @@ int GpuArray_view(GpuArray *v, const GpuArray *a) { v->strides = calloc(v->nd, sizeof(ssize_t)); if (v->dimensions == NULL || v->strides == NULL) { GpuArray_clear(v); - return GA_MEMORY_ERROR; + return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); } memcpy(v->dimensions, a->dimensions, v->nd*sizeof(size_t)); memcpy(v->strides, a->strides, v->nd*sizeof(ssize_t)); @@ -256,6 +262,7 @@ int GpuArray_sync(GpuArray *a) { int GpuArray_index_inplace(GpuArray *a, const ssize_t *starts, const ssize_t *stops, const ssize_t *steps) { + gpucontext *ctx = GpuArray_context(a); unsigned int i, new_i; unsigned int new_nd = a->nd; size_t *newdims; @@ -263,7 +270,7 @@ int GpuArray_index_inplace(GpuArray *a, const ssize_t *starts, size_t new_offset = a->offset; if ((starts == NULL) || (stops == NULL) || (steps == NULL)) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "Invalid slice (contains NULL)"); for (i = 0; i < a->nd; i++) { if (steps[i] == 0) new_nd -= 1; @@ -273,31 +280,40 @@ int GpuArray_index_inplace(GpuArray *a, const ssize_t *starts, if (newdims == NULL || newstrs == NULL) { free(newdims); free(newstrs); - return GA_MEMORY_ERROR; + return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); } new_i = 0; for (i = 0; i < a->nd; i++) { if (starts[i] < -1 || (starts[i] > 0 && - (size_t)starts[i] > a->dimensions[i])) { + (size_t)starts[i] > a->dimensions[i])) { free(newdims); 
free(newstrs); - return GA_VALUE_ERROR; + return error_fmt(ctx->err, GA_VALUE_ERROR, + "Invalid slice value: slice(%lld, %lld, %lld) when " + "indexing array on dimension %u of length %lld", + starts[i], stops[i], steps[i], i, a->dimensions[i]); } if (steps[i] == 0 && - (starts[i] == -1 || (size_t)starts[i] >= a->dimensions[i])) { + (starts[i] == -1 || (size_t)starts[i] >= a->dimensions[i])) { free(newdims); free(newstrs); - return GA_VALUE_ERROR; + return error_fmt(ctx->err, GA_VALUE_ERROR, + "Invalid slice value: slice(%lld, %lld, %lld) when " + "indexing array on dimension %u of length %lld", + starts[i], stops[i], steps[i], i, a->dimensions[i]); } new_offset += starts[i] * a->strides[i]; if (steps[i] != 0) { if ((stops[i] < -1 || (stops[i] > 0 && - (size_t)stops[i] > a->dimensions[i])) || - (stops[i]-starts[i])/steps[i] < 0) { + (size_t)stops[i] > a->dimensions[i])) || + (stops[i]-starts[i])/steps[i] < 0) { free(newdims); free(newstrs); - return GA_VALUE_ERROR; + return error_fmt(ctx->err, GA_VALUE_ERROR, + "Invalid slice value: slice(%lld, %lld, %lld) when " + "indexing array on dimension %u of length %lld", + starts[i], stops[i], steps[i], i, a->dimensions[i]); } newstrs[new_i] = steps[i] * a->strides[i]; newdims[new_i] = (stops[i]-starts[i]+steps[i]- @@ -341,7 +357,7 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str, atypes = calloc(nargs, sizeof(int)); if (atypes == NULL) - return GA_MEMORY_ERROR; + return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); if (addr32) { sz = "ga_uint"; @@ -413,7 +429,7 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str, " }\n" "}\n"); if (strb_error(&sb)) { - res = GA_MEMORY_ERROR; + res = error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); goto bail; } flags |= gpuarray_type_flags(a->typecode, v->typecode, GA_BYTE, -1); @@ -427,6 +443,7 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str, int GpuArray_take1(GpuArray *a, const GpuArray *v, 
const GpuArray *i, int check_error) { + gpucontext *ctx = GpuArray_context(a); size_t n[2], ls[2] = {0, 0}, gs[2] = {0, 0}; size_t pl; gpudata *errbuf; @@ -440,27 +457,35 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i, int addr32 = 0; if (!GpuArray_ISWRITEABLE(a)) - return GA_INVALID_ERROR; + return error_set(ctx->err, GA_INVALID_ERROR, "Destination array (a) not writeable"); if (!GpuArray_ISALIGNED(a) || !GpuArray_ISALIGNED(v) || !GpuArray_ISALIGNED(i)) - return GA_UNALIGNED_ERROR; + return error_fmt(ctx->err, GA_UNALIGNED_ERROR, + "Not all arrays are aligned: a (%d), b (%d), i (%d)", + GpuArray_ISALIGNED(a), GpuArray_ISALIGNED(v), GpuArray_ISALIGNED(i)); /* a and i have to be C contiguous */ - if (!GpuArray_IS_C_CONTIGUOUS(a) || !GpuArray_IS_C_CONTIGUOUS(i)) - return GA_INVALID_ERROR; + if (!GpuArray_IS_C_CONTIGUOUS(a)) + return error_set(ctx->err, GA_INVALID_ERROR, "Destination array (a) not C-contiguous"); + if (!GpuArray_IS_C_CONTIGUOUS(i)) + return error_set(ctx->err, GA_INVALID_ERROR, "Index array (i) not C-contiguous"); /* Check that the dimensions match namely a[0] == i[0] and a[>0] == v[>0] */ if (v->nd == 0 || a->nd == 0 || i->nd != 1 || a->nd != v->nd || a->dimensions[0] != i->dimensions[0]) - return GA_INVALID_ERROR; + return error_fmt(ctx->err, GA_INVALID_ERROR, "Dimension mismatch. " + "v->nd = %llu, a->nd = %llu, i->nd = %llu, a->dimensions[0] = %llu, i->dimensions[0] = %llu", + v->nd, a->nd, i->nd, a->nd > 0 ? a->dimensions[0] : 0, i->nd > 1 ? i->dimensions[0] : 1); n[0] = i->dimensions[0]; n[1] = 1; for (j = 1; j < v->nd; j++) { if (a->dimensions[j] != v->dimensions[j]) - return GA_INVALID_ERROR; + return error_fmt(ctx->err, GA_INVALID_ERROR, "Dimension mismatch. 
" + "a->dimensions[%llu] = %llu, i->dimensions[%llu] = %llu", + j, a->dimensions[j], j, i->dimensions[j]); n[1] *= v->dimensions[j]; } @@ -472,7 +497,7 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i, if (err != GA_NO_ERROR) return err; - err = gen_take1_kernel(&k, GpuArray_context(a), + err = gen_take1_kernel(&k, ctx, #if DEBUG &errstr, #else @@ -539,6 +564,7 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i, } int GpuArray_setarray(GpuArray *a, const GpuArray *v) { + gpucontext *ctx = GpuArray_context(a); GpuArray tv; size_t sz; ssize_t *strs; @@ -547,7 +573,8 @@ int GpuArray_setarray(GpuArray *a, const GpuArray *v) { int simple_move = 1; if (a->nd < v->nd) - return GA_VALUE_ERROR; + return error_fmt(ctx->err, GA_VALUE_ERROR, "Dimension error. " + "a->nd = %llu, v->nd = %llu", a->nd, v->nd); if (!GpuArray_ISWRITEABLE(a)) return GA_VALUE_ERROR; @@ -559,9 +586,11 @@ int GpuArray_setarray(GpuArray *a, const GpuArray *v) { for (i = 0; i < v->nd; i++) { if (v->dimensions[i] != a->dimensions[i+off]) { if (v->dimensions[i] != 1) - return GA_VALUE_ERROR; + return error_fmt(ctx->err, GA_VALUE_ERROR, "Shape error. 
" + "v->dimensions[%u] = %llu, a->dimesions[%u + %u] = %llu", + i, v->dimensions[i], i, off, a->dimensions[i + off]); else - simple_move = 0; + simple_move = 0; } } @@ -576,7 +605,7 @@ int GpuArray_setarray(GpuArray *a, const GpuArray *v) { strs = calloc(a->nd, sizeof(ssize_t)); if (strs == NULL) - return GA_MEMORY_ERROR; + return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); for (i = off; i < a->nd; i++) { if (v->dimensions[i-off] == a->dimensions[i]) { @@ -612,6 +641,7 @@ int GpuArray_reshape(GpuArray *res, const GpuArray *a, unsigned int nd, int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd, const size_t *newdims, ga_order ord) { + gpucontext *ctx = GpuArray_context(a); ssize_t *newstrides; size_t *tmpdims; size_t np; @@ -637,8 +667,8 @@ int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd, size_t d = newdims[i]; /* Check for overflow */ if ((d >= MUL_NO_OVERFLOW || newsize >= MUL_NO_OVERFLOW) && - d > 0 && SIZE_MAX / d < newsize) - return GA_INVALID_ERROR; + d > 0 && SIZE_MAX / d < newsize) + return error_set(ctx->err, GA_XLARGE_ERROR, "Output array size greater than addressable space"); newsize *= d; } @@ -653,7 +683,7 @@ int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd, newstrides = calloc(nd, sizeof(ssize_t)); if (newstrides == NULL) - return GA_MEMORY_ERROR; + return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); while (ni < nd && oi < a->nd) { np = newdims[ni]; @@ -707,7 +737,7 @@ int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd, Can't do the same with newdims (which is a parameter). 
*/ tmpdims = calloc(nd, sizeof(size_t)); if (tmpdims == NULL) { - return GA_MEMORY_ERROR; + return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); } memcpy(tmpdims, newdims, nd*sizeof(size_t)); a->nd = nd; @@ -719,7 +749,7 @@ int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd, goto fix_flags; need_copy: free(newstrides); - return GA_COPY_ERROR; + return error_set(ctx->err, GA_COPY_ERROR, "Copy is needed but disallowed by parameters"); do_final_copy: tmpdims = calloc(nd, sizeof(size_t)); @@ -727,7 +757,7 @@ int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd, if (tmpdims == NULL || newstrides == NULL) { free(tmpdims); free(newstrides); - return GA_MEMORY_ERROR; + return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); } memcpy(tmpdims, newdims, nd*sizeof(size_t)); if (nd > 0) { @@ -766,6 +796,7 @@ int GpuArray_transpose(GpuArray *res, const GpuArray *a, } int GpuArray_transpose_inplace(GpuArray *a, const unsigned int *new_axes) { + gpucontext *ctx = GpuArray_context(a); size_t *newdims; ssize_t *newstrs; unsigned int i; @@ -777,7 +808,7 @@ int GpuArray_transpose_inplace(GpuArray *a, const unsigned int *new_axes) { if (newdims == NULL || newstrs == NULL) { free(newdims); free(newstrs); - return GA_MEMORY_ERROR; + return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); } for (i = 0; i < a->nd; i++) { @@ -790,7 +821,9 @@ int GpuArray_transpose_inplace(GpuArray *a, const unsigned int *new_axes) { if (j == new_axes[k]) { free(newdims); free(newstrs); - return GA_VALUE_ERROR; + return error_fmt(ctx->err, GA_VALUE_ERROR, + "Repeated axes in transpose: new_axes[%u] == new_axes[%u] == %u", + i, k, j); } } newdims[i] = a->dimensions[j]; @@ -827,17 +860,24 @@ gpucontext *GpuArray_context(const GpuArray *a) { } int GpuArray_move(GpuArray *dst, const GpuArray *src) { + gpucontext *ctx = GpuArray_context(dst); size_t sz; unsigned int i; if (!GpuArray_ISWRITEABLE(dst)) - return GA_VALUE_ERROR; - if (!GpuArray_ISALIGNED(src) || 
!GpuArray_ISALIGNED(dst)) - return GA_UNALIGNED_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "Destination array (dst) not writeable"); + if (!GpuArray_ISALIGNED(src)) + return error_set(ctx->err, GA_UNALIGNED_ERROR, "Source array (src) not aligned"); + if (!GpuArray_ISALIGNED(dst)) + return error_set(ctx->err, GA_UNALIGNED_ERROR, "Destination array (dst) not aligned"); if (src->nd != dst->nd) - return GA_VALUE_ERROR; + return error_fmt(ctx->err, GA_VALUE_ERROR, + "Dimension mismatch. src->nd = %llu, dst->nd = %llu", + src->nd, dst->nd); for (i = 0; i < src->nd; i++) { if (src->dimensions[i] != dst->dimensions[i]) - return GA_VALUE_ERROR; + return error_fmt(ctx->err, GA_VALUE_ERROR, + "Dimension mismatch. src->dimensions[%u] = %llu, dst->dimensions[%u] = %llu", + i, src->dimensions[i], i, dst->dimensions[i]); } if (!GpuArray_ISONESEGMENT(dst) || !GpuArray_ISONESEGMENT(src) || GpuArray_ISFORTRAN(dst) != GpuArray_ISFORTRAN(src) || @@ -850,22 +890,25 @@ int GpuArray_move(GpuArray *dst, const GpuArray *src) { } int GpuArray_write(GpuArray *dst, const void *src, size_t src_sz) { + gpucontext *ctx = GpuArray_context(dst); if (!GpuArray_ISWRITEABLE(dst)) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "Destination array (dst) not writeable"); if (!GpuArray_ISONESEGMENT(dst)) - return GA_UNSUPPORTED_ERROR; + return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Destination array (dst) not one segment"); return gpudata_write(dst->data, dst->offset, src, src_sz); } int GpuArray_read(void *dst, size_t dst_sz, const GpuArray *src) { + gpucontext *ctx = GpuArray_context(src); if (!GpuArray_ISONESEGMENT(src)) - return GA_UNSUPPORTED_ERROR; + return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Array (src) not one segment"); return gpudata_read(dst, src->data, src->offset, dst_sz); } int GpuArray_memset(GpuArray *a, int data) { + gpucontext *ctx = GpuArray_context(a); if (!GpuArray_ISONESEGMENT(a)) - return GA_UNSUPPORTED_ERROR; + return error_set(ctx->err, 
GA_UNSUPPORTED_ERROR, "Array (a) not one segment"); return gpudata_memset(a->data, a->offset, data); } @@ -881,16 +924,17 @@ int GpuArray_copy(GpuArray *res, const GpuArray *a, ga_order order) { } int GpuArray_transfer(GpuArray *res, const GpuArray *a) { + gpucontext *ctx = GpuArray_context(res); size_t sz; unsigned int i; if (!GpuArray_ISONESEGMENT(res)) - return GA_UNSUPPORTED_ERROR; + return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Array (res) not one segment"); if (!GpuArray_ISONESEGMENT(a)) - return GA_UNSUPPORTED_ERROR; + return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Array (a) not one segment"); if (res->typecode != a->typecode) - return GA_UNSUPPORTED_ERROR; + return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "typecode mismatch"); sz = gpuarray_get_elsize(a->typecode); for (i = 0; i < a->nd; i++) sz *= a->dimensions[i]; @@ -900,6 +944,7 @@ int GpuArray_transfer(GpuArray *res, const GpuArray *a) { int GpuArray_split(GpuArray **rs, const GpuArray *a, size_t n, size_t *p, unsigned int axis) { + gpucontext *ctx = GpuArray_context(a); size_t i; ssize_t *starts, *stops, *steps; int err; @@ -912,7 +957,7 @@ int GpuArray_split(GpuArray **rs, const GpuArray *a, size_t n, size_t *p, free(starts); free(stops); free(steps); - return GA_MEMORY_ERROR; + return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); } for (i = 0; i < a->nd; i++) { @@ -949,6 +994,7 @@ int GpuArray_split(GpuArray **rs, const GpuArray *a, size_t n, size_t *p, int GpuArray_concatenate(GpuArray *r, const GpuArray **as, size_t n, unsigned int axis, int restype) { + gpucontext *ctx = GpuArray_context(r); size_t *dims, *res_dims; size_t i, res_off; unsigned int p; @@ -956,33 +1002,38 @@ int GpuArray_concatenate(GpuArray *r, const GpuArray **as, size_t n, int err = GA_NO_ERROR; if (axis >= as[0]->nd) - return GA_VALUE_ERROR; + return error_fmt(ctx->err, GA_VALUE_ERROR, "Invalid axis. 
" + "axis = %u, as[0]->nd = %llu", axis, as[0]->nd); dims = calloc(as[0]->nd, sizeof(size_t)); if (dims == NULL) - return GA_MEMORY_ERROR; + return error_fmt(ctx->err, GA_MEMORY_ERROR, "Out of memory"); for (p = 0; p < as[0]->nd; p++) { dims[p] = as[0]->dimensions[p]; } if (!GpuArray_ISALIGNED(as[0])) { - err = GA_UNALIGNED_ERROR; + err = error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned array (as[0])."); goto afterloop; } for (i = 1; i < n; i++) { if (!GpuArray_ISALIGNED(as[i])) { - err = GA_UNALIGNED_ERROR; + err = error_fmt(ctx->err, GA_UNALIGNED_ERROR, "Unaligned array (as[%llu]).", i); goto afterloop; } if (as[i]->nd != as[0]->nd) { - err = GA_VALUE_ERROR; + err = error_fmt(ctx->err, GA_VALUE_ERROR, "Shape mismatch. " + "as[%llu]->nd = %llu, as[0]->nd = %llu", + i, as[i]->nd, as[0]->nd); goto afterloop; } for (p = 0; p < as[0]->nd; p++) { if (p != axis && dims[p] != as[i]->dimensions[p]) { - err = GA_VALUE_ERROR; + err = error_fmt(ctx->err, GA_VALUE_ERROR, "Dimension mismatch. " + "as[%llu]->dimensions[%u] = %llu, as[0]->dimensions[%u] = %llu", + i, p, as[i]->dimensions[p], p, dims[p]); goto afterloop; } else if (p == axis) { dims[p] += as[i]->dimensions[p]; @@ -1067,6 +1118,7 @@ void GpuArray_fprintf(FILE *fd, const GpuArray *a) { } int GpuArray_fdump(FILE *fd, const GpuArray *a) { + gpucontext *ctx = GpuArray_context(a); char *buf, *p; size_t s = GpuArray_ITEMSIZE(a); size_t k; @@ -1078,7 +1130,7 @@ int GpuArray_fdump(FILE *fd, const GpuArray *a) { buf = malloc(s); if (buf == NULL) - return GA_MEMORY_ERROR; + return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); err = GpuArray_read(buf, s, a); if (err != GA_NO_ERROR) { From 90c704c654f824cbd7f3c0660041ed5210abd775 Mon Sep 17 00:00:00 2001 From: Pascal Lamblin Date: Fri, 19 May 2017 16:38:44 -0400 Subject: [PATCH 333/597] Fixes after code review and segfault --- src/gpuarray_array.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/gpuarray_array.c 
b/src/gpuarray_array.c index e361a86240..c5429863ee 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -76,8 +76,7 @@ static int ga_extcopy(GpuArray *dst, const GpuArray *src) { (cache_freev_fn)GpuElemwise_free, ctx->err); if (ctx->extcopy_cache == NULL) - return error_set(ctx->err, GA_MISC_ERROR, - "No context cache"); + return ctx->err->code; if (cache_add(ctx->extcopy_cache, aa, k) != 0) return error_set(ctx->err, GA_MISC_ERROR, "Could not store GpuElemwise copy kernel in context cache"); @@ -472,11 +471,14 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i, return error_set(ctx->err, GA_INVALID_ERROR, "Index array (i) not C-contiguous"); /* Check that the dimensions match namely a[0] == i[0] and a[>0] == v[>0] */ - if (v->nd == 0 || a->nd == 0 || i->nd != 1 || a->nd != v->nd || - a->dimensions[0] != i->dimensions[0]) + if (v->nd == 0 || a->nd == 0 || i->nd != 1 || a->nd != v->nd) + return error_fmt(ctx->err, GA_INVALID_ERROR, "Dimension mismatch. " + "v->nd = %llu, a->nd = %llu, i->nd = %llu", + v->nd, a->nd, i->nd); + if (a->dimensions[0] != i->dimensions[0]) return error_fmt(ctx->err, GA_INVALID_ERROR, "Dimension mismatch. " - "v->nd = %llu, a->nd = %llu, i->nd = %llu, a->dimensions[0] = %llu, i->dimensions[0] = %llu", - v->nd, a->nd, i->nd, a->nd > 0 ? a->dimensions[0] : 0, i->nd > 1 ? 
i->dimensions[0] : 1); + "a->dimensions[0] = %llu, i->dimensions[0] = %llu", + a->dimensions[0], i->dimensions[0]); n[0] = i->dimensions[0]; n[1] = 1; @@ -994,7 +996,7 @@ int GpuArray_split(GpuArray **rs, const GpuArray *a, size_t n, size_t *p, int GpuArray_concatenate(GpuArray *r, const GpuArray **as, size_t n, unsigned int axis, int restype) { - gpucontext *ctx = GpuArray_context(r); + gpucontext *ctx = GpuArray_context(as[0]); size_t *dims, *res_dims; size_t i, res_off; unsigned int p; From 3ef30405b9e0f8182273aee5ac5539d0a6d2ef79 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 23 May 2017 13:17:28 -0400 Subject: [PATCH 334/597] Fix gs length check in python. --- pygpu/gpuarray.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index d8d0215565..34e39da107 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -2455,7 +2455,7 @@ cdef class GpuKernel: raise ValueError, "nd mismatch for gs (int)" gs[0] = py_gs elif isinstance(py_gs, (list, tuple)): - if len(py_gs) < 3: + if len(py_gs) > 3: raise ValueError, "gs is not of length 3 or less" if len(py_ls) != nd: raise ValueError, "nd mismatch for gs (tuple)" From 4ae597172bfc2f8a125d286fe5d2bacfe7d16230 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 23 May 2017 13:17:54 -0400 Subject: [PATCH 335/597] Export useful helpers from elemwise. --- pygpu/elemwise.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pygpu/elemwise.py b/pygpu/elemwise.py index 348ee3fdd4..ab6fd55d94 100644 --- a/pygpu/elemwise.py +++ b/pygpu/elemwise.py @@ -4,7 +4,8 @@ from . 
import gpuarray from ._elemwise import GpuElemwise, arg -__all__ = ['GpuElemwise', 'elemwise1', 'elemwise2', 'ielemwise2', 'compare'] +__all__ = ['GpuElemwise', 'arg', 'as_argument', + 'elemwise1', 'elemwise2', 'ielemwise2', 'compare'] def _dtype(o): From fa7f5a76d59eb4bba3e1a241400c0356a624062d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 23 May 2017 13:23:41 -0400 Subject: [PATCH 336/597] Avoid leaking the memory for the name. --- pygpu/_elemwise.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pygpu/_elemwise.pyx b/pygpu/_elemwise.pyx index 586fb50bc9..1875503c0c 100644 --- a/pygpu/_elemwise.pyx +++ b/pygpu/_elemwise.pyx @@ -49,6 +49,9 @@ cdef class arg: memset(&self.a, 0, sizeof(gpuelemwise_arg)) def __init__(self, name, type, read=False, write=False, scalar=False): + # Make sure to clear previous storage + # __init__ may be called more than once + free(self.a.name) self.a.name = strdup(to_bytes(name)) if self.a.name is NULL: raise MemoryError @@ -63,6 +66,9 @@ cdef class arg: if self.a.flags == 0: raise ValueError('no flags specified for arg %s' % (name,)) + def __dealloc__(self): + free(self.a.name) + property name: def __get__(self): return self.a.name.decode('ascii') From 54c196b86cbc48fc5604a8fb8ae60266ba73bf96 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 23 May 2017 16:58:43 -0400 Subject: [PATCH 337/597] Remove erroneous mapping. 
--- src/gpuarray_collectives_cuda_nccl.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/gpuarray_collectives_cuda_nccl.c b/src/gpuarray_collectives_cuda_nccl.c index e335469080..a90e05dd16 100644 --- a/src/gpuarray_collectives_cuda_nccl.c +++ b/src/gpuarray_collectives_cuda_nccl.c @@ -180,7 +180,6 @@ static inline ncclDataType_t convert_data_type(int typecode) { case GA_LONG: return ncclInt64; case GA_ULONG: return ncclUint64; case GA_HALF: return ncclHalf; - case GA_FLOAT16: return ncclHalf; } return nccl_NUM_TYPES; } From 4ac185c4af9524320628e299a9c6e448fc73b23a Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 24 May 2017 15:21:42 -0400 Subject: [PATCH 338/597] Silcene spurious compiler warnings. --- src/gpuarray_buffer_opencl.c | 2 +- src/gpuarray_collectives_cuda_nccl.c | 21 +++++++++++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 812d796920..58c194432f 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -58,7 +58,7 @@ static int cl_get_device_count(unsigned int platform, unsigned int* devcount) { cl_platform_id p; cl_uint numd; cl_int err; - unsigned int platcount; + unsigned int platcount = 0; /* This will load the library if needed */ GA_CHECK(cl_get_platform_count(&platcount)); diff --git a/src/gpuarray_collectives_cuda_nccl.c b/src/gpuarray_collectives_cuda_nccl.c index a90e05dd16..99cd5f7e38 100644 --- a/src/gpuarray_collectives_cuda_nccl.c +++ b/src/gpuarray_collectives_cuda_nccl.c @@ -236,8 +236,9 @@ static inline int check_restrictions(gpudata *src, size_t offsrc, static int reduce(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, int opcode, int root, gpucomm *comm) { - ncclRedOp_t op; - ncclDataType_t datatype; + // need dummy init so that compiler shuts up + ncclRedOp_t op = nccl_NUM_OPS; + ncclDataType_t datatype = nccl_NUM_TYPES; gpudata *dst = NULL; int rank = 0; 
cuda_context *ctx; @@ -285,8 +286,9 @@ static int reduce(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, static int all_reduce(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, int opcode, gpucomm *comm) { - ncclRedOp_t op; - ncclDataType_t datatype; + // need dummy init so that compiler shuts up + ncclRedOp_t op = nccl_NUM_OPS; + ncclDataType_t datatype = nccl_NUM_TYPES; cuda_context *ctx; ASSERT_BUF(src); @@ -322,8 +324,9 @@ static int all_reduce(gpudata *src, size_t offsrc, gpudata *dest, static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, int opcode, gpucomm *comm) { - ncclRedOp_t op; - ncclDataType_t datatype; + // need dummy init so that compiler shuts up + ncclRedOp_t op = nccl_NUM_OPS; + ncclDataType_t datatype = nccl_NUM_TYPES; int ndev = 0; size_t resc_size; cuda_context *ctx; @@ -367,7 +370,8 @@ static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest, */ static int broadcast(gpudata *array, size_t offset, size_t count, int typecode, int root, gpucomm *comm) { - ncclDataType_t datatype; + // need dummy init so that compiler shuts up + ncclDataType_t datatype = nccl_NUM_TYPES; int rank = 0; cuda_context *ctx; @@ -406,7 +410,8 @@ static int broadcast(gpudata *array, size_t offset, size_t count, int typecode, static int all_gather(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, gpucomm *comm) { - ncclDataType_t datatype; + // need dummy init so that compiler shuts up + ncclDataType_t datatype = nccl_NUM_TYPES; int ndev = 0; size_t resc_size; cuda_context *ctx; From 928042dbd916624e1add8cad952b8f159f18706f Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Mon, 29 May 2017 21:25:07 -0400 Subject: [PATCH 339/597] Silence signedness comparison warning --- src/gpuarray_array_blas.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c 
index 536ff66a8d..86a978935b 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -445,7 +445,7 @@ int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A, } static inline int is_last_2d_contiguous(const GpuArray *a) { - size_t size = GpuArray_ITEMSIZE(a); + ssize_t size = GpuArray_ITEMSIZE(a); if (GpuArray_IS_C_CONTIGUOUS(a)) return 1; // C contiguous From ed300e303ab067d9e492874b04625bd67a33607c Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Mon, 29 May 2017 21:39:47 -0400 Subject: [PATCH 340/597] Silence -Wtautological-compare. --- src/gpuarray_array_blas.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index 86a978935b..6607792ecf 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -217,7 +217,7 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, return GA_INVALID_ERROR; if (A->nd != 2 || B->nd != 2 || C->nd != 2 || - A->typecode != A->typecode || B->typecode != A->typecode || + B->typecode != A->typecode || C->typecode != A->typecode) return GA_VALUE_ERROR; @@ -367,7 +367,7 @@ int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A, return GA_INVALID_ERROR; if (X->nd != 1 || Y->nd != 1 || A->nd != 2 || - X->typecode != X->typecode || Y->typecode != X->typecode || + Y->typecode != X->typecode || A->typecode != X->typecode) return GA_VALUE_ERROR; @@ -483,7 +483,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph return GA_INVALID_ERROR; if (A->nd != 3 || B->nd != 3 || C->nd != 3 || - A->typecode != A->typecode || B->typecode != A->typecode || + B->typecode != A->typecode || C->typecode != A->typecode) return GA_VALUE_ERROR; From 73f3822558561ef7ebd096b6bdaebb9e50b8e666 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 31 May 2017 13:06:36 -0400 Subject: [PATCH 341/597] Add offset to arrays in DEBUG to help catch errors. 
--- src/gpuarray_array.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index c5429863ee..5b0caa4cbc 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -119,10 +119,20 @@ int GpuArray_empty(GpuArray *a, gpucontext *ctx, int typecode, size *= d; } + /* We add a offset of 64 to all arrays in DEBUG to help catch errors. */ +#ifdef DEBUG + assert(SIZE_MAX - size > 64); + size += 64; +#endif + a->data = gpudata_alloc(ctx, size, NULL, 0, &res); if (a->data == NULL) return res; a->nd = nd; +#ifdef DEBUG + a->offset = 64; +#else a->offset = 0; +#endif a->typecode = typecode; a->dimensions = calloc(nd, sizeof(size_t)); a->strides = calloc(nd, sizeof(ssize_t)); From f863ab2418092971622cac3f121ddf87e1b1b553 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 5 Jun 2017 14:17:14 -0400 Subject: [PATCH 342/597] Emulate pycuda/pyopencl more closely for .gpudata --- pygpu/gpuarray.pyx | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 34e39da107..ebd14f3298 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -2229,13 +2229,40 @@ cdef class GpuArray: def __get__(self): return self.ga.offset + property data: + """Return a pointer to the raw OpenCL buffer object. + + This will fail for arrays that have an offset. + """ + def __get__(self): + if self.context.kind != b"opencl": + raise TypeError("This is for OpenCL arrays.") + if self.offset != 0: + raise ValueError("This array has an offset.") + # This wizadry grabs the actual backend pointer since it's + # guarenteed to be the first element of the gpudata + # structure. + return ((self.ga.data)[0]) + + property base_data: + "Return a pointer to the backing OpenCL object." 
+ def __get__(self): + if self.context.kind != b"opencl": + raise TypeError("This is for OpenCL arrays.") + # This wizadry grabs the actual backend pointer since it's + # guarenteed to be the first element of the gpudata + # structure. + return ((self.ga.data)[0]) + property gpudata: "Return a pointer to the raw backend object." def __get__(self): + if self.context.kind != b"cuda": + raise TypeError("This is for CUDA arrays.") # This wizadry grabs the actual backend pointer since it's # guarenteed to be the first element of the gpudata # structure. - return ((self.ga.data)[0]) + return ((self.ga.data)[0]) + self.offset def __str__(self): return str(numpy.asarray(self)) From c068f749ac0b6485165860a4c70a0aa879e87179 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 5 Jun 2017 15:59:41 -0400 Subject: [PATCH 343/597] Fix offset in triu/tril. --- pygpu/basic.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pygpu/basic.py b/pygpu/basic.py index c66ea38f86..fe8a7bb1ea 100644 --- a/pygpu/basic.py +++ b/pygpu/basic.py @@ -1,10 +1,11 @@ from string import Template -from .gpuarray import GpuArray, GpuKernel +from .gpuarray import GpuArray, GpuKernel, SIZE def _generate_kernel(ctx, cols, upper=True): tmpl = Template(""" - KERNEL void extract_tri(GLOBAL_MEM ga_float *a, ga_uint N) { + KERNEL void extract_tri(GLOBAL_MEM ga_float *a, ga_size a_off, ga_uint N) { + a = (GLOBAL_MEM ga_float *)(((char *)a) + a_off); unsigned int idx = GID_1 * LDIM_0 * GDIM_0 + GID_0 * LDIM_0 + LID_0; unsigned int ix = idx/${cols}; @@ -20,7 +21,7 @@ def _generate_kernel(ctx, cols, upper=True): else: le = '<' src = tmpl.substitute(cols=cols, le=le) - spec = [GpuArray, 'uint32'] + spec = [GpuArray, SIZE, 'uint32'] k = GpuKernel(src, "extract_tri", spec, context=ctx) return k @@ -40,7 +41,7 @@ def triu(A, inplace=True): upper = True cols = A.shape[1] k = _generate_kernel(A.context, cols, upper) - k(A, A.shape[0] * A.shape[1], n=A.shape[0] * A.shape[1]) + k(A, 
A.offset, A.shape[0] * A.shape[1], n=A.shape[0] * A.shape[1]) return A @@ -59,5 +60,5 @@ def tril(A, inplace=True): upper = False cols = A.shape[1] k = _generate_kernel(A.context, cols, upper) - k(A, A.shape[0] * A.shape[1], n=A.shape[0] * A.shape[1]) + k(A, A.offset, A.shape[0] * A.shape[1], n=A.shape[0] * A.shape[1]) return A From 1189aa6f159f7070352813c7415dfdc656b24f02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Schl=C3=BCter?= Date: Tue, 6 Jun 2017 19:41:19 +0200 Subject: [PATCH 344/597] Use blocking_sync instead of yield when configured for multi-threading --- src/gpuarray_buffer_cuda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 152fc1d3a1..48112dd88c 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -501,7 +501,7 @@ static cuda_context *do_init(CUdevice dev, int flags, error *e) { if (flags & GA_CTX_SINGLE_THREAD) fl = CU_CTX_SCHED_SPIN; if (flags & GA_CTX_MULTI_THREAD) - fl = CU_CTX_SCHED_YIELD; + fl = CU_CTX_SCHED_BLOCKING_SYNC; err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev); CHKFAIL(e, "cuDeviceGetAttribute", NULL); if (i != 1) { From f501f1ee8d95df83f118c90f9f316534a7df6433 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 6 Jun 2017 15:30:48 -0400 Subject: [PATCH 345/597] Make sure all error have a message. 
--- src/gpuarray_array.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index c5429863ee..127f5e1292 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -63,12 +63,11 @@ static int ga_extcopy(GpuArray *dst, const GpuArray *src) { gargs[1].flags = GE_WRITE; k = GpuElemwise_new(ctx, "", "dst = src", 2, gargs, 0, 0); if (k == NULL) - return error_set(ctx->err, GA_MISC_ERROR, - "Could not instantiate GpuElemwise copy kernel"); + return ctx->err->code; aa = memdup(&a, sizeof(a)); if (aa == NULL) { GpuElemwise_free(k); - return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); + return error_sys(ctx->err, "memdup"); } if (ctx->extcopy_cache == NULL) ctx->extcopy_cache = cache_twoq(4, 8, 8, 2, extcopy_eq, extcopy_hash, @@ -120,7 +119,7 @@ int GpuArray_empty(GpuArray *a, gpucontext *ctx, int typecode, } a->data = gpudata_alloc(ctx, size, NULL, 0, &res); - if (a->data == NULL) return res; + if (a->data == NULL) return ctx->err->code; a->nd = nd; a->offset = 0; a->typecode = typecode; @@ -130,7 +129,7 @@ int GpuArray_empty(GpuArray *a, gpucontext *ctx, int typecode, a->flags = GA_BEHAVED; if (a->dimensions == NULL || a->strides == NULL) { GpuArray_clear(a); - return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); + return error_sys(ctx->err, "calloc"); } /* Mult will not overflow since calloc succeded */ memcpy(a->dimensions, dims, sizeof(size_t)*nd); @@ -279,7 +278,7 @@ int GpuArray_index_inplace(GpuArray *a, const ssize_t *starts, if (newdims == NULL || newstrs == NULL) { free(newdims); free(newstrs); - return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); + return error_sys(ctx->err, "calloc"); } new_i = 0; @@ -579,9 +578,9 @@ int GpuArray_setarray(GpuArray *a, const GpuArray *v) { "a->nd = %llu, v->nd = %llu", a->nd, v->nd); if (!GpuArray_ISWRITEABLE(a)) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_INVALID_ERROR, "Destination array not 
writable"); if (!GpuArray_ISALIGNED(v) || !GpuArray_ISALIGNED(a)) - return GA_UNALIGNED_ERROR; + return error_set(ctx->err, GA_UNALIGNED_ERROR, "One of the inputs is unaligned"); off = a->nd - v->nd; @@ -674,7 +673,7 @@ int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd, newsize *= d; } - if (newsize != oldsize) return GA_INVALID_ERROR; + if (newsize != oldsize) return error_set(ctx->err, GA_INVALID_ERROR, "New shope differs in total size"); /* If the source and desired layouts are the same, then just copy strides and dimensions */ @@ -685,7 +684,7 @@ int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd, newstrides = calloc(nd, sizeof(ssize_t)); if (newstrides == NULL) - return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); + return error_sys(ctx->err, "calloc"); while (ni < nd && oi < a->nd) { np = newdims[ni]; @@ -739,7 +738,7 @@ int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd, Can't do the same with newdims (which is a parameter). */ tmpdims = calloc(nd, sizeof(size_t)); if (tmpdims == NULL) { - return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); + return error_sys(ctx->err, "calloc"); } memcpy(tmpdims, newdims, nd*sizeof(size_t)); a->nd = nd; @@ -759,7 +758,7 @@ int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd, if (tmpdims == NULL || newstrides == NULL) { free(tmpdims); free(newstrides); - return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); + return error_sys(ctx->err, "calloc"); } memcpy(tmpdims, newdims, nd*sizeof(size_t)); if (nd > 0) { @@ -959,7 +958,7 @@ int GpuArray_split(GpuArray **rs, const GpuArray *a, size_t n, size_t *p, free(starts); free(stops); free(steps); - return error_set(ctx->err, GA_MEMORY_ERROR, "Out of memory"); + return error_sys(ctx->err, "calloc"); } for (i = 0; i < a->nd; i++) { @@ -1160,7 +1159,7 @@ int GpuArray_fdump(FILE *fd, const GpuArray *a) { default: free(buf); fprintf(fd, "\n", a->typecode); - return GA_UNSUPPORTED_ERROR; + return error_set(ctx->err, 
GA_UNSUPPORTED_ERROR, "Unsupported data type for dump"); } s -= gpuarray_get_elsize(a->typecode); p += gpuarray_get_elsize(a->typecode); From 6ba0ef58f41bbb5605ccbbfd6aab0fe8f3bb01d2 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 6 Jun 2017 16:21:10 -0400 Subject: [PATCH 346/597] Errors for blas. --- src/gpuarray_array_blas.c | 97 ++++++++++++++++----------------- src/gpuarray_blas_cuda_cublas.c | 6 +- 2 files changed, 51 insertions(+), 52 deletions(-) diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index 6607792ecf..61646062b9 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -32,7 +32,7 @@ int GpuArray_rdot(GpuArray *X, GpuArray *Y, n = X->dimensions[0]; if (!(X->flags & GA_ALIGNED) || !(Y->flags & GA_ALIGNED) || !(Z->flags & GA_ALIGNED)) - return GA_UNALIGNED_ERROR; + return error_set(ctx->err, GA_UNALIGNED_ERROR, "One of the inputs is unaligned"); if (X->dimensions[0] != Y->dimensions[0]) return error_fmt(ctx->err, GA_VALUE_ERROR, "Shape mismatch: X->dimensions[0] = %d != Y->dimensions[0] = %d", @@ -41,7 +41,7 @@ int GpuArray_rdot(GpuArray *X, GpuArray *Y, elsize = gpuarray_get_elsize(X->typecode); if (X->strides[0] < 0) { if (nocopy) - return GA_COPY_ERROR; + return error_set(ctx->err, GA_COPY_ERROR, "Copy required for X"); else { err = GpuArray_copy(©X, X, GA_ANY_ORDER); if (err != GA_NO_ERROR) @@ -51,7 +51,7 @@ int GpuArray_rdot(GpuArray *X, GpuArray *Y, } if (Y->strides[0] < 0) { if (nocopy) - return GA_COPY_ERROR; + return error_set(ctx->err, GA_COPY_ERROR, "Copy required for Y"); else { err = GpuArray_copy(©Y, Y, GA_ANY_ORDER); if (err != GA_NO_ERROR) @@ -102,24 +102,24 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, GpuArray *Xp = X; GpuArray copyX; GpuArray *Yp = Y; - void *ctx; size_t elsize; size_t m, n, lda; cb_order o; int err; + void *ctx = gpudata_context(Ap->data); if (A->typecode != GA_HALF && A->typecode != GA_FLOAT && A->typecode != GA_DOUBLE) - return 
GA_INVALID_ERROR; + return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported data type"); if (A->nd != 2 || X->nd != 1 || Y->nd != 1 || X->typecode != A->typecode || Y->typecode != A->typecode) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "Bad shape or inconsistent types"); if (!(A->flags & GA_ALIGNED) || !(X->flags & GA_ALIGNED) || !(Y->flags & GA_ALIGNED)) - return GA_UNALIGNED_ERROR; + return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned inputs"); if (transA == cb_no_trans) { m = A->dimensions[0]; @@ -130,7 +130,7 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, } if (Y->dimensions[0] != m || X->dimensions[0] != n) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "Inconsistent shapes"); m = A->dimensions[0]; n = A->dimensions[1]; @@ -139,7 +139,7 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, if (!GpuArray_ISONESEGMENT(A)) { if (nocopy) - return GA_COPY_ERROR; + return error_set(ctx->err, GA_COPY_ERROR, "Copy required for A"); else { err = GpuArray_copy(©A, A, GA_F_ORDER); if (err != GA_NO_ERROR) @@ -149,7 +149,7 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, } if (X->strides[0] < 0) { if (nocopy) - return GA_COPY_ERROR; + return error_set(ctx->err, GA_COPY_ERROR, "Copy required for X"); else { err = GpuArray_copy(©X, X, GA_ANY_ORDER); if (err != GA_NO_ERROR) @@ -158,7 +158,7 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, } } if (Y->strides[0] < 0) { - err = GA_VALUE_ERROR; + err = error_set(ctx->err, GA_VALUE_ERROR, "Negative strides for Y"); goto cleanup; } @@ -170,11 +170,10 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, lda = Ap->dimensions[1]; } else { /* Might be worth looking at making degenerate matrices (1xn) work here. 
*/ - err = GA_VALUE_ERROR; + err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous A"); goto cleanup; } - ctx = gpudata_context(Ap->data); err = gpublas_setup(ctx); if (err != GA_NO_ERROR) goto cleanup; @@ -206,7 +205,7 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, GpuArray *Bp = B; GpuArray copyB; GpuArray *Cp = C; - void *ctx; + void *ctx = gpudata_context(Ap->data); size_t elsize; size_t m, n, k, lda, ldb, ldc; cb_order o; @@ -214,16 +213,16 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, if (A->typecode != GA_HALF && A->typecode != GA_FLOAT && A->typecode != GA_DOUBLE) - return GA_INVALID_ERROR; + return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported type"); if (A->nd != 2 || B->nd != 2 || C->nd != 2 || B->typecode != A->typecode || C->typecode != A->typecode) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "Inconsistent nd or types"); if (!(A->flags & GA_ALIGNED) || !(B->flags & GA_ALIGNED) || !(C->flags & GA_ALIGNED)) - return GA_UNALIGNED_ERROR; + return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned inputs"); if (transA == cb_no_trans) { m = A->dimensions[0]; @@ -236,21 +235,21 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, if (transB == cb_no_trans) { n = B->dimensions[1]; if (B->dimensions[0] != k) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "mismatched shapes"); } else { n = B->dimensions[0]; if (B->dimensions[1] != k) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "mismatched shapes"); } if (C->dimensions[0] != m || C->dimensions[1] != n) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "mismatched shapes"); elsize = gpuarray_get_elsize(A->typecode); if (!GpuArray_ISONESEGMENT(A)) { if (nocopy) - return GA_COPY_ERROR; + return error_set(ctx->err, GA_COPY_ERROR, "Need copy for A"); else { err = GpuArray_copy(©A, A, GA_F_ORDER); if (err != GA_NO_ERROR) @@ -260,7 
+259,7 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, } if (!GpuArray_ISONESEGMENT(B)) { if (nocopy) - return GA_COPY_ERROR; + return error_set(ctx->err, GA_COPY_ERROR, "Need copy for B"); else { err = GpuArray_copy(©B, B, GA_F_ORDER); if (err != GA_NO_ERROR) @@ -269,7 +268,7 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, } } if (!GpuArray_ISONESEGMENT(C)) { - err = GA_VALUE_ERROR; + err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous C"); goto cleanup; } @@ -280,7 +279,7 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, o = cb_c; ldc = Cp->dimensions[1]; } else { - err = GA_VALUE_ERROR; + err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous C"); goto cleanup; } if (Ap->flags & GA_F_CONTIGUOUS) { @@ -300,7 +299,7 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, transA = cb_no_trans; } } else { - err = GA_VALUE_ERROR; + err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous A"); goto cleanup; } if (Bp->flags & GA_F_CONTIGUOUS) { @@ -320,7 +319,7 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, transB = cb_no_trans; } } else { - err = GA_VALUE_ERROR; + err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous B"); goto cleanup; } @@ -356,7 +355,7 @@ int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A, GpuArray *Yp = Y; GpuArray copyY; GpuArray *Ap = A; - void *ctx; + void *ctx = gpudata_context(Xp->data); size_t elsize; size_t m, n, lda; cb_order o; @@ -364,27 +363,27 @@ int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A, if (X->typecode != GA_HALF && X->typecode != GA_FLOAT && X->typecode != GA_DOUBLE) - return GA_INVALID_ERROR; + return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported type"); if (X->nd != 1 || Y->nd != 1 || A->nd != 2 || Y->typecode != X->typecode || A->typecode != X->typecode) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "Invalid 
dims or inconsistent types"); if (!(X->flags & GA_ALIGNED) || !(Y->flags & GA_ALIGNED) || !(A->flags & GA_ALIGNED)) - return GA_UNALIGNED_ERROR; + return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned inputs"; m = X->dimensions[0]; n = Y->dimensions[0]; if (A->dimensions[0] != m || A->dimensions[1] != n) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "Incompatible shapes"; elsize = gpuarray_get_elsize(X->typecode); if (X->strides[0] < 0) { if (nocopy) - return GA_COPY_ERROR; + return error_set(ctx->err, GA_COPY_ERROR, "Need copy for X"); else { err = GpuArray_copy(©X, X, GA_ANY_ORDER); if (err != GA_NO_ERROR) @@ -394,7 +393,7 @@ int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A, } if (Y->strides[0] < 0) { if (nocopy) - return GA_COPY_ERROR; + return error_set(ctx->err, GA_COPY_ERROR, "Need copy for Y"); else { err = GpuArray_copy(©Y, Y, GA_ANY_ORDER); if (err != GA_NO_ERROR) @@ -403,7 +402,7 @@ int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A, } } if (!GpuArray_ISONESEGMENT(A)) { - err = GA_VALUE_ERROR; + err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous A"); goto cleanup; } @@ -415,7 +414,7 @@ int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A, lda = Ap->dimensions[1]; } else { /* Might be worth looking at making degenerate matrices (1xn) work here. 
*/ - err = GA_VALUE_ERROR; + err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous A"); goto cleanup; } @@ -469,7 +468,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph GpuArray *Bp = B; GpuArray copyB; GpuArray *Cp = C; - void *ctx; + void *ctx = gpudata_context(A->data); size_t elsize; size_t batchCount, m, n, k, lda, ldb, ldc; cb_order o; @@ -480,20 +479,20 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph size_t i; if (A->typecode != GA_FLOAT && A->typecode != GA_DOUBLE) - return GA_INVALID_ERROR; + return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported type"); if (A->nd != 3 || B->nd != 3 || C->nd != 3 || B->typecode != A->typecode || C->typecode != A->typecode) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "Invalid dims or inconsistent types"); if (!(A->flags & GA_ALIGNED) || !(B->flags & GA_ALIGNED) || !(C->flags & GA_ALIGNED)) - return GA_UNALIGNED_ERROR; + return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned input"); batchCount = A->dimensions[0]; if (B->dimensions[0] != batchCount || C->dimensions[0] != batchCount) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "Mismatched first dimension"); if (transA == cb_no_trans) { m = A->dimensions[1]; @@ -506,22 +505,22 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph if (transB == cb_no_trans) { n = B->dimensions[2]; if (B->dimensions[1] != k) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "Mismatched shape"); } else { n = B->dimensions[1]; if (B->dimensions[2] != k) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "Mismatched shape"); } if (C->dimensions[1] != m || C->dimensions[2] != n) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "Mismatched shape"); elsize = gpuarray_get_elsize(A->typecode); cA = is_last_2d_contiguous(A); if (!cA) { if (nocopy) - return GA_COPY_ERROR; + return 
error_set(ctx->err, GA_COPY_ERROR, "Need copy for A"); else { err = GpuArray_copy(©A, A, GA_C_ORDER); cA = 1; @@ -533,7 +532,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph cB = is_last_2d_contiguous(B); if (!cB) { if (nocopy) - return GA_COPY_ERROR; + return error_set(ctx->err, GA_COPY_ERROR, "Need copy for B"); else { err = GpuArray_copy(©B, B, GA_C_ORDER); cB = 1; @@ -544,7 +543,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph } cC = is_last_2d_contiguous(C); if (!cC) { - err = GA_VALUE_ERROR; + err = error_set(ctx->err, GA_VALUE_ERROR, "Noncontiguous last 2d C"); goto cleanup; } @@ -559,7 +558,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph ? Cp->strides[1] / elsize : Cp->dimensions[2]; } else { - err = GA_VALUE_ERROR; + err = error_set(ctx->err, GA_VALUE_ERROR, "Invalid internal result for C"); goto cleanup; } if (cA == 2) { @@ -583,7 +582,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph transA = cb_no_trans; } } else { - err = GA_VALUE_ERROR; + err = error_set(ctx->err, GA_VALUE_ERROR, "Invalid internal result for A"); goto cleanup; } if (cB == 2) { @@ -607,7 +606,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph transB = cb_no_trans; } } else { - err = GA_VALUE_ERROR; + err = error_set(ctx->err, GA_VALUE_ERROR, "Invalid internal result for B"); goto cleanup; } diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index c207f84a33..3d4cdf16bf 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -461,7 +461,7 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, ASSERT_BUF(C); if (cublasSgemmEx == NULL) - return GA_DEVSUP_ERROR; + return error_set(ctx->err, GA_DEVSUP_ERROR, "cublasSgemmEx unavailable"); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || 
LARGE_VAL(ldc) || @@ -1334,7 +1334,7 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, } if (gs[0] * gs[1] * gs[2] > 65535) { if (gs[0] * gs[1] > 65535) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "Input too large"); gs[2] = (65535 / (gs[0] * gs[1])); } @@ -1465,7 +1465,7 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, } if (gs[0] * gs[1] * gs[2] > 65535) { if (gs[0] * gs[1] > 65535) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "Input too large"); gs[2] = (65535 / (gs[0] * gs[1])); } From 14cae8346b3a8631ef98e539a42caf3589c343e0 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 6 Jun 2017 16:21:41 -0400 Subject: [PATCH 347/597] Errors for collectives. --- src/gpuarray_array_collectives.c | 13 +++++++------ src/gpuarray_buffer_collectives.c | 18 +++++++++--------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/gpuarray_array_collectives.c b/src/gpuarray_array_collectives.c index 4e177ef6c6..8ebec20e16 100644 --- a/src/gpuarray_array_collectives.c +++ b/src/gpuarray_array_collectives.c @@ -29,15 +29,16 @@ static inline size_t find_total_elems(const GpuArray* array) { static inline int check_gpuarrays(int times_src, const GpuArray* src, int times_dest, const GpuArray* dest, size_t* count) { + gpucontext *ctx = gpudata_context(src->data); size_t count_src, count_dest; count_src = find_total_elems(src); count_dest = find_total_elems(dest); if (times_src * count_src != times_dest * count_dest) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "Size mismatch for transfer"); if (src->typecode != dest->typecode) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "Type mismatch"); if (!GpuArray_ISALIGNED(src) || !GpuArray_CHKFLAGS(dest, GA_BEHAVED)) - return GA_UNALIGNED_ERROR; + return error_set(ctx->err, GA_UNALIGNED_ERROR, "Misbehaved arrays"); if (times_src >= times_dest) *count = count_src; @@ -50,7 +51,7 
@@ int GpuArray_reduce_from(const GpuArray* src, int opcode, int root, gpucomm* comm) { size_t total_elems; if (!GpuArray_ISALIGNED(src)) - return GA_UNALIGNED_ERROR; + return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned input"); total_elems = find_total_elems(src); return gpucomm_reduce(src->data, src->offset, NULL, 0, total_elems, src->typecode, opcode, root, comm); @@ -95,10 +96,10 @@ int GpuArray_broadcast(GpuArray* array, int root, gpucomm* comm) { GA_CHECK(gpucomm_get_rank(comm, &rank)); if (rank == root) { if (!GpuArray_CHKFLAGS(array, GA_BEHAVED)) - return GA_UNALIGNED_ERROR; + return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned input"); } else { if (!GpuArray_ISALIGNED(array)) - return GA_UNALIGNED_ERROR; + return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned input"); } total_elems = find_total_elems(array); diff --git a/src/gpuarray_buffer_collectives.c b/src/gpuarray_buffer_collectives.c index 38e731286e..803055bef3 100644 --- a/src/gpuarray_buffer_collectives.c +++ b/src/gpuarray_buffer_collectives.c @@ -8,7 +8,7 @@ int gpucomm_new(gpucomm** comm, gpucontext* ctx, gpucommCliqueId comm_id, int ndev, int rank) { if (ctx->comm_ops == NULL) { *comm = NULL; - return GA_UNSUPPORTED_ERROR; + return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Collectives unavailable"); } return ctx->comm_ops->comm_new(comm, ctx, comm_id, ndev, rank); } @@ -30,21 +30,21 @@ gpucontext* gpucomm_context(gpucomm* comm) { } int gpucomm_gen_clique_id(gpucontext* ctx, gpucommCliqueId* comm_id) { if (ctx->comm_ops == NULL) - return GA_COMM_ERROR; + return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable"); return ctx->comm_ops->generate_clique_id(ctx, comm_id); } int gpucomm_get_count(gpucomm* comm, int* gpucount) { gpucontext* ctx = gpucomm_context(comm); if (ctx->comm_ops == NULL) - return GA_COMM_ERROR; + return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable"); return ctx->comm_ops->get_count(comm, gpucount); } int gpucomm_get_rank(gpucomm* 
comm, int* rank) { gpucontext* ctx = gpucomm_context(comm); if (ctx->comm_ops == NULL) - return GA_COMM_ERROR; + return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable"); return ctx->comm_ops->get_rank(comm, rank); } @@ -53,7 +53,7 @@ int gpucomm_reduce(gpudata* src, size_t offsrc, gpudata* dest, size_t offdest, gpucomm* comm) { gpucontext* ctx = gpucomm_context(comm); if (ctx->comm_ops == NULL) - return GA_COMM_ERROR; + return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable"); return ctx->comm_ops->reduce(src, offsrc, dest, offdest, count, typecode, opcode, root, comm); } @@ -63,7 +63,7 @@ int gpucomm_all_reduce(gpudata* src, size_t offsrc, gpudata* dest, gpucomm* comm) { gpucontext* ctx = gpucomm_context(comm); if (ctx->comm_ops == NULL) - return GA_COMM_ERROR; + return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable"); return ctx->comm_ops->all_reduce(src, offsrc, dest, offdest, count, typecode, opcode, comm); } @@ -73,7 +73,7 @@ int gpucomm_reduce_scatter(gpudata* src, size_t offsrc, gpudata* dest, int opcode, gpucomm* comm) { gpucontext* ctx = gpucomm_context(comm); if (ctx->comm_ops == NULL) - return GA_COMM_ERROR; + return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable"); return ctx->comm_ops->reduce_scatter(src, offsrc, dest, offdest, count, typecode, opcode, comm); } @@ -82,7 +82,7 @@ int gpucomm_broadcast(gpudata* array, size_t offset, size_t count, int typecode, int root, gpucomm* comm) { gpucontext* ctx = gpucomm_context(comm); if (ctx->comm_ops == NULL) - return GA_COMM_ERROR; + return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable"); return ctx->comm_ops->broadcast(array, offset, count, typecode, root, comm); } @@ -91,7 +91,7 @@ int gpucomm_all_gather(gpudata* src, size_t offsrc, gpudata* dest, gpucomm* comm) { gpucontext* ctx = gpucomm_context(comm); if (ctx->comm_ops == NULL) - return GA_COMM_ERROR; + return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives unavailable"); return 
ctx->comm_ops->all_gather(src, offsrc, dest, offdest, count, typecode, comm); } From 885129241dde64d422635518e89eea32cc8716cc Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 6 Jun 2017 16:22:00 -0400 Subject: [PATCH 348/597] Errors all around. --- src/gpuarray_buffer.c | 7 ++++--- src/gpuarray_buffer_opencl.c | 8 -------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index b457fb3dcc..4546a7bc41 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -6,6 +6,7 @@ #include "gpuarray/buffer_collectives.h" #include "gpuarray/error.h" +#include "util/error.h" #include "private.h" extern const gpuarray_buffer_ops cuda_ops; @@ -22,7 +23,7 @@ const gpuarray_buffer_ops *gpuarray_get_ops(const char *name) { int gpu_get_platform_count(const char* name, unsigned int* platcount) { const gpuarray_buffer_ops* ops = gpuarray_get_ops(name); if (ops == NULL) { - return GA_INVALID_ERROR; + return error_set(&global_err, GA_INVALID_ERROR, "Invalid platform"); } return ops->get_platform_count(platcount); } @@ -31,7 +32,7 @@ int gpu_get_device_count(const char* name, unsigned int platform, unsigned int* devcount) { const gpuarray_buffer_ops* ops = gpuarray_get_ops(name); if (ops == NULL) { - return GA_INVALID_ERROR; + return error_set(&global_err, GA_INVALID_ERROR, "Invalid platform"); } return ops->get_device_count(platform, devcount); } @@ -121,7 +122,7 @@ int gpudata_transfer(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, /* Fallback to host copy */ tmp = malloc(sz); if (tmp == NULL) - return GA_MEMORY_ERROR; + return error_sys(ctx->err, "malloc"); res = src_ctx->ops->buffer_read(tmp, src, srcoff, sz); if (res != GA_NO_ERROR) { free(tmp); diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 58c194432f..8e97f50167 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -808,14 +808,6 @@ static int cl_check_extensions(const char **preamble, 
unsigned int *count, if (flags & GA_USE_COMPLEX) { return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Complex are not supported yet"); } - // GA_USE_HALF should always work - /* - if (flags & GA_USE_HALF) { - if (check_ext(ctx, CL_HALF)) return GA_DEVSUP_ERROR; - preamble[*count] = PRAGMA CL_HALF ENABLE; - (*count)++; - } - */ if (flags & GA_USE_CUDA) { return error_set(ctx->err, GA_DEVSUP_ERROR, "Cuda kernels not supported on opencl devices"); } From c769bcb6326669172343700b8c068bbe74bf8ffa Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 6 Jun 2017 16:41:44 -0400 Subject: [PATCH 349/597] Errors in elemwise/kernel --- src/gpuarray_elemwise.c | 28 ++++++++++++++++------------ src/gpuarray_kernel.c | 2 +- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index 840c190473..3681ca08d1 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -150,7 +150,7 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx, ktypes = calloc(p, sizeof(int)); if (ktypes == NULL) - return GA_MEMORY_ERROR; + return error_sys(ctx->err, "calloc"); p = 0; @@ -282,23 +282,22 @@ static int check_basic(GpuElemwise *ge, void **args, int flags, if (num_arrays == 0) nd = ((GpuArray *)args[i])->nd; else if (((GpuArray *)args[i])->nd != nd) - return GA_VALUE_ERROR; + return error_fmt(ctx->err, GA_VALUE_ERROR, "Arg %u has differing nd = %u", i, ((GpuArray *)args[i])->nd); ++num_arrays; if (a == NULL && is_output(ge->args[i])) a = (GpuArray *)args[i]; } } - /* No output arrays, this is an error */ if (a == NULL) - return GA_VALUE_ERROR; + return error_set(ctx->err, GA_VALUE_ERROR, "No output arrays"); /* Check if we need to grow the internal buffers */ if (nd > ge->nd) { nnd = ge->nd * 2; while (nd > nnd) nnd *= 2; if (ge_grow(ge, nnd)) - return GA_MEMORY_ERROR; + return error_sys(ctx->err, "ge_grow"); } /* Now we know that all array arguments have the same number of @@ -330,7 +329,7 @@ static int 
check_basic(GpuElemwise *ge, void **args, int flags, /* We can't broadcast outputs */ if (ISCLR(flags, GE_BROADCAST) || is_output(ge->args[i]) || v->dimensions[j] != 1) { - return GA_VALUE_ERROR; + return error_fmt(ctx->err, GA_VALUE_ERROR, "Mismatched dimension %u for input %u", j, i); } } /* If the dimension is 1 set the strides to 0 regardless since @@ -370,7 +369,7 @@ static int call_basic(GpuElemwise *ge, void **args, size_t n, unsigned int nd, unsigned int p = 0, i, j, l; int err; - if (nd == 0) return GA_VALUE_ERROR; + if (nd == 0) return error_set(GpuKernel_context(&ge->k_contig)->err, GA_VALUE_ERROR, "nd == 0"); if (call32) k = &ge->k_basic_32[nd-1]; @@ -434,7 +433,7 @@ static int gen_elemwise_contig_kernel(GpuKernel *k, unsigned int p; unsigned int j; int flags = GA_USE_CLUDA; - int res = GA_MEMORY_ERROR; + int res; flags |= gpuarray_type_flagsa(n, args); @@ -443,8 +442,10 @@ static int gen_elemwise_contig_kernel(GpuKernel *k, p += ISSET(args[j].flags, GE_SCALAR) ? 1 : 2; ktypes = calloc(p, sizeof(int)); - if (ktypes == NULL) + if (ktypes == NULL) { + res = error_fmt(ctx->err, "calloc"); goto bail; + } p = 0; @@ -509,8 +510,10 @@ static int gen_elemwise_contig_kernel(GpuKernel *k, } strb_appends(&sb, "}\n}\n"); - if (strb_error(&sb)) + if (strb_error(&sb)) { + error_set(ctx->err, GA_MISC_ERROR, "Formatting error creating kernel source"); goto bail; + } res = GpuKernel_init(k, ctx, 1, (const char **)&sb.s, &sb.l, "elem", p, ktypes, flags, err_str); @@ -523,6 +526,7 @@ static int gen_elemwise_contig_kernel(GpuKernel *k, static int check_contig(GpuElemwise *ge, void **args, size_t *_n, int *contig) { GpuArray *a = NULL, *v; + gpucontext *ctx = GpuKernel_context(&ge->k_contig); size_t n = 1; unsigned int i, j; int c_contig = 1, f_contig = 1; @@ -538,10 +542,10 @@ static int check_contig(GpuElemwise *ge, void **args, f_contig &= GpuArray_IS_F_CONTIGUOUS(v); if (a != v) { if (a->nd != v->nd) - return GA_INVALID_ERROR; + return error_set(ctx->err, 
GA_INVALID_ERROR, "Mismatched nd"); for (j = 0; j < a->nd; j++) { if (v->dimensions[j] != a->dimensions[j]) - return GA_VALUE_ERROR; + return error_fmt(ctx->err, GA_VALUE_ERROR, "Mismatched dimension %u", j); } } } diff --git a/src/gpuarray_kernel.c b/src/gpuarray_kernel.c index 58311c86bb..f513285bf3 100644 --- a/src/gpuarray_kernel.c +++ b/src/gpuarray_kernel.c @@ -12,7 +12,7 @@ int GpuKernel_init(GpuKernel *k, gpucontext *ctx, unsigned int count, k->args = calloc(argcount, sizeof(void *)); if (k->args == NULL) - return GA_MEMORY_ERROR; + return error_sys(ctx->err, "calloc"); k->k = gpukernel_init(ctx, count, strs, lens, name, argcount, types, flags, &res, err_str); if (res != GA_NO_ERROR) From e0be74d472737b914162f334510f75a7c8d32372 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 6 Jun 2017 16:50:49 -0400 Subject: [PATCH 350/597] Fix compile. --- src/gpuarray_array_blas.c | 12 ++++++------ src/gpuarray_array_collectives.c | 7 +++++-- src/gpuarray_buffer.c | 10 ++++++---- src/gpuarray_elemwise.c | 5 +++-- src/gpuarray_kernel.c | 3 +++ 5 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index 61646062b9..53681da29d 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -102,11 +102,11 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, GpuArray *Xp = X; GpuArray copyX; GpuArray *Yp = Y; + gpucontext *ctx = gpudata_context(Ap->data); size_t elsize; size_t m, n, lda; cb_order o; int err; - void *ctx = gpudata_context(Ap->data); if (A->typecode != GA_HALF && A->typecode != GA_FLOAT && @@ -205,7 +205,7 @@ int GpuArray_rgemm(cb_transpose transA, cb_transpose transB, double alpha, GpuArray *Bp = B; GpuArray copyB; GpuArray *Cp = C; - void *ctx = gpudata_context(Ap->data); + gpucontext *ctx = gpudata_context(Ap->data); size_t elsize; size_t m, n, k, lda, ldb, ldc; cb_order o; @@ -355,7 +355,7 @@ int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, 
GpuArray *A, GpuArray *Yp = Y; GpuArray copyY; GpuArray *Ap = A; - void *ctx = gpudata_context(Xp->data); + gpucontext *ctx = gpudata_context(Xp->data); size_t elsize; size_t m, n, lda; cb_order o; @@ -372,12 +372,12 @@ int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A, if (!(X->flags & GA_ALIGNED) || !(Y->flags & GA_ALIGNED) || !(A->flags & GA_ALIGNED)) - return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned inputs"; + return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned inputs"); m = X->dimensions[0]; n = Y->dimensions[0]; if (A->dimensions[0] != m || A->dimensions[1] != n) - return error_set(ctx->err, GA_VALUE_ERROR, "Incompatible shapes"; + return error_set(ctx->err, GA_VALUE_ERROR, "Incompatible shapes"); elsize = gpuarray_get_elsize(X->typecode); @@ -468,7 +468,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph GpuArray *Bp = B; GpuArray copyB; GpuArray *Cp = C; - void *ctx = gpudata_context(A->data); + gpucontext *ctx = gpudata_context(A->data); size_t elsize; size_t batchCount, m, n, k, lda, ldb, ldc; cb_order o; diff --git a/src/gpuarray_array_collectives.c b/src/gpuarray_array_collectives.c index 8ebec20e16..c53249bbd3 100644 --- a/src/gpuarray_array_collectives.c +++ b/src/gpuarray_array_collectives.c @@ -49,6 +49,7 @@ static inline int check_gpuarrays(int times_src, const GpuArray* src, int GpuArray_reduce_from(const GpuArray* src, int opcode, int root, gpucomm* comm) { + gpucontext *ctx = gpudata_context(src->data); size_t total_elems; if (!GpuArray_ISALIGNED(src)) return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned input"); @@ -90,9 +91,11 @@ int GpuArray_reduce_scatter(const GpuArray* src, GpuArray* dest, int opcode, comm); } -int GpuArray_broadcast(GpuArray* array, int root, gpucomm* comm) { - int rank = 0; +int GpuArray_broadcast(GpuArray *array, int root, gpucomm *comm) { + gpucontext *ctx = gpudata_context(array->data); size_t total_elems; + int rank = 0; + 
GA_CHECK(gpucomm_get_rank(comm, &rank)); if (rank == root) { if (!GpuArray_CHKFLAGS(array, GA_BEHAVED)) diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index 4546a7bc41..52af2ddf72 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -23,7 +23,7 @@ const gpuarray_buffer_ops *gpuarray_get_ops(const char *name) { int gpu_get_platform_count(const char* name, unsigned int* platcount) { const gpuarray_buffer_ops* ops = gpuarray_get_ops(name); if (ops == NULL) { - return error_set(&global_err, GA_INVALID_ERROR, "Invalid platform"); + return error_set(global_err, GA_INVALID_ERROR, "Invalid platform"); } return ops->get_platform_count(platcount); } @@ -32,7 +32,7 @@ int gpu_get_device_count(const char* name, unsigned int platform, unsigned int* devcount) { const gpuarray_buffer_ops* ops = gpuarray_get_ops(name); if (ops == NULL) { - return error_set(&global_err, GA_INVALID_ERROR, "Invalid platform"); + return error_set(global_err, GA_INVALID_ERROR, "Invalid platform"); } return ops->get_device_count(platform, devcount); } @@ -121,8 +121,10 @@ int gpudata_transfer(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, /* Fallback to host copy */ tmp = malloc(sz); - if (tmp == NULL) - return error_sys(ctx->err, "malloc"); + if (tmp == NULL) { + error_sys(src_ctx->err, "malloc"); + return error_sys(dst_ctx->err, "malloc"); + } res = src_ctx->ops->buffer_read(tmp, src, srcoff, sz); if (res != GA_NO_ERROR) { free(tmp); diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index 3681ca08d1..1fed77b91b 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -272,6 +272,7 @@ static int check_basic(GpuElemwise *ge, void **args, int flags, size_t *_n, unsigned int *_nd, size_t **_dims, ssize_t ***_strides, int *_call32) { size_t n; + gpucontext *ctx = GpuKernel_context(&ge->k_contig); GpuArray *a = NULL, *v; unsigned int i, j, p, num_arrays = 0, nd = 0, nnd; int call32 = 1; @@ -443,7 +444,7 @@ static int 
gen_elemwise_contig_kernel(GpuKernel *k, ktypes = calloc(p, sizeof(int)); if (ktypes == NULL) { - res = error_fmt(ctx->err, "calloc"); + res = error_sys(ctx->err, "calloc"); goto bail; } @@ -511,7 +512,7 @@ static int gen_elemwise_contig_kernel(GpuKernel *k, strb_appends(&sb, "}\n}\n"); if (strb_error(&sb)) { - error_set(ctx->err, GA_MISC_ERROR, "Formatting error creating kernel source"); + res = error_set(ctx->err, GA_MISC_ERROR, "Formatting error creating kernel source"); goto bail; } diff --git a/src/gpuarray_kernel.c b/src/gpuarray_kernel.c index f513285bf3..88b1cbaadf 100644 --- a/src/gpuarray_kernel.c +++ b/src/gpuarray_kernel.c @@ -2,6 +2,9 @@ #include "gpuarray/error.h" #include "gpuarray/types.h" +#include "util/error.h" +#include "private.h" + #include int GpuKernel_init(GpuKernel *k, gpucontext *ctx, unsigned int count, From 7049c6d77b462ffd18c4ee4a0f1f9ecbc923c3c2 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 6 Jun 2017 16:59:00 -0400 Subject: [PATCH 351/597] Have a little more faith in the C code for error messages. 
--- pygpu/gpuarray.pyx | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 34e39da107..ecdd5ed081 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -336,7 +336,7 @@ cdef int array_take1(GpuArray r, GpuArray a, GpuArray i, err = GpuArray_take1(&r.ga, &a.ga, &i.ga, check_err) if err != GA_NO_ERROR: if err == GA_VALUE_ERROR: - raise IndexError, "Index out of bounds" + raise IndexError, GpuArray_error(&r.ga, err) raise get_exc(err), GpuArray_error(&r.ga, err) cdef int array_setarray(GpuArray v, GpuArray a) except -1: @@ -1054,10 +1054,7 @@ cdef class GpuContext: self.kind = kind self.ctx = gpucontext_init(self.kind, devno, flags, &err) if (err != GA_NO_ERROR): - if err == GA_VALUE_ERROR: - raise get_exc(err), "No device %d"%(devno,) - else: - raise get_exc(err), gpucontext_error(NULL, err).decode('utf-8') + ": " + str(devno) + raise get_exc(err), gpucontext_error(NULL, err) def __enter__(self): if cuda_enter == NULL: @@ -1415,10 +1412,7 @@ cdef GpuArray pygpu_index(GpuArray a, const ssize_t *starts, const ssize_t *stops, const ssize_t *steps): cdef GpuArray res res = new_GpuArray(type(a), a.context, a.base) - try: - array_index(res, a, starts, stops, steps) - except ValueError, e: - raise IndexError, "index out of bounds" + array_index(res, a, starts, stops, steps) return res cdef GpuArray pygpu_reshape(GpuArray a, unsigned int nd, const size_t *newdims, @@ -1430,7 +1424,7 @@ cdef GpuArray pygpu_reshape(GpuArray a, unsigned int nd, const size_t *newdims, return res cdef unsigned int caxis = compute_axis if caxis >= nd: - raise ValueError("You wanted us to compute the shape of a dimensions that don't exist") + raise ValueError("compute_axis is out of bounds") cdef size_t *cdims cdef size_t tot = 1 @@ -1554,7 +1548,7 @@ def open_ipc_handle(GpuContext c, bytes hpy, size_t l): d = cuda_open_ipc_handle(c.ctx, &h, l) if d is NULL: - raise GpuArrayException, "could not open handle" + raise 
GpuArrayException, gpucontext_error(c.ctx, 0) return d cdef class GpuArray: From 037ff240f986ecb45e97ca3ddbb13b603ee67289 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 7 Jun 2017 11:21:16 -0400 Subject: [PATCH 352/597] Add more errors in elemwise. --- src/gpuarray_elemwise.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index 1fed77b91b..261d46ca24 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -594,24 +594,33 @@ GpuElemwise *GpuElemwise_new(gpucontext *ctx, int ret; res = calloc(1, sizeof(*res)); - if (res == NULL) return NULL; + if (res == NULL) { + error_sys(ctx->err, "calloc"); + return NULL; + } res->flags = flags; res->nd = 8; res->n = n; res->expr = strdup(expr); - if (res->expr == NULL) + if (res->expr == NULL) { + error_sys(ctx->err, "strdup"); goto fail; + } if (preamble != NULL) { res->preamble = strdup(preamble); - if (res->preamble == NULL) + if (res->preamble == NULL) { + error_sys(ctx->err, "strdup"); goto fail; + } } res->args = copy_args(n, args); - if (res->args == NULL) + if (res->args == NULL) { + error_sys(ctx->err, "copy_args"); goto fail; + } /* Count the arrays in the arguements */ res->narray = 0; @@ -620,18 +629,26 @@ GpuElemwise *GpuElemwise_new(gpucontext *ctx, while (res->nd < nd) res->nd *= 2; res->dims = calloc(res->nd, sizeof(size_t)); - if (res->dims == NULL) + if (res->dims == NULL) { + error_sys(ctx->err, "calloc"); goto fail; + } res->strides = strides_array(res->narray, res->nd); - if (res->strides == NULL) + if (res->strides == NULL) { + error_sys(ctx->err, "strides_array"); goto fail; + } res->k_basic = calloc(res->nd, sizeof(GpuKernel)); - if (res->k_basic == NULL) + if (res->k_basic == NULL) { + error_sys(ctx->err, "calloc"); goto fail; + } res->k_basic_32 = calloc(res->nd, sizeof(GpuKernel)); - if (res->k_basic_32 == NULL) + if (res->k_basic_32 == NULL) { + error_sys(ctx->err, 
"calloc"); goto fail; + } ret = gen_elemwise_contig_kernel(&res->k_contig, ctx, #ifdef DEBUG From 6572caebbaee68a140a076a8e6dec5ef5777b936 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 7 Jun 2017 11:45:12 -0400 Subject: [PATCH 353/597] Small fixup. --- src/gpuarray_buffer_cuda.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 48112dd88c..06307f91b6 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -260,8 +260,8 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { (vread_fn)kernel_read, res->err); if (res->disk_cache == NULL) { - // TODO use better error messages when they are available. - fprintf(stderr, "Error initializing disk cache, disabling\n"); + fprintf(stderr, "Error initializing disk cache, disabling: %s\n", + global_err->msg); cache_destroy(mem_cache); goto fail_disk_cache; } From e2f1f07accad1561db8177d85f119dfd3bcdfa2a Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 7 Jun 2017 12:14:22 -0400 Subject: [PATCH 354/597] Changes from review. 
--- src/gpuarray_array.c | 8 ++--- src/gpuarray_array_blas.c | 60 +++++++++++++++++++------------- src/gpuarray_array_collectives.c | 6 ++-- src/gpuarray_elemwise.c | 6 ++-- 4 files changed, 46 insertions(+), 34 deletions(-) diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index 127f5e1292..3af189d195 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -455,7 +455,7 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i, int addr32 = 0; if (!GpuArray_ISWRITEABLE(a)) - return error_set(ctx->err, GA_INVALID_ERROR, "Destination array (a) not writeable"); + return error_set(ctx->err, GA_VALUE_ERROR, "Destination array not writeable"); if (!GpuArray_ISALIGNED(a) || !GpuArray_ISALIGNED(v) || !GpuArray_ISALIGNED(i)) @@ -578,7 +578,7 @@ int GpuArray_setarray(GpuArray *a, const GpuArray *v) { "a->nd = %llu, v->nd = %llu", a->nd, v->nd); if (!GpuArray_ISWRITEABLE(a)) - return error_set(ctx->err, GA_INVALID_ERROR, "Destination array not writable"); + return error_set(ctx->err, GA_VALUE_ERROR, "Destination array not writable"); if (!GpuArray_ISALIGNED(v) || !GpuArray_ISALIGNED(a)) return error_set(ctx->err, GA_UNALIGNED_ERROR, "One of the inputs is unaligned"); @@ -673,7 +673,7 @@ int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd, newsize *= d; } - if (newsize != oldsize) return error_set(ctx->err, GA_INVALID_ERROR, "New shope differs in total size"); + if (newsize != oldsize) return error_set(ctx->err, GA_INVALID_ERROR, "New shape differs in total size"); /* If the source and desired layouts are the same, then just copy strides and dimensions */ @@ -1159,7 +1159,7 @@ int GpuArray_fdump(FILE *fd, const GpuArray *a) { default: free(buf); fprintf(fd, "\n", a->typecode); - return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Unsupported data type for dump"); + return error_fmt(ctx->err, GA_UNSUPPORTED_ERROR, "Unsupported data type for dump: %d", a->typecode); } s -= gpuarray_get_elsize(a->typecode); p += gpuarray_get_elsize(a->typecode); 
diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index 53681da29d..9cfdb58af0 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -24,11 +24,12 @@ int GpuArray_rdot(GpuArray *X, GpuArray *Y, X->typecode != GA_DOUBLE) return error_set(ctx->err, GA_INVALID_ERROR, "Data type not supported"); - if (X->nd != 1 || Y->nd != 1 || Z->nd != 0 || - X->typecode != Y->typecode || X->typecode != Z->typecode) + if (X->nd != 1 || Y->nd != 1 || Z->nd != 0) return error_fmt(ctx->err, GA_VALUE_ERROR, - "Wrong number of dimensions: X->nd = %d (expected 1), Y->nd = %d (expected 1), Z->nd = %d (expected 0)", + "Wrong number of dimensions: X->nd = %u (expected 1), Y->nd = %u (expected 1), Z->nd = %u (expected 0)", X->nd, Y->nd, Z->nd); + if (X->typecode != Y->typecode || X->typecode != Z->typecode) + error_set(ctx->err, GA_VALUE_ERROR, "Inconsistent dtypes"); n = X->dimensions[0]; if (!(X->flags & GA_ALIGNED) || !(Y->flags & GA_ALIGNED) || !(Z->flags & GA_ALIGNED)) @@ -111,11 +112,14 @@ int GpuArray_rgemv(cb_transpose transA, double alpha, GpuArray *A, if (A->typecode != GA_HALF && A->typecode != GA_FLOAT && A->typecode != GA_DOUBLE) - return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported data type"); + return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported dtype"); - if (A->nd != 2 || X->nd != 1 || Y->nd != 1 || - X->typecode != A->typecode || Y->typecode != A->typecode) - return error_set(ctx->err, GA_VALUE_ERROR, "Bad shape or inconsistent types"); + if (A->nd != 2 || X->nd != 1 || Y->nd != 1) + return error_fmt(ctx->err, GA_VALUE_ERROR, + "Wrong number of dimensions: A->nd = %u (expected 2), X->nd = %u (expected 1), Y->nd = %u (expected 1)", + A->nd, X->nd, Y->nd); + if (X->typecode != A->typecode || Y->typecode != A->typecode) + return error_set(ctx->err, GA_VALUE_ERROR, "Inconsistent dtypes"); if (!(A->flags & GA_ALIGNED) || !(X->flags & GA_ALIGNED) || !(Y->flags & GA_ALIGNED)) @@ -213,12 +217,14 @@ int GpuArray_rgemm(cb_transpose 
transA, cb_transpose transB, double alpha, if (A->typecode != GA_HALF && A->typecode != GA_FLOAT && A->typecode != GA_DOUBLE) - return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported type"); + return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported dtype"); - if (A->nd != 2 || B->nd != 2 || C->nd != 2 || - B->typecode != A->typecode || - C->typecode != A->typecode) - return error_set(ctx->err, GA_VALUE_ERROR, "Inconsistent nd or types"); + if (A->nd != 2 || B->nd != 2 || C->nd != 2) + return error_fmt(ctx->err, GA_VALUE_ERROR, + "Wrong number of dimensions: A->nd = %u (expected 2), B->nd = %u (expected 2), C->nd = %u (expected 2)", + A->nd, B->nd, C->nd); + if (B->typecode != A->typecode || C->typecode != A->typecode) + return error_set(ctx->err, GA_VALUE_ERROR, "Inconsistent dtypes"); if (!(A->flags & GA_ALIGNED) || !(B->flags & GA_ALIGNED) || !(C->flags & GA_ALIGNED)) @@ -363,12 +369,14 @@ int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GpuArray *A, if (X->typecode != GA_HALF && X->typecode != GA_FLOAT && X->typecode != GA_DOUBLE) - return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported type"); + return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported dtype"); - if (X->nd != 1 || Y->nd != 1 || A->nd != 2 || - Y->typecode != X->typecode || - A->typecode != X->typecode) - return error_set(ctx->err, GA_VALUE_ERROR, "Invalid dims or inconsistent types"); + if (X->nd != 1 || Y->nd != 1 || A->nd != 2) + return error_fmt(ctx->err, GA_VALUE_ERROR, + "Wrong number of dimensions: X->nd = %u (expected 1), Y->nd = %u (expected 1), A->nd = %u (expected 2)", + X->nd, Y->nd, A->nd); + if (Y->typecode != X->typecode || A->typecode != X->typecode) + return error_set(ctx->err, GA_VALUE_ERROR, "Inconsistent dtypes"); if (!(X->flags & GA_ALIGNED) || !(Y->flags & GA_ALIGNED) || !(A->flags & GA_ALIGNED)) @@ -479,12 +487,14 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph size_t i; if (A->typecode != GA_FLOAT && A->typecode != 
GA_DOUBLE) - return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported type"); + return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported dtype"); - if (A->nd != 3 || B->nd != 3 || C->nd != 3 || - B->typecode != A->typecode || - C->typecode != A->typecode) - return error_set(ctx->err, GA_VALUE_ERROR, "Invalid dims or inconsistent types"); + if (A->nd != 3 || B->nd != 3 || C->nd != 3) + return error_fmt(ctx->err, GA_VALUE_ERROR, + "Wrong number of dimensions: A->nd = %u (expected 3), B->nd = %u (expected 3), C->nd = %u (expected 3)", + A->nd, B->nd, C->nd); + if (B->typecode != A->typecode || C->typecode != A->typecode) + return error_set(ctx->err, GA_VALUE_ERROR, "Inconsistent dtypes"); if (!(A->flags & GA_ALIGNED) || !(B->flags & GA_ALIGNED) || !(C->flags & GA_ALIGNED)) @@ -558,7 +568,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph ? Cp->strides[1] / elsize : Cp->dimensions[2]; } else { - err = error_set(ctx->err, GA_VALUE_ERROR, "Invalid internal result for C"); + err = error_set(ctx->err, GA_MISC_ERROR, "Invalid internal result for C"); goto cleanup; } if (cA == 2) { @@ -582,7 +592,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph transA = cb_no_trans; } } else { - err = error_set(ctx->err, GA_VALUE_ERROR, "Invalid internal result for A"); + err = error_set(ctx->err, GA_MISC_ERROR, "Invalid internal result for A"); goto cleanup; } if (cB == 2) { @@ -606,7 +616,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph transB = cb_no_trans; } } else { - err = error_set(ctx->err, GA_VALUE_ERROR, "Invalid internal result for B"); + err = error_set(ctx->err, GA_MISC_ERROR, "Invalid internal result for B"); goto cleanup; } diff --git a/src/gpuarray_array_collectives.c b/src/gpuarray_array_collectives.c index c53249bbd3..52d243da5a 100644 --- a/src/gpuarray_array_collectives.c +++ b/src/gpuarray_array_collectives.c @@ -37,8 +37,10 @@ static inline int 
check_gpuarrays(int times_src, const GpuArray* src, return error_set(ctx->err, GA_VALUE_ERROR, "Size mismatch for transfer"); if (src->typecode != dest->typecode) return error_set(ctx->err, GA_VALUE_ERROR, "Type mismatch"); - if (!GpuArray_ISALIGNED(src) || !GpuArray_CHKFLAGS(dest, GA_BEHAVED)) - return error_set(ctx->err, GA_UNALIGNED_ERROR, "Misbehaved arrays"); + if (!GpuArray_ISALIGNED(src) || !GpuArray_ISALIGNED(dest)) + return error_set(ctx->err, GA_UNALIGNED_ERROR, "Unaligned arrays"); + if (!GpuArray_ISWRITEABLE(dest)) + return error_set(ctx->err, GA_INVALID_ERROR, "Unwritable destination"); if (times_src >= times_dest) *count = count_src; diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index 261d46ca24..dc1d837d3b 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -330,7 +330,7 @@ static int check_basic(GpuElemwise *ge, void **args, int flags, /* We can't broadcast outputs */ if (ISCLR(flags, GE_BROADCAST) || is_output(ge->args[i]) || v->dimensions[j] != 1) { - return error_fmt(ctx->err, GA_VALUE_ERROR, "Mismatched dimension %u for input %u", j, i); + return error_fmt(ctx->err, GA_VALUE_ERROR, "Mismatched dimension %u for input %u (expected %" SPREFIX "u got %" SPREFIX "u)", j, i, ge->dims[j], v->dimensions[j]); } } /* If the dimension is 1 set the strides to 0 regardless since @@ -543,10 +543,10 @@ static int check_contig(GpuElemwise *ge, void **args, f_contig &= GpuArray_IS_F_CONTIGUOUS(v); if (a != v) { if (a->nd != v->nd) - return error_set(ctx->err, GA_INVALID_ERROR, "Mismatched nd"); + return error_fmt(ctx->err, GA_INVALID_ERROR, "Mismatched nd for input %u (expected %u, got %u)", i, a->nd, v->nd); for (j = 0; j < a->nd; j++) { if (v->dimensions[j] != a->dimensions[j]) - return error_fmt(ctx->err, GA_VALUE_ERROR, "Mismatched dimension %u", j); + return error_fmt(ctx->err, GA_VALUE_ERROR, "Mismatched dimension %u (expected %" SPREFIX "u, got %" SPREFIX "u)", j, a->dimensions[j], v->dimensions[j]); } } } From 
9c2e317646b885fe28e1f5da1be50e4b6156f2bd Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Fri, 12 May 2017 15:15:47 -0400 Subject: [PATCH 355/597] First version that execute, but give wrong result! --- src/gpuarray/blas.h | 1 + src/gpuarray/buffer_blas.h | 9 ++++ src/gpuarray_array_blas.c | 17 +++++- src/gpuarray_blas_cuda_cublas.c | 86 +++++++++++++++++++++++++++++- src/gpuarray_blas_opencl_clblas.c | 1 + src/gpuarray_blas_opencl_clblast.c | 1 + src/gpuarray_buffer_blas.c | 23 +++++++- src/loaders/libcublas.fn | 2 + src/loaders/libcublas.h | 5 ++ src/private.h | 7 +++ 10 files changed, 149 insertions(+), 3 deletions(-) diff --git a/src/gpuarray/blas.h b/src/gpuarray/blas.h index a59d3bb885..f76b209a97 100644 --- a/src/gpuarray/blas.h +++ b/src/gpuarray/blas.h @@ -34,6 +34,7 @@ GPUARRAY_PUBLIC int GpuArray_rger(double alpha, GpuArray *X, GpuArray *Y, GPUARRAY_PUBLIC int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alpha, GpuArray *A, GpuArray *B, double beta, GpuArray *C, int nocopy); +#define GpuArray_hgemmBatch_3d GpuArray_rgemmBatch_3d #define GpuArray_sgemmBatch_3d GpuArray_rgemmBatch_3d #define GpuArray_dgemmBatch_3d GpuArray_rgemmBatch_3d diff --git a/src/gpuarray/buffer_blas.h b/src/gpuarray/buffer_blas.h index 859ede62b7..af4c921e37 100644 --- a/src/gpuarray/buffer_blas.h +++ b/src/gpuarray/buffer_blas.h @@ -115,6 +115,15 @@ GPUARRAY_PUBLIC int gpublas_hgemmBatch( float beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount, int flags); +//TODO: float should be half +GPUARRAY_PUBLIC int gpublas_hgemmStridedBatch( + cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, float alpha, + gpudata *A, size_t lda, ssize_t strideA, + gpudata *B, size_t ldb, ssize_t strideB, + float beta, gpudata *C, size_t ldc, ssize_t strideC, + size_t batchCount, int flags); + GPUARRAY_PUBLIC int gpublas_sgemmBatch( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, 
float alpha, diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index 9cfdb58af0..d8e10b7a88 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -486,7 +486,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph size_t *A_offsets = NULL, *B_offsets = NULL, *C_offsets = NULL; size_t i; - if (A->typecode != GA_FLOAT && A->typecode != GA_DOUBLE) + if (A->typecode != GA_FLOAT && A->typecode != GA_DOUBLE && A->typecode != GA_HALF) return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported dtype"); if (A->nd != 3 || B->nd != 3 || C->nd != 3) @@ -625,6 +625,21 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph if (err != GA_NO_ERROR) goto cleanup; + if(C->typecode == GA_HALF){ + //TODO: handle offset + assert (Ap->offset == 0); + assert (Bp->offset == 0); + assert (Cp->offset == 0); + //TODO: float should be half + err = gpublas_hgemmStridedBatch(o, transA, transB, m, n, k, alpha, + Ap->data, lda, Ap->strides[0]/elsize, + Bp->data, ldb, Bp->strides[0]/elsize, + beta, + Cp->data, ldc, Cp->strides[0]/elsize, + batchCount, 0); + goto cleanup; + } + A_datas = (gpudata**)malloc(batchCount * sizeof(gpudata*)); B_datas = (gpudata**)malloc(batchCount * sizeof(gpudata*)); C_datas = (gpudata**)malloc(batchCount * sizeof(gpudata*)); diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 3d4cdf16bf..413b079687 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -510,6 +510,89 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, cuda_exit(ctx); return GA_NO_ERROR; } +//TODO: change float to half +static int hgemmStridedBatch(cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, float alpha, + gpudata *A, size_t lda, ssize_t strideA, + gpudata *B, size_t ldb, ssize_t strideB, + float beta, gpudata *C, size_t ldc, ssize_t strideC, + size_t batchCount) { + 
cuda_context *ctx; + blas_handle *h; + size_t t; + ssize_t lt; + gpudata *T; + cb_transpose transT; + cublasStatus_t err; + __half halpha, hbeta; + + //ignore overflow, underflow, denormalized and inf values. Mayve also nan. + uint32_t x = (uint32_t)alpha; + alpha = ((x>>16)&0x8000)|((((x&0x7f800000)-0x38000000)>>13)&0x7c00)|((x>>13)&0x03ff); + x = (uint32_t)beta; + beta = ((x>>16)&0x8000)|((((x&0x7f800000)-0x38000000)>>13)&0x7c00)|((x>>13)&0x03ff); + + ASSERT_BUF(A); + if (cublasHgemmStridedBatched == NULL) + return GA_DEVSUP_ERROR; + + ctx = A->ctx; + // TODO: stride* are long long int in cuda, LARGE_VAL check for int. + if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || + LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || + LARGE_VAL(strideA) || LARGE_VAL(strideB) || LARGE_VAL(strideC) || + LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) + return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); + + h = (blas_handle *)ctx->blas_handle; + cuda_enter(ctx); + + if (order == cb_c) { + /* swap A and B */ + t = N; + N = M; + M = t; + T = A; + A = B; + B = T; + t = lda; + lda = ldb; + ldb = t; + transT = transA; + transA = transB; + transB = transT; + lt = strideA; + strideA = strideB; + strideB = lt; + } + + ASSERT_BUF(A); + ASSERT_BUF(B); + ASSERT_BUF(C); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL)); + raise(SIGINT); + err = cublasHgemmStridedBatched(h->h, + convT(transA), convT(transB), + M, N, K, &halpha, + (__half *)(A->ptr), (int) lda, strideA, + (__half *)(B->ptr), (int) ldb, strideB, + &hbeta, + (__half *)(C->ptr), (int) ldc, strideB, + batchCount); + if (err != CUBLAS_STATUS_SUCCESS) { + cuda_exit(ctx); + return error_cublas(ctx->err, "cublasHgemmStridedBatched", err); + } + + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ)); + 
GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C, CUDA_WAIT_ALL)); + + cuda_exit(ctx); + return GA_NO_ERROR; +} static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, @@ -1578,5 +1661,6 @@ gpuarray_blas_ops cublas_ops = { dgemvBatch, NULL, /* hgerBatch */ sgerBatch, - dgerBatch + dgerBatch, + hgemmStridedBatch, }; diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index ab0e020ffd..926356b985 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -449,4 +449,5 @@ gpuarray_blas_ops clblas_ops = { NULL, /* hgerBatch */ NULL, /* sgerBatch */ NULL, /* dgerBatch */ + NULL, /* hgemmStridedzBatch */ }; diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c index 13fca18190..debe5e1e70 100644 --- a/src/gpuarray_blas_opencl_clblast.c +++ b/src/gpuarray_blas_opencl_clblast.c @@ -524,4 +524,5 @@ gpuarray_blas_ops clblast_ops = { NULL, /* hgerBatch */ NULL, /* sgerBatch */ NULL, /* dgerBatch */ + NULL, /* hgemmStridedzBatch */ }; diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c index 595ac8d5a4..23b6debcfb 100644 --- a/src/gpuarray_buffer_blas.c +++ b/src/gpuarray_buffer_blas.c @@ -17,13 +17,21 @@ const char *gpublas_error(gpucontext *ctx) { return ctx->err->msg; } -#define BLAS_OP(buf,name, args) \ +#define BLAS_OP(buf, name, args) \ gpucontext *ctx = gpudata_context(buf); \ if (ctx->blas_ops->name) \ return ctx->blas_ops->name args; \ else \ return error_fmt(ctx->err, GA_DEVSUP_ERROR, "Blas operation not supported by device or missing library: %s", #name) +#define BLAS_OPF(buf, name, args) \ + gpucontext *ctx = gpudata_context(buf); \ + if (flags != 0) return error_set(ctx->err, GA_INVALID_ERROR, "flags is not 0"); \ + if (ctx->blas_ops->name) \ + return ctx->blas_ops->name args; \ + else \ + return error_fmt(ctx->err, GA_DEVSUP_ERROR, 
"Blas operation not supported by device or missing library: %s", #name) + int gpublas_hdot( size_t N, @@ -161,6 +169,19 @@ int gpublas_hgemmBatch( B, offB, ldb, beta, C, offC, ldc, batchCount)); } +//TODO: use half and not float here. +int gpublas_hgemmStridedBatch( + cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, float alpha, + gpudata *A, size_t lda, ssize_t strideA, + gpudata *B, size_t ldb, ssize_t strideB, + float beta, gpudata *C, size_t ldc, ssize_t strideC, + size_t batchCount, int flags) { + BLAS_OPF(A, hgemmStridedBatch, + (order, transA, transB, M, N, K, alpha, A, lda, strideA, + B, ldb, strideB, beta, C, ldc, strideC, batchCount)); +} + int gpublas_sgemmBatch( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, diff --git a/src/loaders/libcublas.fn b/src/loaders/libcublas.fn index c0dbddf41e..31eb43b573 100644 --- a/src/loaders/libcublas.fn +++ b/src/loaders/libcublas.fn @@ -23,3 +23,5 @@ DEF_PROC_OPT(cublasSgemmEx, (cublasHandle_t handle, cublasOperation_t transa, cu DEF_PROC(cublasSgemmBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *Aarray[], int lda, const float *Barray[], int ldb, const float *beta, float *Carray[], int ldc, int batchCount)); DEF_PROC(cublasDgemmBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double *alpha, const double *Aarray[], int lda, const double *Barray[], int ldb, const double *beta, double *Carray[], int ldc, int batchCount)); + +DEF_PROC(cublasHgemmStridedBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half *alpha, const __half *A, int lda, long long int strideA, const __half *B, int ldb, long long int strideB, const __half *beta, __half *C, int ldc, long long int strideC, int batchCount)); diff --git 
a/src/loaders/libcublas.h b/src/loaders/libcublas.h index a0cf9e5084..f6f46963dc 100644 --- a/src/loaders/libcublas.h +++ b/src/loaders/libcublas.h @@ -2,6 +2,11 @@ #define LOADER_LIBCUBLAS_H #include "util/error.h" +//TODO: how to have it work with align? +typedef struct {//__align__(2) { + unsigned short x; +} __half; + /** @cond NEVER */ diff --git a/src/private.h b/src/private.h index 4286d3fd21..b81c7b6906 100644 --- a/src/private.h +++ b/src/private.h @@ -214,6 +214,13 @@ struct _gpuarray_blas_ops { gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags); + //TODO: float should be half + int (*hgemmStridedBatch)(cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, float alpha, + gpudata *A, size_t lda, ssize_t strideA, + gpudata *B, size_t ldb, ssize_t strideB, + float beta, gpudata *C, size_t ldc, ssize_t strideC, + size_t batchCount); }; struct _gpuarray_comm_ops { From 53a25f6670407068a839cbd57c58e7633496cf56 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 20 Jul 2016 18:50:18 -0400 Subject: [PATCH 356/597] Add conversion function from float to half on the host. 
--- src/gpuarray/util.h | 84 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/src/gpuarray/util.h b/src/gpuarray/util.h index 04761ab37c..988aee9083 100644 --- a/src/gpuarray/util.h +++ b/src/gpuarray/util.h @@ -98,6 +98,90 @@ GPUARRAY_PUBLIC void gpuarray_elemwise_collapse(unsigned int n, unsigned int *nd, size_t *dim, ssize_t **strs); + +typedef uint16_t ga_half_t; + +/* code strongly inspired from + https://github.com/numpy/numpy/blob/master/numpy/core/src/npymath/halffloat.c#L246 */ + +static inline ga_half_t ga_float2half(float f) { + union { + float f; + uint32_t bits; + } bf; + union { + ga_half_t h; + uint16_t bits; + } bh; + + uint32_t f_exp, f_sig; + uint16_t h_sgn, h_exp, h_sig; + + bf.f = f; + + h_sgn = (bf.bits&0x80000000u) >> 16; + f_exp = (bf.bits&0x7f800000u); + + /* Exponent overflow/NaN converts to signed inf/NaN */ + if (f_exp >= 0x47800000u) { + if (f_exp == 0x7f800000u) { + /* Inf or NaN */ + f_sig = (bf.bits&0x007fffffu); + if (f_sig != 0) { + /* NaN - propagate the flag in the significand... */ + bh.bits = (uint16_t) (0x7c00u + (f_sig >> 13)); + /* ...but make sure it stays a NaN */ + if (bh.bits == 0x7c00u) { + bh.bits++; + } + bh.bits += h_sgn; + return bh.h; + } else { + /* signed inf */ + bh.bits = h_sgn + 0x7c00u; + return bh.h; + } + } else { + bh.bits = h_sgn + 0x7c00u; + return bh.h; + } + } + + if (f_exp <= 0x38000000u) { + /* + * Signed zeros, subnormal floats, and floats with small + * exponents all convert to signed zero halfs. + */ + if (f_exp < 0x33000000u) { + bh.bits = h_sgn; + return bh.h; + } + /* Make the subnormal significand */ + f_exp >>= 23; + f_sig = (0x00800000u + (bf.bits&0x007fffffu)); + f_sig >>= (113 - f_exp); + /* Handle rounding by adding 1 to the bit beyond half precision */ + f_sig += 0x00001000u; + h_sig = (uint16_t) (f_sig >> 13); + /* + * If the rounding causes a bit to spill into h_exp, it will + * increment h_exp from zero to one and h_sig will be zero. 
+ * This is the correct result. + */ + bh.bits = h_sgn + h_sig; + return bh.h; + } + + /* Regular case with no overflow or underflow */ + h_exp = (uint16_t) ((f_exp - 0x38000000u) >> 13); + /* Handle rounding by adding 1 to the bit beyond half precision */ + f_sig = (bf.bits&0x007fffffu); + f_sig += 0x00001000u; + h_sig = (uint16_t) (f_sig >> 13); + bh.bits = h_sgn + h_exp + h_sig; + return bh.h; +} + #ifdef __cplusplus } #endif From ed74310315da0b11372eeac41f0fb7106cffd613 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 21 Jul 2016 14:16:06 -0400 Subject: [PATCH 357/597] Add test for conversion function. --- tests/check_util.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/check_util.c b/tests/check_util.c index 76885f8e8c..b534421308 100644 --- a/tests/check_util.c +++ b/tests/check_util.c @@ -118,12 +118,42 @@ START_TEST(test_elemwise_collapse) { } END_TEST +START_TEST(test_float2half) { + const float f[] = { + 2.9831426e-08f, + 2e-25f, + 2e-26f, + 1.0005035f, + 1.0002441f, + 65519.f, + 65520.f, + }; + const ga_half_t h[] = { + 0x0001u, /* 2e-24 */ + 0x0000u, /* 0 */ + 0x0000u, /* 0 */ + 0x3c01u, /* 1.0 + 2e-10 */ + 0x3c00u, /* 1.0 */ + 0x7bffu, /* 65504 */ + 0x7c00u, /* Inf */ + }; + unsigned int i; + ga_half_t hr; + + for (i = 0; i < sizeof(f)/sizeof(f[0]); i++) { + hr = ga_float2half(f[i]); + ck_assert_int_eq(hr, h[i]); + } +} +END_TEST + Suite *get_suite(void) { Suite *s = suite_create("util"); TCase *tc = tcase_create("All"); tcase_add_test(tc, test_register_type); tcase_add_test(tc, test_type_flags); tcase_add_test(tc, test_elemwise_collapse); + tcase_add_test(tc, test_float2half); suite_add_tcase(s, tc); return s; } From ddb016b801558a5d6d93f2e8c6a18b20c0a4f7cc Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 5 Jun 2017 17:10:48 -0400 Subject: [PATCH 358/597] Add in functions for 3d batch gemm for all dtypes and make them optional. 
--- src/loaders/libcublas.fn | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/loaders/libcublas.fn b/src/loaders/libcublas.fn index 31eb43b573..c181c9f9b1 100644 --- a/src/loaders/libcublas.fn +++ b/src/loaders/libcublas.fn @@ -24,4 +24,6 @@ DEF_PROC_OPT(cublasSgemmEx, (cublasHandle_t handle, cublasOperation_t transa, cu DEF_PROC(cublasSgemmBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *Aarray[], int lda, const float *Barray[], int ldb, const float *beta, float *Carray[], int ldc, int batchCount)); DEF_PROC(cublasDgemmBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double *alpha, const double *Aarray[], int lda, const double *Barray[], int ldb, const double *beta, double *Carray[], int ldc, int batchCount)); -DEF_PROC(cublasHgemmStridedBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half *alpha, const __half *A, int lda, long long int strideA, const __half *B, int ldb, long long int strideB, const __half *beta, __half *C, int ldc, long long int strideC, int batchCount)); +DEF_PROC_OPT(cublasHgemmStridedBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const __half *alpha, const __half *A, int lda, long long int strideA, const __half *B, int ldb, long long int strideB, const __half *beta, __half *C, int ldc, long long int strideC, int batchCount)); +DEF_PROC_OPT(cublasSgemmStridedBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, long long int strideA, const float *B, int ldb, long long int strideB, const float *beta, float *C, int ldc, long long int strideC, int batchCount)); +DEF_PROC_OPT(cublasDgemmStridedBatched, (cublasHandle_t handle, cublasOperation_t transa, 
cublasOperation_t transb, int m, int n, int k, const double *alpha, const double *A, int lda, long long int strideA, const double *B, int ldb, long long int strideB, const double *beta, double *C, int ldc, long long int strideC, int batchCount)); From 0767181037067a362a95e5627585ca80a39d3485 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 5 Jun 2017 17:39:50 -0400 Subject: [PATCH 359/597] Add gemm3D for batch gemm of 3d matrices. --- src/gpuarray/buffer_blas.h | 19 ++- src/gpuarray_blas_cuda_cublas.c | 216 +++++++++++++++++++++++++---- src/gpuarray_blas_opencl_clblas.c | 4 +- src/gpuarray_blas_opencl_clblast.c | 4 +- src/gpuarray_buffer_blas.c | 50 +++++-- src/private.h | 25 +++- 6 files changed, 264 insertions(+), 54 deletions(-) diff --git a/src/gpuarray/buffer_blas.h b/src/gpuarray/buffer_blas.h index af4c921e37..5a7e8ba483 100644 --- a/src/gpuarray/buffer_blas.h +++ b/src/gpuarray/buffer_blas.h @@ -115,8 +115,7 @@ GPUARRAY_PUBLIC int gpublas_hgemmBatch( float beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount, int flags); -//TODO: float should be half -GPUARRAY_PUBLIC int gpublas_hgemmStridedBatch( +GPUARRAY_PUBLIC int gpublas_hgemm3D( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t lda, ssize_t strideA, @@ -124,6 +123,22 @@ GPUARRAY_PUBLIC int gpublas_hgemmStridedBatch( float beta, gpudata *C, size_t ldc, ssize_t strideC, size_t batchCount, int flags); +GPUARRAY_PUBLIC int gpublas_sgemm3D( + cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, float alpha, + gpudata *A, size_t lda, ssize_t strideA, + gpudata *B, size_t ldb, ssize_t strideB, + float beta, gpudata *C, size_t ldc, ssize_t strideC, + size_t batchCount, int flags); + +GPUARRAY_PUBLIC int gpublas_dgemm3D( + cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, double alpha, + gpudata *A, size_t lda, ssize_t strideA, + gpudata *B, 
size_t ldb, ssize_t strideB, + double beta, gpudata *C, size_t ldc, ssize_t strideC, + size_t batchCount, int flags); + GPUARRAY_PUBLIC int gpublas_sgemmBatch( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 413b079687..b71b756822 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -510,37 +510,33 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, cuda_exit(ctx); return GA_NO_ERROR; } -//TODO: change float to half -static int hgemmStridedBatch(cb_order order, cb_transpose transA, cb_transpose transB, - size_t M, size_t N, size_t K, float alpha, - gpudata *A, size_t lda, ssize_t strideA, - gpudata *B, size_t ldb, ssize_t strideB, - float beta, gpudata *C, size_t ldc, ssize_t strideC, - size_t batchCount) { + +static int hgemm3D(cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, float alpha, + gpudata *A, size_t offA, size_t lda, ssize_t strideA, + gpudata *B, size_t offB, size_t ldb, ssize_t strideB, + float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, + size_t batchCount) { cuda_context *ctx; blas_handle *h; size_t t; - ssize_t lt; + ssize_t st; gpudata *T; cb_transpose transT; cublasStatus_t err; - __half halpha, hbeta; - - //ignore overflow, underflow, denormalized and inf values. Mayve also nan. - uint32_t x = (uint32_t)alpha; - alpha = ((x>>16)&0x8000)|((((x&0x7f800000)-0x38000000)>>13)&0x7c00)|((x>>13)&0x03ff); - x = (uint32_t)beta; - beta = ((x>>16)&0x8000)|((((x&0x7f800000)-0x38000000)>>13)&0x7c00)|((x>>13)&0x03ff); - + ga_half_t halpha, hbeta; + ASSERT_BUF(A); - if (cublasHgemmStridedBatched == NULL) - return GA_DEVSUP_ERROR; + ASSERT_BUF(B); + ASSERT_BUF(C); ctx = A->ctx; - // TODO: stride* are long long int in cuda, LARGE_VAL check for int. 
+ + if (cublasHgemmStridedBatched == NULL) + return error_set(ctx->error, GA_DEVSUP_ERROR, "cublasHgemmStridedBatched not available in your version of cuBLAS"); + if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || - LARGE_VAL(strideA) || LARGE_VAL(strideB) || LARGE_VAL(strideC) || LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); @@ -558,28 +554,108 @@ static int hgemmStridedBatch(cb_order order, cb_transpose transA, cb_transpose t t = lda; lda = ldb; ldb = t; + t = offA; + offA = offB; + offB = t; transT = transA; transA = transB; transB = transT; - lt = strideA; + st = strideA; strideA = strideB; - strideB = lt; + strideB = st; } - ASSERT_BUF(A); - ASSERT_BUF(B); - ASSERT_BUF(C); + halpha = ga_float2half(alpha); + hbeta = ga_float2half(beta); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL)); - raise(SIGINT); err = cublasHgemmStridedBatched(h->h, convT(transA), convT(transB), M, N, K, &halpha, - (__half *)(A->ptr), (int) lda, strideA, - (__half *)(B->ptr), (int) ldb, strideB, + ((__half *)A->ptr) + offA, lda, strideA, + ((__half *)B->ptr) + offB, ldb, strideB, &hbeta, - (__half *)(C->ptr), (int) ldc, strideB, + ((__half *)C->ptr) + offC, ldc, strideB, + batchCount); + if (err != CUBLAS_STATUS_SUCCESS) { + cuda_exit(ctx); + return error_cublas(ctx->err, "cublasHgemmStridedBatched", err); + } + + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C, CUDA_WAIT_ALL)); + + cuda_exit(ctx); + return GA_NO_ERROR; +} + +static int sgemm3D(cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, float alpha, + gpudata *A, size_t offA, 
size_t lda, ssize_t strideA, + gpudata *B, size_t offB, size_t ldb, ssize_t strideB, + float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, + size_t batchCount) { + cuda_context *ctx; + blas_handle *h; + size_t t; + ssize_t st; + gpudata *T; + cb_transpose transT; + cublasStatus_t err; + + ASSERT_BUF(A); + ASSERT_BUF(B); + ASSERT_BUF(C); + + ctx = A->ctx; + + if (cublasSgemmStridedBatched == NULL) + return error_set(ctx->error, GA_DEVSUP_ERROR, "cublasSgemmStridedBatched not available in your version of cuBLAS"); + + if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || + LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || + LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) + return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); + + h = (blas_handle *)ctx->blas_handle; + cuda_enter(ctx); + + if (order == cb_c) { + /* swap A and B */ + t = N; + N = M; + M = t; + T = A; + A = B; + B = T; + t = lda; + lda = ldb; + ldb = t; + t = offA; + offA = offB; + offB = t; + transT = transA; + transA = transB; + transB = transT; + st = strideA; + strideA = strideB; + strideB = st; + } + + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL)); + + err = cublasSgemmStridedBatched(h->h, + convT(transA), convT(transB), + M, N, K, &alpha, + ((float *)A->ptr) + offA, (int)lda, strideA, + ((float *)B->ptr) + offB, (int)ldb, strideB, + &beta, + ((float *)C->ptr) + offC, (int)ldc, strideB, batchCount); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); @@ -594,6 +670,84 @@ static int hgemmStridedBatch(cb_order order, cb_transpose transA, cb_transpose t return GA_NO_ERROR; } +static int dgemm3D(cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, double alpha, + gpudata *A, size_t offA, size_t lda, ssize_t strideA, + gpudata *B, size_t offB, size_t ldb, 
ssize_t strideB, + double beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, + size_t batchCount) { + cuda_context *ctx; + blas_handle *h; + size_t t; + ssize_t st; + gpudata *T; + cb_transpose transT; + cublasStatus_t err; + + ASSERT_BUF(A); + ASSERT_BUF(B); + ASSERT_BUF(C); + + ctx = A->ctx; + + if (cublasDgemmStridedBatched == NULL) + return error_set(ctx->error, GA_DEVSUP_ERROR, "cublasDgemmStridedBatched not available in your version of cuBLAS"); + + if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || + LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || + LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) + return error_set(ctx->err, GA_XLARGE_ERROR, "Passed-in sizes would overflow the ints in the cublas interface"); + + h = (blas_handle *)ctx->blas_handle; + cuda_enter(ctx); + + if (order == cb_c) { + /* swap A and B */ + t = N; + N = M; + M = t; + T = A; + A = B; + B = T; + t = lda; + lda = ldb; + ldb = t; + t = offA; + offA = offB; + offB = t; + transT = transA; + transA = transB; + transB = transT; + st = strideA; + strideA = strideB; + strideB = st; + } + + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL)); + + err = cublasDgemmStridedBatched(h->h, + convT(transA), convT(transB), + M, N, K, &alpha, + ((double *)A->ptr) + offA, (int)lda, strideA, + ((double *)B->ptr) + offB, (int)ldb, strideB, + &beta, + ((double *)C->ptr) + offC, (int)ldc, strideB, + batchCount); + if (err != CUBLAS_STATUS_SUCCESS) { + cuda_exit(ctx); + return error_cublas(ctx->err, "cublasDgemmStridedBatched", err); + } + + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C, CUDA_WAIT_ALL)); + + cuda_exit(ctx); + return GA_NO_ERROR; +} + static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t 
N, size_t K, float alpha, gpudata **A, size_t *offA, size_t lda, @@ -1662,5 +1816,7 @@ gpuarray_blas_ops cublas_ops = { NULL, /* hgerBatch */ sgerBatch, dgerBatch, - hgemmStridedBatch, + hgemm3D, + sgemm3D, + dgemm3D }; diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 926356b985..e1a7128b13 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -449,5 +449,7 @@ gpuarray_blas_ops clblas_ops = { NULL, /* hgerBatch */ NULL, /* sgerBatch */ NULL, /* dgerBatch */ - NULL, /* hgemmStridedzBatch */ + NULL, /* hgemm3D */ + NULL, /* sgemm3D */ + NULL, /* dgemm3D */ }; diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c index debe5e1e70..8bd056e61e 100644 --- a/src/gpuarray_blas_opencl_clblast.c +++ b/src/gpuarray_blas_opencl_clblast.c @@ -524,5 +524,7 @@ gpuarray_blas_ops clblast_ops = { NULL, /* hgerBatch */ NULL, /* sgerBatch */ NULL, /* dgerBatch */ - NULL, /* hgemmStridedzBatch */ + NULL, /* hgemm3D */ + NULL, /* sgemm3D */ + NULL, /* dgemm3D */ }; diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c index 23b6debcfb..425b0bdd12 100644 --- a/src/gpuarray_buffer_blas.c +++ b/src/gpuarray_buffer_blas.c @@ -169,19 +169,6 @@ int gpublas_hgemmBatch( B, offB, ldb, beta, C, offC, ldc, batchCount)); } -//TODO: use half and not float here. 
-int gpublas_hgemmStridedBatch( - cb_order order, cb_transpose transA, cb_transpose transB, - size_t M, size_t N, size_t K, float alpha, - gpudata *A, size_t lda, ssize_t strideA, - gpudata *B, size_t ldb, ssize_t strideB, - float beta, gpudata *C, size_t ldc, ssize_t strideC, - size_t batchCount, int flags) { - BLAS_OPF(A, hgemmStridedBatch, - (order, transA, transB, M, N, K, alpha, A, lda, strideA, - B, ldb, strideB, beta, C, ldc, strideC, batchCount)); -} - int gpublas_sgemmBatch( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, @@ -271,3 +258,40 @@ int gpublas_dgerBatch(cb_order order, size_t M, size_t N, double alpha, (order, M, N, alpha, x, offX, incX, y, offY, incY, A, offA, lda, batchCount, flags)); } + + +int gpublas_hgemm3d( + cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, float alpha, + gpudata *A, size_t lda, ssize_t strideA, + gpudata *B, size_t ldb, ssize_t strideB, + float beta, gpudata *C, size_t ldc, ssize_t strideC, + size_t batchCount, int flags) { + BLAS_OPBF(A, hgemm3d, + (order, transA, transB, M, N, K, alpha, A, lda, strideA, + B, ldb, strideB, beta, C, ldc, strideC, batchCount)); +} + +int gpublas_sgemm3d( + cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, float alpha, + gpudata *A, size_t lda, ssize_t strideA, + gpudata *B, size_t ldb, ssize_t strideB, + float beta, gpudata *C, size_t ldc, ssize_t strideC, + size_t batchCount, int flags) { + BLAS_OPBF(A, sgemm3d, + (order, transA, transB, M, N, K, alpha, A, lda, strideA, + B, ldb, strideB, beta, C, ldc, strideC, batchCount)); +} + +int gpublas_dgemm3d( + cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, float alpha, + gpudata *A, size_t lda, ssize_t strideA, + gpudata *B, size_t ldb, ssize_t strideB, + float beta, gpudata *C, size_t ldc, ssize_t strideC, + size_t batchCount, int flags) { + BLAS_OPBF(A, dgemm3d, + 
(order, transA, transB, M, N, K, alpha, A, lda, strideA, + B, ldb, strideB, beta, C, ldc, strideC, batchCount)); +} diff --git a/src/private.h b/src/private.h index b81c7b6906..555c178753 100644 --- a/src/private.h +++ b/src/private.h @@ -214,13 +214,24 @@ struct _gpuarray_blas_ops { gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags); - //TODO: float should be half - int (*hgemmStridedBatch)(cb_order order, cb_transpose transA, cb_transpose transB, - size_t M, size_t N, size_t K, float alpha, - gpudata *A, size_t lda, ssize_t strideA, - gpudata *B, size_t ldb, ssize_t strideB, - float beta, gpudata *C, size_t ldc, ssize_t strideC, - size_t batchCount); + int (*hgemm3d)(cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, float alpha, + gpudata *A, size_t lda, ssize_t strideA, + gpudata *B, size_t ldb, ssize_t strideB, + float beta, gpudata *C, size_t ldc, ssize_t strideC, + size_t batchCount); + int (*sgemm3d)(cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, float alpha, + gpudata *A, size_t lda, ssize_t strideA, + gpudata *B, size_t ldb, ssize_t strideB, + float beta, gpudata *C, size_t ldc, ssize_t strideC, + size_t batchCount); + int (*dgemm3d)(cb_order order, cb_transpose transA, cb_transpose transB, + size_t M, size_t N, size_t K, double alpha, + gpudata *A, size_t lda, ssize_t strideA, + gpudata *B, size_t ldb, ssize_t strideB, + double beta, gpudata *C, size_t ldc, ssize_t strideC, + size_t batchCount); }; struct _gpuarray_comm_ops { From cb4a79f0e22ffc4883aa450a29b7d04ed54ec006 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 5 Jun 2017 17:49:13 -0400 Subject: [PATCH 360/597] Change GpuArray_rgemmBatch_3d to use the new gemm3d functions. 
--- src/gpuarray_array_blas.c | 67 +++++++++++---------------------------- 1 file changed, 18 insertions(+), 49 deletions(-) diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index d8e10b7a88..78f009a7c3 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -625,65 +625,34 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph if (err != GA_NO_ERROR) goto cleanup; - if(C->typecode == GA_HALF){ - //TODO: handle offset - assert (Ap->offset == 0); - assert (Bp->offset == 0); - assert (Cp->offset == 0); - //TODO: float should be half - err = gpublas_hgemmStridedBatch(o, transA, transB, m, n, k, alpha, - Ap->data, lda, Ap->strides[0]/elsize, - Bp->data, ldb, Bp->strides[0]/elsize, - beta, - Cp->data, ldc, Cp->strides[0]/elsize, - batchCount, 0); - goto cleanup; - } - - A_datas = (gpudata**)malloc(batchCount * sizeof(gpudata*)); - B_datas = (gpudata**)malloc(batchCount * sizeof(gpudata*)); - C_datas = (gpudata**)malloc(batchCount * sizeof(gpudata*)); - - A_offsets = (size_t*)malloc(batchCount * sizeof(size_t)); - B_offsets = (size_t*)malloc(batchCount * sizeof(size_t)); - C_offsets = (size_t*)malloc(batchCount * sizeof(size_t)); - - for (i = 0; i < batchCount; i++) { - A_datas[i] = Ap->data; - B_datas[i] = Bp->data; - C_datas[i] = Cp->data; - A_offsets[i] = (Ap->offset + i * Ap->strides[0]) / elsize; - B_offsets[i] = (Bp->offset + i * Bp->strides[0]) / elsize; - C_offsets[i] = (Cp->offset + i * Cp->strides[0]) / elsize; - } - switch (C->typecode) { case GA_HALF: - err = gpublas_hgemmBatch(o, transA, transB, m, n, k, (float)alpha, - A_datas, A_offsets, lda, - B_datas, B_offsets, ldb, - (float)beta, - C_datas, C_offsets, ldc, batchCount, 0); + err = gpublas_hgemm3d(o, transA, transB, m, n, k, (float)alpha, + Ap->data, Ap->offset/elsize, lda, Ap->strides[0]/elsize, + Bp->data, Bp->offset/elsize, ldb, Bp->strides[0]/elsize, + (float)beta, + Cp->data, Cp->offset/elsize, ldc, Cp->strides[0]/elsize, + batchCount, 
0); break; case GA_FLOAT: - err = gpublas_sgemmBatch(o, transA, transB, m, n, k, (float)alpha, - A_datas, A_offsets, lda, - B_datas, B_offsets, ldb, - (float)beta, - C_datas, C_offsets, ldc, batchCount, 0); + err = gpublas_sgemm3d(o, transA, transB, m, n, k, (float)alpha, + Ap->data, Ap->offset/elsize, lda, Ap->strides[0]/elsize, + Bp->data, Bp->offset/elsize, ldb, Bp->strides[0]/elsize, + (float)beta, + Cp->data, Cp->offset/elsize, ldc, Cp->strides[0]/elsize, + batchCount, 0); break; case GA_DOUBLE: - err = gpublas_dgemmBatch(o, transA, transB, m, n, k, (double)alpha, - A_datas, A_offsets, lda, - B_datas, B_offsets, ldb, - (double)beta, - C_datas, C_offsets, ldc, batchCount, 0); + err = gpublas_dgemm3d(o, transA, transB, m, n, k, (double)alpha, + Ap->data, Ap->offset/elsize, lda, Ap->strides[0]/elsize, + Bp->data, Bp->offset/elsize, ldb, Bp->strides[0]/elsize, + (double)beta, + Cp->data, Cp->offset/elsize, ldc, Cp->strides[0]/elsize, + batchCount, 0); break; } cleanup: - free(A_datas); free(B_datas); free(C_datas); - free(A_offsets); free(B_offsets); free(C_offsets); if (Ap == &copyA) GpuArray_clear(&copyA); if (Bp == &copyB) From 0b91bdf6118f4fc99dbcebe3d954453dc8d935c3 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 5 Jun 2017 18:37:55 -0400 Subject: [PATCH 361/597] Clean up problems. 
--- src/gpuarray/buffer_blas.h | 18 +++++------ src/gpuarray_array_blas.c | 9 ++---- src/gpuarray_blas_cuda_cublas.c | 16 +++++----- src/gpuarray_buffer_blas.c | 54 +++++++++++++++++++-------------- src/private.h | 24 +++++++-------- 5 files changed, 64 insertions(+), 57 deletions(-) diff --git a/src/gpuarray/buffer_blas.h b/src/gpuarray/buffer_blas.h index 5a7e8ba483..f7af64c478 100644 --- a/src/gpuarray/buffer_blas.h +++ b/src/gpuarray/buffer_blas.h @@ -118,25 +118,25 @@ GPUARRAY_PUBLIC int gpublas_hgemmBatch( GPUARRAY_PUBLIC int gpublas_hgemm3D( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, - gpudata *A, size_t lda, ssize_t strideA, - gpudata *B, size_t ldb, ssize_t strideB, - float beta, gpudata *C, size_t ldc, ssize_t strideC, + gpudata *A, size_t offA, size_t lda, ssize_t strideA, + gpudata *B, size_t offB, size_t ldb, ssize_t strideB, + float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, size_t batchCount, int flags); GPUARRAY_PUBLIC int gpublas_sgemm3D( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, - gpudata *A, size_t lda, ssize_t strideA, - gpudata *B, size_t ldb, ssize_t strideB, - float beta, gpudata *C, size_t ldc, ssize_t strideC, + gpudata *A, size_t offA, size_t lda, ssize_t strideA, + gpudata *B, size_t offB, size_t ldb, ssize_t strideB, + float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, size_t batchCount, int flags); GPUARRAY_PUBLIC int gpublas_dgemm3D( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, - gpudata *A, size_t lda, ssize_t strideA, - gpudata *B, size_t ldb, ssize_t strideB, - double beta, gpudata *C, size_t ldc, ssize_t strideC, + gpudata *A, size_t offA, size_t lda, ssize_t strideA, + gpudata *B, size_t offB, size_t ldb, ssize_t strideB, + double beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, size_t batchCount, int flags); 
GPUARRAY_PUBLIC int gpublas_sgemmBatch( diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index 78f009a7c3..01d96a1598 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -482,9 +482,6 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph cb_order o; int cA, cB, cC; int err; - gpudata **A_datas = NULL, **B_datas = NULL, **C_datas = NULL; - size_t *A_offsets = NULL, *B_offsets = NULL, *C_offsets = NULL; - size_t i; if (A->typecode != GA_FLOAT && A->typecode != GA_DOUBLE && A->typecode != GA_HALF) return error_set(ctx->err, GA_INVALID_ERROR, "Unsupported dtype"); @@ -627,7 +624,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph switch (C->typecode) { case GA_HALF: - err = gpublas_hgemm3d(o, transA, transB, m, n, k, (float)alpha, + err = gpublas_hgemm3D(o, transA, transB, m, n, k, (float)alpha, Ap->data, Ap->offset/elsize, lda, Ap->strides[0]/elsize, Bp->data, Bp->offset/elsize, ldb, Bp->strides[0]/elsize, (float)beta, @@ -635,7 +632,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph batchCount, 0); break; case GA_FLOAT: - err = gpublas_sgemm3d(o, transA, transB, m, n, k, (float)alpha, + err = gpublas_sgemm3D(o, transA, transB, m, n, k, (float)alpha, Ap->data, Ap->offset/elsize, lda, Ap->strides[0]/elsize, Bp->data, Bp->offset/elsize, ldb, Bp->strides[0]/elsize, (float)beta, @@ -643,7 +640,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph batchCount, 0); break; case GA_DOUBLE: - err = gpublas_dgemm3d(o, transA, transB, m, n, k, (double)alpha, + err = gpublas_dgemm3D(o, transA, transB, m, n, k, (double)alpha, Ap->data, Ap->offset/elsize, lda, Ap->strides[0]/elsize, Bp->data, Bp->offset/elsize, ldb, Bp->strides[0]/elsize, (double)beta, diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index b71b756822..4bf0f4762a 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ 
b/src/gpuarray_blas_cuda_cublas.c @@ -533,7 +533,7 @@ static int hgemm3D(cb_order order, cb_transpose transA, cb_transpose transB, ctx = A->ctx; if (cublasHgemmStridedBatched == NULL) - return error_set(ctx->error, GA_DEVSUP_ERROR, "cublasHgemmStridedBatched not available in your version of cuBLAS"); + return error_set(ctx->err, GA_DEVSUP_ERROR, "cublasHgemmStridedBatched not available in your version of cuBLAS"); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || @@ -573,11 +573,11 @@ static int hgemm3D(cb_order order, cb_transpose transA, cb_transpose transB, GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL)); err = cublasHgemmStridedBatched(h->h, convT(transA), convT(transB), - M, N, K, &halpha, + M, N, K, (__half *)&halpha, ((__half *)A->ptr) + offA, lda, strideA, ((__half *)B->ptr) + offB, ldb, strideB, - &hbeta, - ((__half *)C->ptr) + offC, ldc, strideB, + (__half *)&hbeta, + ((__half *)C->ptr) + offC, ldc, strideC, batchCount); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); @@ -613,7 +613,7 @@ static int sgemm3D(cb_order order, cb_transpose transA, cb_transpose transB, ctx = A->ctx; if (cublasSgemmStridedBatched == NULL) - return error_set(ctx->error, GA_DEVSUP_ERROR, "cublasSgemmStridedBatched not available in your version of cuBLAS"); + return error_set(ctx->err, GA_DEVSUP_ERROR, "cublasSgemmStridedBatched not available in your version of cuBLAS"); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || @@ -655,7 +655,7 @@ static int sgemm3D(cb_order order, cb_transpose transA, cb_transpose transB, ((float *)A->ptr) + offA, (int)lda, strideA, ((float *)B->ptr) + offB, (int)ldb, strideB, &beta, - ((float *)C->ptr) + offC, (int)ldc, strideB, + ((float *)C->ptr) + offC, (int)ldc, strideC, batchCount); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); @@ -691,7 +691,7 @@ static int dgemm3D(cb_order order, cb_transpose transA, cb_transpose 
transB, ctx = A->ctx; if (cublasDgemmStridedBatched == NULL) - return error_set(ctx->error, GA_DEVSUP_ERROR, "cublasDgemmStridedBatched not available in your version of cuBLAS"); + return error_set(ctx->err, GA_DEVSUP_ERROR, "cublasDgemmStridedBatched not available in your version of cuBLAS"); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || @@ -733,7 +733,7 @@ static int dgemm3D(cb_order order, cb_transpose transA, cb_transpose transB, ((double *)A->ptr) + offA, (int)lda, strideA, ((double *)B->ptr) + offB, (int)ldb, strideB, &beta, - ((double *)C->ptr) + offC, (int)ldc, strideB, + ((double *)C->ptr) + offC, (int)ldc, strideC, batchCount); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c index 425b0bdd12..bbe4b96039 100644 --- a/src/gpuarray_buffer_blas.c +++ b/src/gpuarray_buffer_blas.c @@ -260,38 +260,48 @@ int gpublas_dgerBatch(cb_order order, size_t M, size_t N, double alpha, } -int gpublas_hgemm3d( +#define BLAS_OP3F(b, name, args) \ + gpucontext *ctx; \ + if (batchCount == 0) return GA_NO_ERROR; \ + ctx = gpudata_context(b); \ + if (flags != 0) return error_set(ctx->err, GA_INVALID_ERROR, "flags is not 0"); \ + if (ctx->blas_ops->name) \ + return ctx->blas_ops->name args; \ + else \ + return error_fmt(ctx->err, GA_DEVSUP_ERROR, "Blas operation not supported by library in use: %s", #name) + +int gpublas_hgemm3D( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, - gpudata *A, size_t lda, ssize_t strideA, - gpudata *B, size_t ldb, ssize_t strideB, - float beta, gpudata *C, size_t ldc, ssize_t strideC, + gpudata *A, size_t offA, size_t lda, ssize_t strideA, + gpudata *B, size_t offB, size_t ldb, ssize_t strideB, + float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, size_t batchCount, int flags) { - BLAS_OPBF(A, hgemm3d, - (order, transA, transB, M, N, K, alpha, A, lda, 
strideA, - B, ldb, strideB, beta, C, ldc, strideC, batchCount)); + BLAS_OP3F(A, hgemm3D, + (order, transA, transB, M, N, K, alpha, A, offA, lda, strideA, + B, offB, ldb, strideB, beta, C, offC, ldc, strideC, batchCount)); } -int gpublas_sgemm3d( +int gpublas_sgemm3D( cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, - gpudata *A, size_t lda, ssize_t strideA, - gpudata *B, size_t ldb, ssize_t strideB, - float beta, gpudata *C, size_t ldc, ssize_t strideC, + gpudata *A, size_t offA, size_t lda, ssize_t strideA, + gpudata *B, size_t offB, size_t ldb, ssize_t strideB, + float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, size_t batchCount, int flags) { - BLAS_OPBF(A, sgemm3d, - (order, transA, transB, M, N, K, alpha, A, lda, strideA, - B, ldb, strideB, beta, C, ldc, strideC, batchCount)); + BLAS_OP3F(A, sgemm3D, + (order, transA, transB, M, N, K, alpha, A, offA, lda, strideA, + B, offB, ldb, strideB, beta, C, offC, ldc, strideC, batchCount)); } -int gpublas_dgemm3d( +int gpublas_dgemm3D( cb_order order, cb_transpose transA, cb_transpose transB, - size_t M, size_t N, size_t K, float alpha, - gpudata *A, size_t lda, ssize_t strideA, - gpudata *B, size_t ldb, ssize_t strideB, - float beta, gpudata *C, size_t ldc, ssize_t strideC, + size_t M, size_t N, size_t K, double alpha, + gpudata *A, size_t offA, size_t lda, ssize_t strideA, + gpudata *B, size_t offB, size_t ldb, ssize_t strideB, + double beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, size_t batchCount, int flags) { - BLAS_OPBF(A, dgemm3d, - (order, transA, transB, M, N, K, alpha, A, lda, strideA, - B, ldb, strideB, beta, C, ldc, strideC, batchCount)); + BLAS_OP3F(A, dgemm3D, + (order, transA, transB, M, N, K, alpha, A, offA, lda, strideA, + B, offB, ldb, strideB, beta, C, offC, ldc, strideC, batchCount)); } diff --git a/src/private.h b/src/private.h index 555c178753..934f51f3de 100644 --- a/src/private.h +++ b/src/private.h @@ -214,23 
+214,23 @@ struct _gpuarray_blas_ops { gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags); - int (*hgemm3d)(cb_order order, cb_transpose transA, cb_transpose transB, + int (*hgemm3D)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, - gpudata *A, size_t lda, ssize_t strideA, - gpudata *B, size_t ldb, ssize_t strideB, - float beta, gpudata *C, size_t ldc, ssize_t strideC, + gpudata *A, size_t offA, size_t lda, ssize_t strideA, + gpudata *B, size_t offB, size_t ldb, ssize_t strideB, + float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, size_t batchCount); - int (*sgemm3d)(cb_order order, cb_transpose transA, cb_transpose transB, + int (*sgemm3D)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, - gpudata *A, size_t lda, ssize_t strideA, - gpudata *B, size_t ldb, ssize_t strideB, - float beta, gpudata *C, size_t ldc, ssize_t strideC, + gpudata *A, size_t offA, size_t lda, ssize_t strideA, + gpudata *B, size_t offB, size_t ldb, ssize_t strideB, + float beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, size_t batchCount); - int (*dgemm3d)(cb_order order, cb_transpose transA, cb_transpose transB, + int (*dgemm3D)(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, - gpudata *A, size_t lda, ssize_t strideA, - gpudata *B, size_t ldb, ssize_t strideB, - double beta, gpudata *C, size_t ldc, ssize_t strideC, + gpudata *A, size_t offA, size_t lda, ssize_t strideA, + gpudata *B, size_t offB, size_t ldb, ssize_t strideB, + double beta, gpudata *C, size_t offC, size_t ldc, ssize_t strideC, size_t batchCount); }; From a7ade4a3bc53cbbdcdbe1d2361b176d2a3ed8d0b Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 7 Jun 2017 15:38:04 -0400 Subject: [PATCH 362/597] Use the type provided by gpuarray/types.h --- src/gpuarray/util.h | 6 ++---- 1 file 
changed, 2 insertions(+), 4 deletions(-) diff --git a/src/gpuarray/util.h b/src/gpuarray/util.h index 988aee9083..23f6c3f432 100644 --- a/src/gpuarray/util.h +++ b/src/gpuarray/util.h @@ -99,18 +99,16 @@ GPUARRAY_PUBLIC void gpuarray_elemwise_collapse(unsigned int n, size_t *dim, ssize_t **strs); -typedef uint16_t ga_half_t; - /* code strongly inspired from https://github.com/numpy/numpy/blob/master/numpy/core/src/npymath/halffloat.c#L246 */ -static inline ga_half_t ga_float2half(float f) { +static inline half_t ga_float2half(float f) { union { float f; uint32_t bits; } bf; union { - ga_half_t h; + half_t h; uint16_t bits; } bh; From 4f396cdfb6cd16e72945cd97a28715fd9dd78ca9 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 7 Jun 2017 16:38:14 -0400 Subject: [PATCH 363/597] Add fallback to the old code when the new functions don't work. --- src/gpuarray_array_blas.c | 56 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index 01d96a1598..3eb1d0bd38 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -649,6 +649,62 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph break; } + if (err == GA_DEVSUP_ERROR) { + gpudata **A_datas = NULL, **B_datas = NULL, **C_datas = NULL; + size_t *A_offsets = NULL, *B_offsets = NULL, *C_offsets = NULL; + size_t i; + + A_datas = (gpudata**)malloc(batchCount * sizeof(gpudata*)); + B_datas = (gpudata**)malloc(batchCount * sizeof(gpudata*)); + C_datas = (gpudata**)malloc(batchCount * sizeof(gpudata*)); + + A_offsets = (size_t*)malloc(batchCount * sizeof(size_t)); + B_offsets = (size_t*)malloc(batchCount * sizeof(size_t)); + C_offsets = (size_t*)malloc(batchCount * sizeof(size_t)); + + if (A_datas == NULL || B_datas == NULL || C_datas == NULL || + A_offsets == NULL || B_offsets == NULL || C_offsets) { + err = error_sys(ctx->err, "malloc"); + goto old_cleanup; + } + + for (i = 0; i < 
batchCount; i++) { + A_datas[i] = Ap->data; + B_datas[i] = Bp->data; + C_datas[i] = Cp->data; + A_offsets[i] = (Ap->offset + i * Ap->strides[0]) / elsize; + B_offsets[i] = (Bp->offset + i * Bp->strides[0]) / elsize; + C_offsets[i] = (Cp->offset + i * Cp->strides[0]) / elsize; + } + + switch (C->typecode) { + case GA_HALF: + err = gpublas_hgemmBatch(o, transA, transB, m, n, k, (float)alpha, + A_datas, A_offsets, lda, + B_datas, B_offsets, ldb, + (float)beta, + C_datas, C_offsets, ldc, batchCount, 0); + break; + case GA_FLOAT: + err = gpublas_sgemmBatch(o, transA, transB, m, n, k, (float)alpha, + A_datas, A_offsets, lda, + B_datas, B_offsets, ldb, + (float)beta, + C_datas, C_offsets, ldc, batchCount, 0); + break; + case GA_DOUBLE: + err = gpublas_dgemmBatch(o, transA, transB, m, n, k, (double)alpha, + A_datas, A_offsets, lda, + B_datas, B_offsets, ldb, + (double)beta, + C_datas, C_offsets, ldc, batchCount, 0); + break; + } + old_cleanup: + free(A_datas); free(B_datas); free(C_datas); + free(A_offsets); free(B_offsets); free(C_offsets); + } + cleanup: if (Ap == &copyA) GpuArray_clear(&copyA); From 7c14e972db685571e358dbcc6d65e92766778199 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 8 Jun 2017 08:59:26 -0400 Subject: [PATCH 364/597] Fix definition of half. 
--- src/gpuarray/util.h | 6 ++++-- tests/check_util.c | 16 ++++++++-------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/gpuarray/util.h b/src/gpuarray/util.h index 23f6c3f432..a8a58ca4d3 100644 --- a/src/gpuarray/util.h +++ b/src/gpuarray/util.h @@ -99,16 +99,18 @@ GPUARRAY_PUBLIC void gpuarray_elemwise_collapse(unsigned int n, size_t *dim, ssize_t **strs); +typedef struct _ga_half_t { uint16_t h; } ga_half_t; + /* code strongly inspired from https://github.com/numpy/numpy/blob/master/numpy/core/src/npymath/halffloat.c#L246 */ -static inline half_t ga_float2half(float f) { +static inline ga_half_t ga_float2half(float f) { union { float f; uint32_t bits; } bf; union { - half_t h; + ga_half_t h; uint16_t bits; } bh; diff --git a/tests/check_util.c b/tests/check_util.c index b534421308..bcdde668a5 100644 --- a/tests/check_util.c +++ b/tests/check_util.c @@ -129,20 +129,20 @@ START_TEST(test_float2half) { 65520.f, }; const ga_half_t h[] = { - 0x0001u, /* 2e-24 */ - 0x0000u, /* 0 */ - 0x0000u, /* 0 */ - 0x3c01u, /* 1.0 + 2e-10 */ - 0x3c00u, /* 1.0 */ - 0x7bffu, /* 65504 */ - 0x7c00u, /* Inf */ + {0x0001u}, /* 2e-24 */ + {0x0000u}, /* 0 */ + {0x0000u}, /* 0 */ + {0x3c01u}, /* 1.0 + 2e-10 */ + {0x3c00u}, /* 1.0 */ + {0x7bffu}, /* 65504 */ + {0x7c00u}, /* Inf */ }; unsigned int i; ga_half_t hr; for (i = 0; i < sizeof(f)/sizeof(f[0]); i++) { hr = ga_float2half(f[i]); - ck_assert_int_eq(hr, h[i]); + ck_assert_int_eq(hr.h, h[i].h); } } END_TEST From 71a883c065a42e26c7ac47db638031a6548d9d31 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 8 Jun 2017 15:34:34 -0400 Subject: [PATCH 365/597] Fix the test for recent enough check. 
--- tests/CMakeLists.txt | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index aee8d7200c..74cf17f7b2 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,4 +1,4 @@ -include(CheckSymbolExists) +include(CheckCSourceCompiles) find_package(PkgConfig) pkg_search_module(CHECK check) @@ -17,8 +17,15 @@ if(NOT CHECK_FOUND) endif() if(CHECK_FOUND) - set(CMAKE_REQUIRED_INCLUDE ${CHECK_INCLUDE_DIRS}) - CHECK_SYMBOL_EXISTS(ck_assert_ptr_ne "check.h" CHECK_FUNCS) + set(CMAKE_REQUIRED_FLAGS ${CHECK_C_FLAGS} ${CHECK_LDFLAGS_OTHERS}) + set(CMAKE_REQUIRED_INCLUDES ${CHECK_INCLUDE_DIRS}) + set(CMAKE_REQUIRED_LIBRARIES ${CHECK_LIBRARIES}) + CHECK_C_SOURCE_COMPILES( + "#include <check.h> + int main() { + ck_assert_ptr_ne(NULL, NULL); + }" + CHECK_FUNCS) if (NOT CHECK_FUNCS) set(CHECK_FOUND 0) endif() From 9fa1bc277d36b5cca81081d56ce8650dbf2bbd4d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 8 Jun 2017 15:35:00 -0400 Subject: [PATCH 366/597] Add a test for a reshape of 0-sized object. 
--- tests/check_array.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/check_array.c b/tests/check_array.c index 59de293521..f309f9b50b 100644 --- a/tests/check_array.c +++ b/tests/check_array.c @@ -270,6 +270,20 @@ START_TEST(test_take1_offset) { } END_TEST +START_TEST(test_reshape_0) { + /* This tests that we don't segfault when reshaping 0-sized arrays */ + const size_t odims[3] = {24, 0, 33}; + const size_t ndims1[3] = {0, 24, 33}; + const size_t ndims2[3] = {24, 33, 0}; + + GpuArray v; + ga_assert_ok(GpuArray_empty(&v, ctx, GA_FLOAT, 3, odims, GA_C_ORDER)); + ga_assert_ok(GpuArray_reshape_inplace(&v, 3, ndims1, GA_ANY_ORDER)); + ga_assert_ok(GpuArray_reshape_inplace(&v, 3, odims, GA_ANY_ORDER)); + ga_assert_ok(GpuArray_reshape_inplace(&v, 3, ndims2, GA_ANY_ORDER)); +} +END_TEST + Suite *get_suite(void) { Suite *s = suite_create("array"); TCase *tc = tcase_create("take1"); @@ -277,6 +291,7 @@ Suite *get_suite(void) { tcase_set_timeout(tc, 8.0); tcase_add_test(tc, test_take1_ok); tcase_add_test(tc, test_take1_offset); + tcase_add_test(tc, test_reshape_0); suite_add_tcase(s, tc); return s; } From c44195195a278011f99be916a36cec5aa30c478c Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 8 Jun 2017 15:36:24 -0400 Subject: [PATCH 367/597] Fix crash on reshape of 0-size. 
--- src/gpuarray_array.c | 58 +++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index 8deff26249..fccb6c569d 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -696,41 +696,43 @@ int GpuArray_reshape_inplace(GpuArray *a, unsigned int nd, if (newstrides == NULL) return error_sys(ctx->err, "calloc"); - while (ni < nd && oi < a->nd) { - np = newdims[ni]; - op = a->dimensions[oi]; + if (newsize != 0) { + while (ni < nd && oi < a->nd) { + np = newdims[ni]; + op = a->dimensions[oi]; + + while (np != op) { + if (np < op) { + np *= newdims[nj++]; + } else { + op *= a->dimensions[oj++]; + } + } - while (np != op) { - if (np < op) { - np *= newdims[nj++]; - } else { - op *= a->dimensions[oj++]; + for (ok = oi; ok < oj - 1; ok++) { + if (ord == GA_F_ORDER) { + if (a->strides[ok+1] != (ssize_t)a->dimensions[ok]*a->strides[ok]) + goto need_copy; + } else { + if (a->strides[ok] != (ssize_t)a->dimensions[ok+1]*a->strides[ok+1]) + goto need_copy; + } } - } - for (ok = oi; ok < oj - 1; ok++) { if (ord == GA_F_ORDER) { - if (a->strides[ok+1] != (ssize_t)a->dimensions[ok]*a->strides[ok]) - goto need_copy; + newstrides[ni] = a->strides[oi]; + for (nk = ni + 1; nk < nj; nk++) { + newstrides[nk] = newstrides[nk - 1]*newdims[nk - 1]; + } } else { - if (a->strides[ok] != (ssize_t)a->dimensions[ok+1]*a->strides[ok+1]) - goto need_copy; - } - } - - if (ord == GA_F_ORDER) { - newstrides[ni] = a->strides[oi]; - for (nk = ni + 1; nk < nj; nk++) { - newstrides[nk] = newstrides[nk - 1]*newdims[nk - 1]; - } - } else { - newstrides[nj-1] = a->strides[oj-1]; - for (nk = nj-1; nk > ni; nk--) { - newstrides[nk-1] = newstrides[nk]*newdims[nk]; + newstrides[nj-1] = a->strides[oj-1]; + for (nk = nj-1; nk > ni; nk--) { + newstrides[nk-1] = newstrides[nk]*newdims[nk]; + } } + ni = nj++; + oi = oj++; } - ni = nj++; - oi = oj++; } /* Fixup trailing ones */ From 
06d4b02630ae23a4eaf44c1b2bdbcc02b5fd33fb Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 8 Jun 2017 19:10:45 -0400 Subject: [PATCH 368/597] Restore the IndexError in pygpu_index. --- pygpu/gpuarray.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 72fa6d4fb8..6de211dd72 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -1412,7 +1412,10 @@ cdef GpuArray pygpu_index(GpuArray a, const ssize_t *starts, const ssize_t *stops, const ssize_t *steps): cdef GpuArray res res = new_GpuArray(type(a), a.context, a.base) - array_index(res, a, starts, stops, steps) + try: + array_index(res, a, starts, stops, steps) + except ValueError, e: + raise IndexError, "index out of bounds" return res cdef GpuArray pygpu_reshape(GpuArray a, unsigned int nd, const size_t *newdims, From 839f354435e82d7455adb061c59476d4c824e63c Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 12 Jun 2017 13:41:05 -0400 Subject: [PATCH 369/597] Change kernel_alloc to return an error code. 
--- src/gpuarray_buffer.c | 7 +-- src/gpuarray_buffer_cuda.c | 64 +++++++++++--------------- src/gpuarray_buffer_opencl.c | 88 +++++++++++++++--------------------- src/private.h | 8 ++-- 4 files changed, 71 insertions(+), 96 deletions(-) diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index 52af2ddf72..3f143d1da9 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -163,9 +163,10 @@ gpukernel *gpukernel_init(gpucontext *ctx, unsigned int count, const int *typecodes, int flags, int *ret, char **err_str) { gpukernel *res; - res = ctx->ops->kernel_alloc(ctx, count, strings, lengths, fname, numargs, - typecodes, flags, err_str); - if (res == NULL && ret) + int err; + err = ctx->ops->kernel_alloc(&res, ctx, count, strings, lengths, fname, + numargs, typecodes, flags, err_str); + if (err != GA_NO_ERROR && ret != NULL) *ret = ctx->err->code; return res; } diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 06307f91b6..6cbf1a0da8 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1270,10 +1270,10 @@ static void _cuda_freekernel(gpukernel *k) { } } -static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, - const char **strings, const size_t *lengths, - const char *fname, unsigned int argcount, - const int *types, int flags, char **err_str) { +static int cuda_newkernel(gpukernel **k, gpucontext *c, unsigned int count, + const char **strings, const size_t *lengths, + const char *fname, unsigned int argcount, + const int *types, int flags, char **err_str) { cuda_context *ctx = (cuda_context *)c; strb src = STRB_STATIC_INIT; strb bin = STRB_STATIC_INIT; @@ -1285,32 +1285,25 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, unsigned int i; int major, minor; - if (count == 0) { - error_set(ctx->err, GA_VALUE_ERROR, "String count is 0"); - return NULL; - } + if (count == 0) + return error_set(ctx->err, GA_VALUE_ERROR, "String count is 0"); - if (flags & GA_USE_OPENCL) { - 
error_set(ctx->err, GA_DEVSUP_ERROR, "OpenCL kernel not supported on cuda devices"); - return NULL; - } + if (flags & GA_USE_OPENCL) + return error_set(ctx->err, GA_DEVSUP_ERROR, "OpenCL kernel not supported on cuda devices"); - if (flags & GA_USE_BINARY) { - error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Binary mode not supported any more"); - return NULL; - } + if (flags & GA_USE_BINARY) + return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Binary mode not supported any more"); cuda_enter(ctx); err = cuCtxGetDevice(&dev); if (err != CUDA_SUCCESS) { cuda_exit(ctx); - error_cuda(ctx->err, "cuCtxGetDevice", err); - return NULL; + return error_cuda(ctx->err, "cuCtxGetDevice", err); } if (get_cc(dev, &major, &minor, ctx->err) != GA_NO_ERROR) - return NULL; + return ctx->err->code; // GA_USE_CLUDA is done later // GA_USE_SMALL will always work @@ -1318,14 +1311,13 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, if (flags & GA_USE_DOUBLE) { if (major < 1 || (major == 1 && minor < 3)) { cuda_exit(ctx); - error_set(ctx->err, GA_DEVSUP_ERROR, "Requested double support and current device doesn't support them"); - return NULL; + return error_set(ctx->err, GA_DEVSUP_ERROR, "Requested double support and current device doesn't support them"); } } if (flags & GA_USE_COMPLEX) { // just for now since it is most likely broken cuda_exit(ctx); - error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Complex support is not there yet."); + return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Complex support is not there yet."); } if (flags & GA_USE_CLUDA) { @@ -1347,17 +1339,17 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, strb_append0(&src); if (strb_error(&src)) { - error_sys(ctx->err, "strb"); strb_clear(&src); cuda_exit(ctx); - return NULL; + return error_sys(ctx->err, "strb"); } res = (gpukernel *)cache_get(ctx->kernel_cache, &src); if (res != NULL) { res->refcnt++; strb_clear(&src); - return res; + *k = res; + return GA_NO_ERROR; } if (compile(ctx, 
&src, &bin, &log) != GA_NO_ERROR) { @@ -1375,25 +1367,23 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, strb_clear(&bin); strb_clear(&log); cuda_exit(ctx); - return NULL; + return ctx->err->code; } strb_clear(&log); if (strb_error(&bin)) { - error_sys(ctx->err, "strb"); strb_clear(&src); strb_clear(&bin); cuda_exit(ctx); - return NULL; + return error_sys(ctx->err, "strb"); } res = calloc(1, sizeof(*res)); if (res == NULL) { - error_sys(ctx->err, "calloc"); strb_clear(&src); strb_clear(&bin); cuda_exit(ctx); - return NULL; + return error_sys(ctx->err, "calloc"); } /* Don't clear bin after this */ @@ -1403,20 +1393,18 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, res->argcount = argcount; res->types = calloc(argcount, sizeof(int)); if (res->types == NULL) { - error_sys(ctx->err, "calloc"); _cuda_freekernel(res); strb_clear(&src); cuda_exit(ctx); - return NULL; + return error_sys(ctx->err, "calloc"); } memcpy(res->types, types, argcount*sizeof(int)); res->args = calloc(argcount, sizeof(void *)); if (res->args == NULL) { - error_sys(ctx->err, "calloc"); _cuda_freekernel(res); strb_clear(&src); cuda_exit(ctx); - return NULL; + return error_sys(ctx->err, "calloc"); } err = cuModuleLoadData(&res->m, bin.s); @@ -1425,16 +1413,15 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, _cuda_freekernel(res); strb_clear(&src); cuda_exit(ctx); - return NULL; + return error_cuda(ctx->err, "cuModuleLoadData", err); } err = cuModuleGetFunction(&res->k, res->m, fname); if (err != CUDA_SUCCESS) { - error_cuda(ctx->err, "cuModuleGetFunction", err); _cuda_freekernel(res); strb_clear(&src); cuda_exit(ctx); - return NULL; + return error_cuda(ctx->err, "cuModuleGetFunction", err); } res->ctx = ctx; @@ -1451,7 +1438,8 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, } else { strb_clear(&src); } - return res; + *k = res; + return GA_NO_ERROR; } static void cuda_retainkernel(gpukernel *k) { diff --git 
a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 8e97f50167..b49cf445aa 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -28,10 +28,10 @@ static int cl_property(gpucontext *c, gpudata *b, gpukernel *k, int p, void *r); static gpudata *cl_alloc(gpucontext *c, size_t size, void *data, int flags); static void cl_release(gpudata *b); static void cl_free_ctx(cl_ctx *ctx); -static gpukernel *cl_newkernel(gpucontext *ctx, unsigned int count, - const char **strings, const size_t *lengths, - const char *fname, unsigned int argcount, - const int *types, int flags, char **err_str); +static int cl_newkernel(gpukernel **k, gpucontext *ctx, unsigned int count, + const char **strings, const size_t *lengths, + const char *fname, unsigned int argcount, + const int *types, int flags, char **err_str); static const char CL_CONTEXT_PREAMBLE[] = "#define GA_WARP_SIZE %lu\n"; // to be filled by cl_make_ctx() @@ -187,8 +187,7 @@ cl_ctx *cl_make_ctx(cl_context ctx, int flags) { rlk[0] = dummy_kern; len = sizeof(dummy_kern); // this dummy kernel does not require a CLUDA preamble - m = cl_newkernel((gpucontext *)res, 1, rlk, &len, "kdummy", 0, NULL, 0, NULL); - if (m == NULL) + if (cl_newkernel(&m, (gpucontext *)res, 1, rlk, &len, "kdummy", 0, NULL, 0, NULL) != GA_NO_ERROR) goto fail; ret = cl_property((gpucontext *)res, NULL, m, GA_KERNEL_PROP_PREFLSIZE, &warp_size); if (ret != GA_NO_ERROR) @@ -770,8 +769,9 @@ static int cl_memset(gpudata *dst, size_t offset, int data) { rlk[0] = local_kern; type = GA_BUFFER; - m = cl_newkernel((gpucontext *)ctx, 1, rlk, &sz, "kmemset", 1, &type, 0, NULL); - if (m == NULL) return ctx->err->code; + r = cl_newkernel(&m, (gpucontext *)ctx, 1, rlk, &sz, "kmemset", 1, &type, 0, NULL); + if (r != GA_NO_ERROR) + return r; /* Cheap kernel scheduling */ res = cl_property(NULL, NULL, m, GA_KERNEL_PROP_MAXLSIZE, &ls); @@ -814,10 +814,10 @@ static int cl_check_extensions(const char **preamble, unsigned int *count, 
return GA_NO_ERROR; } -static gpukernel *cl_newkernel(gpucontext *c, unsigned int count, - const char **strings, const size_t *lengths, - const char *fname, unsigned int argcount, - const int *types, int flags, char **err_str) { +static int cl_newkernel(gpukernel **k, gpucontext *c, unsigned int count, + const char **strings, const size_t *lengths, + const char *fname, unsigned int argcount, + const int *types, int flags, char **err_str) { cl_ctx *ctx = (cl_ctx *)c; gpukernel *res; cl_device_id dev; @@ -834,41 +834,33 @@ static gpukernel *cl_newkernel(gpucontext *c, unsigned int count, ASSERT_CTX(ctx); - if (count == 0) { - error_set(ctx->err, GA_VALUE_ERROR, "Empty kernel source list"); - return NULL; - } + if (count == 0) + return error_set(ctx->err, GA_VALUE_ERROR, "Empty kernel source list"); dev = get_dev(ctx->ctx, ctx->err); - if (dev == NULL) return NULL; + if (dev == NULL) return ctx->err->code; if (flags & GA_USE_BINARY) { // GA_USE_BINARY is exclusive - if (flags & ~GA_USE_BINARY) { - error_set(ctx->err, GA_INVALID_ERROR, "Cannot combine GA_USE_BINARY with any other flag"); - return NULL; - } + if (flags & ~GA_USE_BINARY) + return error_set(ctx->err, GA_INVALID_ERROR, "Cannot combine GA_USE_BINARY with any other flag"); + // We need the length for binary data and there is only one blob. 
- if (count != 1 || lengths == NULL || lengths[0] == 0) { - error_set(ctx->err, GA_VALUE_ERROR, "GA_USE_BINARY requires the length to be specified"); - return NULL; - } + if (count != 1 || lengths == NULL || lengths[0] == 0) + return error_set(ctx->err, GA_VALUE_ERROR, "GA_USE_BINARY requires the length to be specified"); + p = clCreateProgramWithBinary(ctx->ctx, 1, &dev, lengths, (const unsigned char **)strings, NULL, &err); - if (err != CL_SUCCESS) { - error_cl(ctx->err, "clCreateProgramWithBinary", err); - return NULL; - } + if (err != CL_SUCCESS) + return error_cl(ctx->err, "clCreateProgramWithBinary", err); } else { if (cl_check_extensions(preamble, &n, flags, ctx)) - return NULL; + return ctx->err->code; if (n != 0) { news = calloc(count+n, sizeof(const char *)); - if (news == NULL) { - error_sys(ctx->err, "calloc"); - return NULL; - } + if (news == NULL) + return error_sys(ctx->err, "calloc"); memcpy(news, preamble, n*sizeof(const char *)); memcpy(news+n, strings, count*sizeof(const char *)); if (lengths == NULL) { @@ -877,8 +869,7 @@ static gpukernel *cl_newkernel(gpucontext *c, unsigned int count, newl = calloc(count+n, sizeof(size_t)); if (newl == NULL) { free(news); - error_sys(ctx->err, "calloc"); - return NULL; + return error_sys(ctx->err, "calloc"); } memcpy(newl+n, lengths, count*sizeof(size_t)); } @@ -893,8 +884,7 @@ static gpukernel *cl_newkernel(gpucontext *c, unsigned int count, free(news); free(newl); } - error_cl(ctx->err, "clCreateProgramWithSource", err); - return NULL; + return error_cl(ctx->err, "clCreateProgramWithSource", err); } } @@ -936,8 +926,7 @@ static gpukernel *cl_newkernel(gpucontext *c, unsigned int count, free(news); free(newl); } - error_cl(ctx->err, "clBuildProgram", err); - return NULL; + return error_cl(ctx->err, "clBuildProgram", err); } if (n != 0) { @@ -946,10 +935,9 @@ static gpukernel *cl_newkernel(gpucontext *c, unsigned int count, } res = malloc(sizeof(*res)); - if (res == NULL) { - error_sys(ctx->err, "malloc"); - 
return NULL; - } + if (res == NULL) + return error_sys(ctx->err, "malloc"); + res->refcnt = 1; res->ev = NULL; res->argcount = argcount; @@ -962,25 +950,23 @@ static gpukernel *cl_newkernel(gpucontext *c, unsigned int count, TAG_KER(res); if (err != CL_SUCCESS) { cl_releasekernel(res); - error_cl(ctx->err, "clCreateKernel", err); - return NULL; + return error_cl(ctx->err, "clCreateKernel", err); } res->types = calloc(argcount, sizeof(int)); if (res->types == NULL) { cl_releasekernel(res); - error_sys(ctx->err, "calloc"); - return NULL; + return error_sys(ctx->err, "calloc"); } memcpy(res->types, types, argcount * sizeof(int)); res->evr = calloc(argcount, sizeof(cl_event *)); if (res->evr == NULL) { cl_releasekernel(res); - error_sys(ctx->err, "calloc"); - return NULL; + return error_sys(ctx->err, "calloc"); } - return res; + *k = res; + return GA_NO_ERROR; } static void cl_retainkernel(gpukernel *k) { diff --git a/src/private.h b/src/private.h index 4286d3fd21..9b74dad203 100644 --- a/src/private.h +++ b/src/private.h @@ -88,10 +88,10 @@ struct _gpuarray_buffer_ops { int (*buffer_read)(void *dst, gpudata *src, size_t srcoff, size_t sz); int (*buffer_write)(gpudata *dst, size_t dstoff, const void *src, size_t sz); int (*buffer_memset)(gpudata *dst, size_t dstoff, int data); - gpukernel *(*kernel_alloc)(gpucontext *ctx, unsigned int count, - const char **strings, const size_t *lengths, - const char *fname, unsigned int numargs, - const int *typecodes, int flags, char **err_str); + int (*kernel_alloc)(gpukernel **k, gpucontext *ctx, unsigned int count, + const char **strings, const size_t *lengths, + const char *fname, unsigned int numargs, + const int *typecodes, int flags, char **err_str); void (*kernel_retain)(gpukernel *k); void (*kernel_release)(gpukernel *k); int (*kernel_setarg)(gpukernel *k, unsigned int i, void *a); From 8041c47eb39fac07891686e688a000709580a9c8 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 12 Jun 2017 16:04:47 -0400 Subject: 
[PATCH 370/597] Remove unused check_contig. --- pygpu/tests/test_tools.py | 20 +------------------- pygpu/tools.py | 23 ----------------------- 2 files changed, 1 insertion(+), 42 deletions(-) diff --git a/pygpu/tests/test_tools.py b/pygpu/tests/test_tools.py index fafc835fcf..72a22387e6 100644 --- a/pygpu/tests/test_tools.py +++ b/pygpu/tests/test_tools.py @@ -1,29 +1,11 @@ from pygpu.tools import (as_argument, Argument, ArrayArg, ScalarArg, - check_contig, check_args, Counter, lfu_cache) + check_args, Counter, lfu_cache) from .support import (guard_devsup, rand, check_flags, check_meta, check_all, context, gen_gpuarray, dtypes_no_complex) -def test_check_contig_1(): - ac, ag = gen_gpuarray((50, 1, 20), 'float32', ctx=context) - bc, bg = gen_gpuarray((50, 1, 20), 'float32', ctx=context) - n, offsets, contig = check_contig((ag, bg)) - assert n == 1000 - assert offsets == (0, 0) - assert contig - - -def test_check_contig_2(): - ac, ag = gen_gpuarray((50, 1, 20), 'float32', ctx=context) - bc, bg = gen_gpuarray((50, 1, 20), 'float32', ctx=context, sliced=2) - n, offsets, contig = check_contig((ag, bg)) - assert n == None - assert offsets == None - assert not contig - - def test_check_args_simple(): ac, ag = gen_gpuarray((50,), 'float32', ctx=context) bc, bg = gen_gpuarray((50,), 'float32', ctx=context) diff --git a/pygpu/tools.py b/pygpu/tools.py index 8e22a20c64..a9b24a8465 100644 --- a/pygpu/tools.py +++ b/pygpu/tools.py @@ -65,29 +65,6 @@ def spec(self): return self.dtype -def check_contig(args): - dims = None - c_contig = f_contig = True - offsets = [] - for arg in args: - if not isinstance(arg, GpuArray): - offsets.append(None) - continue - - if dims is None: - dims = arg.shape - n = arg.size - elif arg.shape != dims: - return None, None, False - offsets.append(arg.offset) - fl = arg.flags - c_contig = c_contig and fl['C_CONTIGUOUS'] - f_contig = f_contig and fl['F_CONTIGUOUS'] - if not (c_contig or f_contig): - return None, None, False - return n, 
tuple(offsets), True - - def check_args(args, collapse=False, broadcast=False): """ Returns the properties of arguments and checks if they all match From 6d38807c2d909202be3a425a973ab8aca49660a9 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 12 Jun 2017 16:07:33 -0400 Subject: [PATCH 371/597] flake8 --- pygpu/tests/test_tools.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pygpu/tests/test_tools.py b/pygpu/tests/test_tools.py index 72a22387e6..640b5685a6 100644 --- a/pygpu/tests/test_tools.py +++ b/pygpu/tests/test_tools.py @@ -1,9 +1,6 @@ -from pygpu.tools import (as_argument, Argument, ArrayArg, ScalarArg, - check_args, Counter, lfu_cache) +from pygpu.tools import check_args - -from .support import (guard_devsup, rand, check_flags, check_meta, check_all, - context, gen_gpuarray, dtypes_no_complex) +from .support import context, gen_gpuarray def test_check_args_simple(): @@ -102,7 +99,7 @@ def test_check_args_broadcast_2(): offseted_inner=True) bc, bg = gen_gpuarray((50, 1, 20), 'float32', ctx=context) n, nd, dims, strs, offsets = check_args((ag, bg), collapse=True, - broadcast=True) + broadcast=True) assert n == 1000 assert nd == 2 assert dims == (50, 20) From df1685938f8bedf6bc29cc5d7609f89450c07585 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 12 Jun 2017 16:08:28 -0400 Subject: [PATCH 372/597] Remove lfu_cache, it's unused. 
--- pygpu/tools.py | 52 -------------------------------------------------- 1 file changed, 52 deletions(-) diff --git a/pygpu/tools.py b/pygpu/tools.py index a9b24a8465..b0c43cb028 100644 --- a/pygpu/tools.py +++ b/pygpu/tools.py @@ -168,58 +168,6 @@ def check_args(args, collapse=False, broadcast=False): return n, nd, dims, tuple(strs), tuple(offsets) -class Counter(dict): - 'Mapping where default values are zero' - def __missing__(self, key): - return 0 - - -def lfu_cache(maxsize=20): - def decorating_function(user_function): - cache = {} - use_count = Counter() - - @functools.wraps(user_function) - def wrapper(*key): - use_count[key] += 1 - - try: - result = cache[key] - wrapper.hits += 1 - except KeyError: - result = user_function(*key) - cache[key] = result - wrapper.misses += 1 - - # purge least frequently used cache entry - if len(cache) > wrapper.maxsize: - for key, _ in nsmallest(wrapper.maxsize // 10, - six.iteritems(use_count), - key=itemgetter(1)): - del cache[key], use_count[key] - - return result - - def clear(): - cache.clear() - use_count.clear() - wrapper.hits = wrapper.misses = 0 - - @functools.wraps(user_function) - def get(*key): - result = cache[key] - use_count[key] += 1 - wrapper.hits += 1 - return result - - wrapper.hits = wrapper.misses = 0 - wrapper.maxsize = maxsize - wrapper.clear = clear - wrapper.get = get - return wrapper - return decorating_function - - def lru_cache(maxsize=20): def decorating_function(user_function): cache = {} From 5b54f6d753805f1ca3f6a1607a49ac6c47da62c5 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 12 Jun 2017 16:31:17 -0400 Subject: [PATCH 373/597] Flake8 --- pygpu/dtypes.py | 14 +++--- pygpu/operations.py | 5 ++- pygpu/reduction.py | 7 ++- pygpu/tests/main.py | 37 ++++------------ pygpu/tests/support.py | 8 +++- pygpu/tests/test_blas.py | 76 ++++++++++++++++++--------------- pygpu/tests/test_elemwise.py | 21 ++++++--- pygpu/tests/test_gpu_ndarray.py | 44 ++++++++++--------- 
pygpu/tests/test_reduction.py | 5 ++- 9 files changed, 115 insertions(+), 102 deletions(-) diff --git a/pygpu/dtypes.py b/pygpu/dtypes.py index 7498a3a9c7..cc4d6b3402 100644 --- a/pygpu/dtypes.py +++ b/pygpu/dtypes.py @@ -71,18 +71,20 @@ def register_dtype(dtype, c_names): def _fill_dtype_registry(): - from sys import platform - register_dtype(np.bool, ["ga_bool", "bool"]) register_dtype(np.int8, ["ga_byte", "char", "signed char"]) register_dtype(np.uint8, ["ga_ubyte", "unsigned char"]) - register_dtype(np.int16, ["ga_short", "short", "signed short", "signed short int", "short signed int"]) - register_dtype(np.uint16, ["ga_ushort", "unsigned short", "unsigned short int", "short unsigned int"]) + register_dtype(np.int16, ["ga_short", "short", "signed short", + "signed short int", "short signed int"]) + register_dtype(np.uint16, ["ga_ushort", "unsigned short", + "unsigned short int", "short unsigned int"]) register_dtype(np.int32, ["ga_int", "int", "signed int"]) register_dtype(np.uint32, ["ga_uint", "unsigned", "unsigned int"]) - register_dtype(np.int64, ["ga_long", "long int", "signed long int", "long signed int"]) - register_dtype(np.uint64, ["ga_ulong", "unsigned long", "unsigned long int", "long unsigned int"]) + register_dtype(np.int64, ["ga_long", "long int", "signed long int", + "long signed int"]) + register_dtype(np.uint64, ["ga_ulong", "unsigned long", + "unsigned long int", "long unsigned int"]) register_dtype(np.intp, ["ga_ssize", "ssize_t"]) register_dtype(np.uintp, ["ga_size", "size_t"]) diff --git a/pygpu/operations.py b/pygpu/operations.py index bdb476b506..4908bb7bae 100644 --- a/pygpu/operations.py +++ b/pygpu/operations.py @@ -2,7 +2,7 @@ from .gpuarray import _split, _concatenate, dtype_to_typecode from .dtypes import upcast -from . import array, asarray +from . 
import asarray def atleast_1d(*arys): @@ -82,7 +82,8 @@ def array_split(ary, indices_or_sections, axis=0): # this madness is to support the numpy interface # it is supported by tests, but little else divs = (list(range(neach + 1, (neach + 1) * extra + 1, neach + 1)) + - list(range((neach + 1) * extra + neach, ary.shape[axis], neach))) + list(range((neach + 1) * extra + neach, + ary.shape[axis], neach))) res = _split(ary, divs, axis) return res diff --git a/pygpu/reduction.py b/pygpu/reduction.py index 8053270e1e..7edd5417c9 100644 --- a/pygpu/reduction.py +++ b/pygpu/reduction.py @@ -29,6 +29,7 @@ def _ceil_log2(x): else: return 0 + basic_kernel = Template(""" ${preamble} @@ -141,14 +142,16 @@ def __init__(self, context, dtype_out, neutral, reduce_expr, redux, if isinstance(arguments, str): self.arguments = parse_c_args(arguments) elif arguments is None: - self.arguments = [ArrayArg(numpy.dtype(self.dtype_out), '_reduce_input')] + self.arguments = [ArrayArg(numpy.dtype(self.dtype_out), + '_reduce_input')] else: self.arguments = arguments if (self.dtype_out == numpy.dtype('float16') or any(ar.dtype == numpy.dtype('float16') for ar in self.arguments)): - raise NotImplementedError('float16 not supported for the reduction interface') + raise NotImplementedError('float16 not supported for the ' + 'reduction interface') self.reduce_expr = reduce_expr if map_expr is None: diff --git a/pygpu/tests/main.py b/pygpu/tests/main.py index ef994e2fac..57499a5c8c 100644 --- a/pygpu/tests/main.py +++ b/pygpu/tests/main.py @@ -1,5 +1,4 @@ import os -import nose.plugins.builtin from nose.config import Config from nose.plugins.manager import PluginManager @@ -29,7 +28,7 @@ def _test_argv(self, verbose, extra_argv): List with any extra arguments to pass to nosetests. 
""" - #self.package_path = os.path.abspath(self.package_path) + # self.package_path = os.path.abspath(self.package_path) argv = [__file__, self.package_path] argv += ['--verbosity', str(verbose)] if extra_argv: @@ -37,17 +36,15 @@ def _test_argv(self, verbose, extra_argv): return argv def _show_system_info(self): - nose = import_nose() - import pygpu - #print ("pygpu version %s" % pygpu.__version__) + # print ("pygpu version %s" % pygpu.__version__) pygpu_dir = os.path.dirname(pygpu.__file__) - print ("pygpu is installed in %s" % pygpu_dir) + print("pygpu is installed in %s" % pygpu_dir) super(NoseTester, self)._show_system_info() def prepare_test_args(self, verbose=1, extra_argv=None, coverage=False, - capture=True, knownfailure=True): + capture=True, knownfailure=True): """ Prepare arguments for the `test` method. @@ -61,8 +58,9 @@ def prepare_test_args(self, verbose=1, extra_argv=None, coverage=False, # numpy way of doing coverage if coverage: - argv += ['--cover-package=%s' % self.package_name, '--with-coverage', - '--cover-tests', '--cover-inclusive', '--cover-erase'] + argv += ['--cover-package=%s' % self.package_name, + '--with-coverage', '--cover-tests', '--cover-inclusive', + '--cover-erase'] # Capture output only if needed if not capture: @@ -77,7 +75,7 @@ def prepare_test_args(self, verbose=1, extra_argv=None, coverage=False, return argv, plugins def test(self, verbose=1, extra_argv=None, coverage=False, capture=True, - knownfailure=True): + knownfailure=True): """ Run tests for module using nose. @@ -122,7 +120,7 @@ def test(self, verbose=1, extra_argv=None, coverage=False, capture=True, "launch pygpu.test().")) argv, plugins = self.prepare_test_args(verbose, extra_argv, coverage, - capture, knownfailure) + capture, knownfailure) # The "plugins" keyword of NumpyTestProgram gets ignored if config is # specified. 
Moreover, using "addplugins" instead can lead to strange @@ -130,20 +128,3 @@ def test(self, verbose=1, extra_argv=None, coverage=False, capture=True, cfg = Config(includeExe=True, plugins=PluginManager(plugins=plugins)) t = NumpyTestProgram(argv=argv, exit=False, config=cfg) return t.result - - -def main(modulename): - debug = False - - if 0: - unittest.main() - elif len(sys.argv)==2 and sys.argv[1]=="--debug": - module = __import__(modulename) - tests = unittest.TestLoader().loadTestsFromModule(module) - tests.debug() - elif len(sys.argv)==1: - module = __import__(modulename) - tests = unittest.TestLoader().loadTestsFromModule(module) - unittest.TextTestRunner(verbosity=2).run(tests) - else: - print ("options: [--debug]") diff --git a/pygpu/tests/support.py b/pygpu/tests/support.py index 2b3eb61f23..3992b2572b 100644 --- a/pygpu/tests/support.py +++ b/pygpu/tests/support.py @@ -1,6 +1,7 @@ from __future__ import print_function -import os, sys +import os +import sys import numpy from nose.plugins.skip import SkipTest @@ -22,11 +23,14 @@ dtypes_no_complex_big = ["float32", "float64", "int16", "uint16", "int32", "int64", "uint32", "uint64"] + def get_env_dev(): for name in ['GPUARRAY_TEST_DEVICE', 'DEVICE']: if name in os.environ: return os.environ[name] - raise RuntimeError("No test device specified. Specify one using the DEVICE or GPUARRAY_TEST_DEVICE environment variables.") + raise RuntimeError( + "No test device specified. 
Specify one using the DEVICE " + "or GPUARRAY_TEST_DEVICE environment variables.") context = gpuarray.init(get_env_dev()) diff --git a/pygpu/tests/test_blas.py b/pygpu/tests/test_blas.py index b5f5279aa6..112c2a5ef6 100644 --- a/pygpu/tests/test_blas.py +++ b/pygpu/tests/test_blas.py @@ -15,14 +15,16 @@ import pygpu.blas as gblas + def test_dot(): bools = [True, False] for N, dtype, offseted_i, sliced in product( - [1, 256, 1337], ['float32', 'float64'], bools, bools): + [1, 256, 1337], ['float32', 'float64'], bools, bools): yield dot, N, dtype, offseted_i, sliced, True, False for overwrite, init_z in product(bools, bools): yield dot, 666, 'float32', False, False, overwrite, init_z + @guard_devsup def dot(N, dtype, offseted_i, sliced, overwrite, init_z): cX, gX = gen_gpuarray((N,), dtype, offseted_inner=offseted_i, @@ -30,10 +32,10 @@ def dot(N, dtype, offseted_i, sliced, overwrite, init_z): cY, gY = gen_gpuarray((N,), dtype, offseted_inner=offseted_i, sliced=sliced, ctx=context) if init_z: - _, gZ = gen_gpuarray((), dtype, offseted_inner=offseted_i, - sliced=sliced, ctx=context) + gZ = gen_gpuarray((), dtype, offseted_inner=offseted_i, + sliced=sliced, ctx=context)[1] else: - _, gZ = None, None + gZ = None if dtype == 'float32': cr = fblas.sdot(cX, cY) @@ -46,7 +48,7 @@ def dot(N, dtype, offseted_i, sliced, overwrite, init_z): def test_gemv(): bools = [False, True] for shape, order, trans, offseted_i, sliced in product( - [(100, 128), (128, 50)], 'fc', bools, bools, [1, 2, -1, -2]): + [(100, 128), (128, 50)], 'fc', bools, bools, [1, 2, -1, -2]): yield gemv, shape, 'float32', order, trans, \ offseted_i, sliced, True, False for overwrite, init_y in product(bools, bools): @@ -54,13 +56,14 @@ def test_gemv(): overwrite, init_y yield gemv, (32, 32), 'float64', 'f', False, False, 1, True, False for alpha, beta, overwrite in product( - [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools): + [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools): yield gemv, (32, 32), 'float32', 'f', False, False, 
1, \ overwrite, True, alpha, beta + @guard_devsup def gemv(shp, dtype, order, trans, offseted_i, sliced, - overwrite, init_y, alpha=1.0, beta=0.0): + overwrite, init_y, alpha=1.0, beta=0.0): cA, gA = gen_gpuarray(shp, dtype, order=order, offseted_inner=offseted_i, sliced=sliced, ctx=context) if trans: @@ -92,31 +95,31 @@ def test_gemm(): bools = [False, True] for (m, n, k), order, trans, offseted_o in product( [(48, 15, 32), (15, 32, 48)], list(product(*['fc']*3)), - list(product(bools, bools)), bools): + list(product(bools, bools)), bools): yield gemm, m, n, k, 'float32', order, trans, \ offseted_o, 1, False, False - for sliced, overwrite, init_res in product( - [1, 2, -1, -2], bools, bools): + for sliced, overwrite, init_res in product([1, 2, -1, -2], bools, bools): yield gemm, 4, 3, 2, 'float32', ('f', 'f', 'f'), \ (False, False), False, sliced, overwrite, init_res yield gemm, 32, 32, 32, 'float64', ('f', 'f', 'f'), (False, False), \ False, 1, False, False for alpha, beta, overwrite in product( - [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools): + [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools): yield gemm, 32, 23, 32, 'float32', ('f', 'f', 'f'), \ (False, False), False, 1, overwrite, True, alpha, beta + @guard_devsup def gemm(m, n, k, dtype, order, trans, offseted_o, sliced, overwrite, init_res, alpha=1.0, beta=0.0): if trans[0]: - shpA = (k,m) + shpA = (k, m) else: - shpA = (m,k) + shpA = (m, k) if trans[1]: - shpB = (n,k) + shpB = (n, k) else: - shpB = (k,n) + shpB = (k, n) cA, gA = gen_gpuarray(shpA, dtype, order=order[0], offseted_outer=offseted_o, @@ -125,7 +128,7 @@ def gemm(m, n, k, dtype, order, trans, offseted_o, sliced, overwrite, offseted_outer=offseted_o, sliced=sliced, ctx=context) if init_res: - cC, gC = gen_gpuarray((m,n), dtype, order=order[2], ctx=context) + cC, gC = gen_gpuarray((m, n), dtype, order=order[2], ctx=context) else: cC, gC = None, None @@ -143,13 +146,14 @@ def gemm(m, n, k, dtype, order, trans, offseted_o, sliced, overwrite, def test_ger(): 
bools = [False, True] - for (m,n), order, sliced_x, sliced_y in product( - [(4,5)], 'fc', [1, 2, -2, -1], [1, 2, -2, -1]): + for (m, n), order, sliced_x, sliced_y in product( + [(4, 5)], 'fc', [1, 2, -2, -1], [1, 2, -2, -1]): yield ger, m, n, 'float32', order, sliced_x, sliced_y, False yield ger, 4, 5, 'float64', 'f', 1, 1, False for init_res, overwrite in product(bools, bools): yield ger, 4, 5, 'float32', 'f', 1, 1, init_res, overwrite + def ger(m, n, dtype, order, sliced_x, sliced_y, init_res, overwrite=False): cX, gX = gen_gpuarray((m,), dtype, order, sliced=sliced_x, ctx=context) cY, gY = gen_gpuarray((n,), dtype, order, sliced=sliced_y, ctx=context) @@ -168,35 +172,37 @@ def ger(m, n, dtype, order, sliced_x, sliced_y, init_res, overwrite=False): numpy.testing.assert_allclose(cr, numpy.asarray(gr), rtol=1e-6) + def test_rgemmBatch_3d(): bools = [False, True] for b, (m, n, k), order, trans, offseted_o in product( - [1, 17, 31], [(24, 7, 16), (7, 16, 24)], list(product('fc', 'fc', 'c')), - list(product(bools, bools)), bools): + [1, 17, 31], [(24, 7, 16), (7, 16, 24)], + list(product('fc', 'fc', 'c')), + list(product(bools, bools)), bools): yield rgemmBatch_3d, b, m, n, k, 'float32', order, trans, \ offseted_o, 1, False, False - for sliced, overwrite, init_res in product( - [1, 2, -1, -2], bools, bools): + for sliced, overwrite, init_res in product([1, 2, -1, -2], bools, bools): yield rgemmBatch_3d, 5, 4, 3, 2, 'float32', ('f', 'f', 'c'), \ (False, False), False, sliced, overwrite, init_res - yield rgemmBatch_3d, 16, 16, 16, 16, 'float64', ('f', 'f', 'c'), (False, False), \ - False, 1, False, False + yield rgemmBatch_3d, 16, 16, 16, 16, 'float64', ('f', 'f', 'c'), \ + (False, False), False, 1, False, False for alpha, beta, overwrite in product( - [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools): + [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools): yield rgemmBatch_3d, 16, 16, 9, 16, 'float32', ('f', 'f', 'c'), \ (False, False), False, 1, overwrite, True, alpha, beta + 
@guard_devsup -def rgemmBatch_3d(b, m, n, k, dtype, order, trans, offseted_o, sliced, overwrite, - init_res, alpha=1.0, beta=0.0): +def rgemmBatch_3d(b, m, n, k, dtype, order, trans, offseted_o, sliced, + overwrite, init_res, alpha=1.0, beta=0.0): if trans[0]: - shpA = (b,k,m) + shpA = (b, k, m) else: - shpA = (b,m,k) + shpA = (b, m, k) if trans[1]: - shpB = (b,n,k) + shpB = (b, n, k) else: - shpB = (b,k,n) + shpB = (b, k, n) cA, gA = gen_gpuarray(shpA, dtype, order=order[0], offseted_outer=offseted_o, @@ -205,11 +211,11 @@ def rgemmBatch_3d(b, m, n, k, dtype, order, trans, offseted_o, sliced, overwrite offseted_outer=offseted_o, sliced=sliced, ctx=context) if init_res: - cC, gC = gen_gpuarray((b,m,n), dtype, order=order[2], ctx=context) + cC, gC = gen_gpuarray((b, m, n), dtype, order=order[2], ctx=context) else: cC, gC = None, None - cr = numpy.empty((b,m,n), dtype=dtype) + cr = numpy.empty((b, m, n), dtype=dtype) if dtype == 'float32': fn_gemm_c = fblas.sgemm else: @@ -217,9 +223,9 @@ def rgemmBatch_3d(b, m, n, k, dtype, order, trans, offseted_o, sliced, overwrite for i in range(b): cCi = cC if cC is None else cC[i] cr[i] = fn_gemm_c(alpha, cA[i], cB[i], beta, cCi, trans_a=trans[0], - trans_b=trans[1], overwrite_c=overwrite) + trans_b=trans[1], overwrite_c=overwrite) gr = gblas.gemmBatch_3d(alpha, gA, gB, beta, gC, trans_a=trans[0], - trans_b=trans[1], overwrite_c=overwrite) + trans_b=trans[1], overwrite_c=overwrite) numpy.testing.assert_allclose(cr, numpy.asarray(gr), rtol=1e-5) diff --git a/pygpu/tests/test_elemwise.py b/pygpu/tests/test_elemwise.py index ecec66b2ef..6bc6f9995b 100644 --- a/pygpu/tests/test_elemwise.py +++ b/pygpu/tests/test_elemwise.py @@ -69,13 +69,17 @@ def test_all(self): test_values = [((1, 4), (6, 4)), ((2, 1, 8, 7), (2, 2, 8, 7))] for shapea, shapeb in test_values: # Sould fail: dimensions are not all equal. 
- self.assertRaises(ValueError, self.run_ielemwise2, shapea, shapeb, False) + self.assertRaises(ValueError, self.run_ielemwise2, shapea, shapeb, + False) # Should fail: broascast should not be done on output. - self.assertRaises(ValueError, self.run_ielemwise2, shapea, shapeb, True) + self.assertRaises(ValueError, self.run_ielemwise2, shapea, shapeb, + True) # Should fail: dimensions are not all equal. - self.assertRaises(ValueError, self.check_elemwise2, shapeb, shapeb, shapea, False) + self.assertRaises(ValueError, self.check_elemwise2, shapeb, shapeb, + shapea, False) # Should fail: broadcast should not be done on output. - self.assertRaises(ValueError, self.check_elemwise2, shapeb, shapeb, shapea, True) + self.assertRaises(ValueError, self.check_elemwise2, shapeb, shapeb, + shapea, True) # Should pass: output would be done on read-only input. self.run_ielemwise2(shapeb, shapea, broadcast=True) # Should pass: output would be done on read-only inputs. @@ -98,12 +102,14 @@ def check_elemwise2(self, shapea, shapeb, output_shape, broadcast=True): na, ga = gen_gpuarray(shapea, ctx=context, cls=elemary) nb, gb = gen_gpuarray(shapeb, ctx=context, cls=elemary) odtype = get_common_dtype(ga, gb, True) - res = gpuarray.empty(output_shape, dtype=odtype, context=ga.context, cls=ga.__class__) + res = gpuarray.empty(output_shape, dtype=odtype, context=ga.context, + cls=ga.__class__) a_arg = as_argument(ga, 'a', read=True) b_arg = as_argument(gb, 'b', read=True) res_arg = as_argument(res, 'res', write=True) args = [res_arg, a_arg, b_arg] - oper = "res = (%(out_t)s)a %(op)s (%(out_t)s)b" % {'op': '+', 'out_t': dtype_to_ctype(odtype)} + oper = "res = (%(out_t)s)a %(op)s (%(out_t)s)b" % { + 'op': '+', 'out_t': dtype_to_ctype(odtype)} k = GpuElemwise(ga.context, oper, args, convert_f16=True) k(res, ga, gb, broadcast=broadcast) nres = na + nb @@ -113,7 +119,8 @@ def check_elemwise2(self, shapea, shapeb, output_shape, broadcast=True): @guard_devsup def elemwise2_ops_array(op, dtype1, 
dtype2, shape): ac, ag = gen_gpuarray(shape, dtype1, ctx=context, cls=elemary) - bc, bg = gen_gpuarray(shape, dtype2, nozeros=True, ctx=context, cls=elemary) + bc, bg = gen_gpuarray(shape, dtype2, nozeros=True, ctx=context, + cls=elemary) out_c = op(ac, bc) out_g = op(ag, bg) diff --git a/pygpu/tests/test_gpu_ndarray.py b/pygpu/tests/test_gpu_ndarray.py index e0f362403e..de5a3e1f4a 100644 --- a/pygpu/tests/test_gpu_ndarray.py +++ b/pygpu/tests/test_gpu_ndarray.py @@ -10,7 +10,7 @@ from nose.tools import assert_raises import pygpu -from pygpu.gpuarray import GpuArray, GpuContext, GpuKernel +from pygpu.gpuarray import GpuArray, GpuKernel from .support import (guard_devsup, check_meta, check_flags, check_all, check_content, gen_gpuarray, context as ctx, dtypes_all, @@ -41,7 +41,7 @@ def test_hash(): g = pygpu.empty((2, 3), context=ctx) exc = None try: - h = hash(g) + hash(g) except TypeError as e: exc = e assert exc is not None @@ -49,7 +49,8 @@ def test_hash(): def test_bool(): for data in [numpy.empty((0, 33)), [[1]], [[0]], [], [0], [1], 0, 1]: - assert bool(pygpu.asarray(data, context=ctx)) == bool(numpy.asarray(data)) + assert bool(pygpu.asarray(data, context=ctx)) == \ + bool(numpy.asarray(data)) def test_transfer(): @@ -71,12 +72,14 @@ def transfer(shp, dtype, offseted): assert a.dtype == b.dtype == c.dtype == dtype assert c.flags.c_contiguous + def test_cast(): for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype1 in dtypes_no_complex: for dtype2 in dtypes_no_complex: yield cast, shp, dtype1, dtype2 + @guard_devsup def cast(shp, dtype1, dtype2): a, b = gen_gpuarray(shp, dtype1, False, ctx=ctx) @@ -153,8 +156,7 @@ def ascontiguousarray(shp, dtype, offseted_o, offseted_i, sliced, order): b = pygpu.ascontiguousarray(gpu) # numpy upcast with a view to 1d scalar. 
- if (sliced != 1 or shp == () or - (offseted_i and len(shp) > 1)): + if (sliced != 1 or shp == () or (offseted_i and len(shp) > 1)): assert b is not gpu if sliced == 1 and not offseted_i: assert (a.data is cpu.data) == (b.bytes is gpu.bytes) @@ -193,7 +195,7 @@ def asfortranarray(shp, dtype, offseted_outer, offseted_inner, sliced, order): if gpu.flags['F_CONTIGUOUS']: assert b.gpudata == gpu.gpudata elif (sliced != 1 or shp == () or (offseted_outer and len(shp) > 1) or - (order != 'f' and len(shp) > 1)): + (order != 'f' and len(shp) > 1)): assert b is not gpu else: assert b is gpu @@ -259,7 +261,7 @@ def empty(shp, order, dtype): def test_empty_no_dtype(): - x = pygpu.empty((), context=ctx)# no dtype and order param + x = pygpu.empty((), context=ctx) # no dtype and order param y = numpy.empty(()) check_meta(x, y) @@ -401,6 +403,7 @@ def test_read(self): self.cpu = numpy.ndarray((3, 4, 2, 5), dtype="float32", order='C') self.assertRaises(ValueError, self.gpu.read, self.cpu[:, :, 0, :]) + def test_copy_view(): for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: @@ -416,9 +419,10 @@ def check_memory_region(a, a_op, b, b_op): assert numpy.may_share_memory(a, a_op) == \ pygpu.gpuarray.may_share_memory(b, b_op) + @guard_devsup def copy_view(shp, dtype, offseted, order1, order2): - #TODO test copy unbroadcast! + # TODO test copy unbroadcast! 
a, b = gen_gpuarray(shp, dtype, offseted, order=order1, ctx=ctx) assert numpy.allclose(a, numpy.asarray(b)) @@ -458,10 +462,10 @@ def test_shape(): ((4, 3), (12, -1)), ((4, 3), (-1, 12)), ((5, 4, 3, 2), (2, -1, 12)), ((4, 2), (2, 2, -1)), # ((4, 3), (13, -1)), - ]: + ]: for offseted in [True, False]: for order1 in ['c', 'f']: - if not -1 in shps[1]: + if -1 not in shps[1]: yield shape_, shps, offseted, order1 for order2 in ['a', 'c', 'f']: yield reshape, shps, offseted, order1, order2 @@ -523,7 +527,8 @@ def test_transpose(): for sliced in [1, 2, -2, -1]: yield transpose, shp, offseted, sliced, order for perm in permutations(list(range(len(shp)))): - yield transpose_perm, shp, perm, offseted, sliced, order + yield transpose_perm, shp, perm, offseted, sliced, \ + order def transpose(shp, offseted, sliced, order): @@ -595,8 +600,6 @@ def mapping_getitem_w_int(dtype, offseted): dim = (2,) a, _a = gen_gpuarray(dim, dtype, offseted, ctx=ctx) - import sys - init_ref_count = sys.getrefcount(_a) _cmp(_a[...], a[...]) _cmp(_a[...], a[...]) _cmp(_a[...], a[...]) @@ -628,19 +631,19 @@ def mapping_getitem_w_int(dtype, offseted): _cmpf(_a, (10, 0, 0, 0)) _cmpf(_a, -10) - #test with integer + # test with integer _cmp(_a[1], a[1]) _cmp(_a[-1], a[-1]) _cmp(_a[numpy.int64(1)], a[numpy.int64(1)]) _cmp(_a[numpy.int64(-1)], a[numpy.int64(-1)]) - #test with slice + # test with slice _cmp(_a[1:], a[1:]) _cmp(_a[1:2], a[1:2]) _cmp(_a[-1:1], a[-1:1]) _cmp(_a[6:7:], a[6:7:]) - #test with tuple (mix slice, integer, numpy.int64) + # test with tuple (mix slice, integer, numpy.int64) _cmpNs(_a[0, 0, ::numpy.int64(-1), ::-1], a[0, 0, ::-1, ::-1]) _cmpNs(_a[:, :, ::numpy.int64(-1), ::-1], a[:, :, ::-1, ::-1]) _cmpNs(_a[:, :, numpy.int64(1), -1], a[:, :, 1, -1]) @@ -654,11 +657,11 @@ def mapping_getitem_w_int(dtype, offseted): _cmpNs(_a[0, ::-2, -1], a[0, ::-2, -1]) _cmp(_a[-1, -1, -1, -2], a[-1, -1, -1, -2]) - #test ellipse + # test ellipse _cmp(_a[...], a[...]) -def _cmp(x,y): +def _cmp(x, y): 
assert isinstance(x, GpuArray) assert x.shape == y.shape assert x.dtype == y.dtype @@ -759,6 +762,7 @@ def test_flags(): 'carray', 'forc', 'fnc', 'farray']: yield flag_prop, p + def flag_dict(fl): c2, g2 = gen_gpuarray((2, 3), dtype='float32', ctx=ctx, order='c') c3, g3 = gen_gpuarray((2, 3), dtype='float32', ctx=ctx, order='f') @@ -766,6 +770,7 @@ def flag_dict(fl): assert c2.flags[fl] == g2.flags[fl] assert c3.flags[fl] == g3.flags[fl] + def flag_prop(p): c2, g2 = gen_gpuarray((2, 3), dtype='float32', ctx=ctx, order='c') c3, g3 = gen_gpuarray((2, 3), dtype='float32', ctx=ctx, order='f') @@ -806,7 +811,8 @@ def test_GpuContext(self): pickle.dumps(ctx, protocol=-1) def test_GpuKernel(self): - k = GpuKernel("KERNEL void nothing(GLOBAL_MEM ga_float *in) {in[0] = 0;}", "nothing", [], context=ctx) + k = GpuKernel("KERNEL void nothing(GLOBAL_MEM ga_float *in) " + "{in[0] = 0;}", "nothing", [], context=ctx) with self.assertRaises(RuntimeError): pickle.dumps(k) with self.assertRaises(RuntimeError): diff --git a/pygpu/tests/test_reduction.py b/pygpu/tests/test_reduction.py index aad9d556ee..4a4b5f6c2a 100644 --- a/pygpu/tests/test_reduction.py +++ b/pygpu/tests/test_reduction.py @@ -72,10 +72,13 @@ def test_red_broadcast(): nz = numpy.apply_along_axis(sum, ax, nz).astype(dtype) args = [as_argument(gx, 'a'), as_argument(gy, 'b')] - gz = ReductionKernel(context, dtype, "0", "a+b", redux, map_expr="a[i]*b[i]", arguments=args)(gx, gy, broadcast=True) + gz = ReductionKernel(context, dtype, "0", "a+b", redux, + map_expr="a[i]*b[i]", arguments=args)( + gx, gy, broadcast=True) assert numpy.allclose(nz, numpy.asarray(gz)) + def test_reduction_ops(): for axis in [None, 0, 1]: for op in ['all', 'any']: From f3e9efc2b0e22b6e82f3d727fda47ca87725ba85 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 14 Jun 2017 12:57:06 -0400 Subject: [PATCH 374/597] Add empty line for style. 
--- pygpu/tests/support.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pygpu/tests/support.py b/pygpu/tests/support.py index 3992b2572b..611b68d24d 100644 --- a/pygpu/tests/support.py +++ b/pygpu/tests/support.py @@ -2,6 +2,7 @@ import os import sys + import numpy from nose.plugins.skip import SkipTest From 613bcf5e9c27bca3033d300806b904075bec62c0 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 14 Jun 2017 13:03:30 -0400 Subject: [PATCH 375/597] Remove use of backslash as line continuation. --- pygpu/_array.py | 16 ++++++------ pygpu/gpuarray.pyx | 22 +++++++++-------- pygpu/tests/test_blas.py | 44 ++++++++++++++++----------------- pygpu/tests/test_gpu_ndarray.py | 20 +++++++-------- 4 files changed, 52 insertions(+), 50 deletions(-) diff --git a/pygpu/_array.py b/pygpu/_array.py index 14d5879010..2fc793f838 100644 --- a/pygpu/_array.py +++ b/pygpu/_array.py @@ -143,11 +143,11 @@ def __divmod__(self, other): mod = self._empty_like_me(dtype=odtype) if odtype.kind == 'f': - tmpl = "div = floor((%(out_t)s)a / (%(out_t)s)b)," \ - "mod = fmod((%(out_t)s)a, (%(out_t)s)b)" + tmpl = ("div = floor((%(out_t)s)a / (%(out_t)s)b)," + "mod = fmod((%(out_t)s)a, (%(out_t)s)b)") else: - tmpl = "div = (%(out_t)s)a / (%(out_t)s)b," \ - "mod = a %% b" + tmpl = ("div = (%(out_t)s)a / (%(out_t)s)b," + "mod = a %% b") ksrc = tmpl % {'out_t': dtype_to_ctype(odtype)} @@ -168,11 +168,11 @@ def __rdivmod__(self, other): mod = self._empty_like_me(dtype=odtype) if odtype.kind == 'f': - tmpl = "div = floor((%(out_t)s)a / (%(out_t)s)b)," \ - "mod = fmod((%(out_t)s)a, (%(out_t)s)b)" + tmpl = ("div = floor((%(out_t)s)a / (%(out_t)s)b)," + "mod = fmod((%(out_t)s)a, (%(out_t)s)b)") else: - tmpl = "div = (%(out_t)s)a / (%(out_t)s)b," \ - "mod = a %% b" + tmpl = ("div = (%(out_t)s)a / (%(out_t)s)b," + "mod = a %% b") ksrc = tmpl % {'out_t': dtype_to_ctype(odtype)} diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 72fa6d4fb8..7899e5cac1 100644 --- a/pygpu/gpuarray.pyx +++ 
b/pygpu/gpuarray.pyx @@ -1374,8 +1374,8 @@ cdef GpuArray pygpu_empty_like(GpuArray a, ga_order ord, int typecode): cdef GpuArray res if ord == GA_ANY_ORDER: - if py_CHKFLAGS(a, GA_F_CONTIGUOUS) and \ - not py_CHKFLAGS(a, GA_C_CONTIGUOUS): + if (py_CHKFLAGS(a, GA_F_CONTIGUOUS) and + not py_CHKFLAGS(a, GA_C_CONTIGUOUS)): ord = GA_F_ORDER else: ord = GA_C_ORDER @@ -1688,8 +1688,10 @@ cdef class GpuArray: """ if not np.PyArray_ISBEHAVED(dst): raise ValueError, "Destination Numpy array is not well behaved: aligned and writeable" - if not ((self.flags.c_contiguous and self.flags.aligned and dst.flags['C_CONTIGUOUS']) or \ - (self.flags.f_contiguous and self.flags.aligned and dst.flags['F_CONTIGUOUS'])): + if (not ((self.flags.c_contiguous and self.flags.aligned and + dst.flags['C_CONTIGUOUS']) or + (self.flags.f_contiguous and self.flags.aligned and + dst.flags['F_CONTIGUOUS']))): raise ValueError, "GpuArray and Numpy array do not match in contiguity or GpuArray is not aligned" if self.dtype != dst.dtype: raise ValueError, "GpuArray and Numpy array do not have matching data types" @@ -2033,13 +2035,13 @@ cdef class GpuArray: # is also required for numpy compat. 
el = key.index(Ellipsis) if isinstance(key, tuple): - key = key[:el] + \ - (Ellipsis,)*(self.ga.nd - (len(key) - 1)) + \ - key[el+1:] + key = (key[:el] + + (Ellipsis,)*(self.ga.nd - (len(key) - 1)) + + key[el+1:]) else: - key = key[:el] + \ - [Ellipsis,]*(self.ga.nd - (len(key) - 1)) + \ - key[el+1:] + key = (key[:el] + + [Ellipsis,]*(self.ga.nd - (len(key) - 1)) + + key[el+1:]) if len(key) > self.ga.nd: raise IndexError, "too many indices" for i in range(0, len(key)): diff --git a/pygpu/tests/test_blas.py b/pygpu/tests/test_blas.py index 112c2a5ef6..c945fdf2de 100644 --- a/pygpu/tests/test_blas.py +++ b/pygpu/tests/test_blas.py @@ -49,16 +49,16 @@ def test_gemv(): bools = [False, True] for shape, order, trans, offseted_i, sliced in product( [(100, 128), (128, 50)], 'fc', bools, bools, [1, 2, -1, -2]): - yield gemv, shape, 'float32', order, trans, \ - offseted_i, sliced, True, False + yield (gemv, shape, 'float32', order, trans, + offseted_i, sliced, True, False) for overwrite, init_y in product(bools, bools): - yield gemv, (4, 3), 'float32', 'f', False, False, 1, \ - overwrite, init_y + yield (gemv, (4, 3), 'float32', 'f', False, False, 1, + overwrite, init_y) yield gemv, (32, 32), 'float64', 'f', False, False, 1, True, False for alpha, beta, overwrite in product( [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools): - yield gemv, (32, 32), 'float32', 'f', False, False, 1, \ - overwrite, True, alpha, beta + yield (gemv, (32, 32), 'float32', 'f', False, False, 1, + overwrite, True, alpha, beta) @guard_devsup @@ -96,17 +96,17 @@ def test_gemm(): for (m, n, k), order, trans, offseted_o in product( [(48, 15, 32), (15, 32, 48)], list(product(*['fc']*3)), list(product(bools, bools)), bools): - yield gemm, m, n, k, 'float32', order, trans, \ - offseted_o, 1, False, False + yield (gemm, m, n, k, 'float32', order, trans, + offseted_o, 1, False, False) for sliced, overwrite, init_res in product([1, 2, -1, -2], bools, bools): - yield gemm, 4, 3, 2, 'float32', ('f', 'f', 'f'), \ - 
(False, False), False, sliced, overwrite, init_res - yield gemm, 32, 32, 32, 'float64', ('f', 'f', 'f'), (False, False), \ - False, 1, False, False + yield (gemm, 4, 3, 2, 'float32', ('f', 'f', 'f'), + (False, False), False, sliced, overwrite, init_res) + yield (gemm, 32, 32, 32, 'float64', ('f', 'f', 'f'), (False, False), + False, 1, False, False) for alpha, beta, overwrite in product( [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools): - yield gemm, 32, 23, 32, 'float32', ('f', 'f', 'f'), \ - (False, False), False, 1, overwrite, True, alpha, beta + yield (gemm, 32, 23, 32, 'float32', ('f', 'f', 'f'), + (False, False), False, 1, overwrite, True, alpha, beta) @guard_devsup @@ -179,17 +179,17 @@ def test_rgemmBatch_3d(): [1, 17, 31], [(24, 7, 16), (7, 16, 24)], list(product('fc', 'fc', 'c')), list(product(bools, bools)), bools): - yield rgemmBatch_3d, b, m, n, k, 'float32', order, trans, \ - offseted_o, 1, False, False + yield (rgemmBatch_3d, b, m, n, k, 'float32', order, trans, + offseted_o, 1, False, False) for sliced, overwrite, init_res in product([1, 2, -1, -2], bools, bools): - yield rgemmBatch_3d, 5, 4, 3, 2, 'float32', ('f', 'f', 'c'), \ - (False, False), False, sliced, overwrite, init_res - yield rgemmBatch_3d, 16, 16, 16, 16, 'float64', ('f', 'f', 'c'), \ - (False, False), False, 1, False, False + yield (rgemmBatch_3d, 5, 4, 3, 2, 'float32', ('f', 'f', 'c'), + (False, False), False, sliced, overwrite, init_res) + yield (rgemmBatch_3d, 16, 16, 16, 16, 'float64', ('f', 'f', 'c'), + (False, False), False, 1, False, False) for alpha, beta, overwrite in product( [0, 1, -1, 0.6], [0, 1, -1, 0.6], bools): - yield rgemmBatch_3d, 16, 16, 9, 16, 'float32', ('f', 'f', 'c'), \ - (False, False), False, 1, overwrite, True, alpha, beta + yield (rgemmBatch_3d, 16, 16, 9, 16, 'float32', ('f', 'f', 'c'), + (False, False), False, 1, overwrite, True, alpha, beta) @guard_devsup diff --git a/pygpu/tests/test_gpu_ndarray.py b/pygpu/tests/test_gpu_ndarray.py index de5a3e1f4a..562371a2fe 
100644 --- a/pygpu/tests/test_gpu_ndarray.py +++ b/pygpu/tests/test_gpu_ndarray.py @@ -49,8 +49,8 @@ def test_hash(): def test_bool(): for data in [numpy.empty((0, 33)), [[1]], [[0]], [], [0], [1], 0, 1]: - assert bool(pygpu.asarray(data, context=ctx)) == \ - bool(numpy.asarray(data)) + assert (bool(pygpu.asarray(data, context=ctx)) == + bool(numpy.asarray(data))) def test_transfer(): @@ -143,8 +143,8 @@ def test_ascontiguousarray(): for offseted_i in [True, True]: for sliced in [1, 2, -1, -2]: for order in ['f', 'c']: - yield ascontiguousarray, shp, dtype, offseted_o, \ - offseted_i, sliced, order + yield (ascontiguousarray, shp, dtype, offseted_o, + offseted_i, sliced, order) @guard_devsup @@ -179,8 +179,8 @@ def test_asfortranarray(): for offseted_inner in [True, False]: for sliced in [1, 2, -1, -2]: for order in ['f', 'c']: - yield asfortranarray, shp, dtype, offseted_outer, \ - offseted_inner, sliced, order + yield (asfortranarray, shp, dtype, offseted_outer, + offseted_inner, sliced, order) @guard_devsup @@ -416,8 +416,8 @@ def test_copy_view(): def check_memory_region(a, a_op, b, b_op): - assert numpy.may_share_memory(a, a_op) == \ - pygpu.gpuarray.may_share_memory(b, b_op) + assert (numpy.may_share_memory(a, a_op) == + pygpu.gpuarray.may_share_memory(b, b_op)) @guard_devsup @@ -527,8 +527,8 @@ def test_transpose(): for sliced in [1, 2, -2, -1]: yield transpose, shp, offseted, sliced, order for perm in permutations(list(range(len(shp)))): - yield transpose_perm, shp, perm, offseted, sliced, \ - order + yield (transpose_perm, shp, perm, offseted, sliced, + order) def transpose(shp, offseted, sliced, order): From 2932294566480a48c254dc7c8e21ae3a563909ec Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 14 Jun 2017 16:49:54 -0400 Subject: [PATCH 376/597] Remove non-functional abstraction --- src/gpuarray_buffer_cuda.c | 1 - src/gpuarray_buffer_opencl.c | 1 - 2 files changed, 2 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c 
b/src/gpuarray_buffer_cuda.c index 6cbf1a0da8..8f6e2cdb6f 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -444,7 +444,6 @@ static const char CUDA_PREAMBLE[] = "#define GLOBAL_MEM /* empty */\n" "#define LOCAL_MEM __shared__\n" "#define LOCAL_MEM_ARG /* empty */\n" - "#define REQD_WG_SIZE(X,Y,Z) __launch_bounds__(X*Y, Z)\n" "#ifdef NAN\n" "#undef NAN\n" "#endif\n" diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index b49cf445aa..7c45208717 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -288,7 +288,6 @@ static const char CL_PREAMBLE[] = "#define GLOBAL_MEM __global\n" "#define LOCAL_MEM __local\n" "#define LOCAL_MEM_ARG __local\n" - "#define REQD_WG_SIZE(x, y, z) __attribute__((reqd_work_group_size(x, y, z)))\n" "#ifndef NULL\n" " #define NULL ((void*)0)\n" "#endif\n" From ee1ff9b4ff660af4e1a6cd237fb600953c75f588 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 20 Jun 2017 16:16:32 -0400 Subject: [PATCH 377/597] Changes for release 0.6.6 --- doc/conf.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index 0a63642eb5..cc98bac7d9 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -59,7 +59,7 @@ # The short X.Y version. version = '0.6' # The full version, including alpha/beta/rc tags. -release = '0.6.5' +release = '0.6.6' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/setup.py b/setup.py index dd059fcb16..f1e8ec6639 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ MAJOR = 0 MINOR = 6 -PATCH = 5 +PATCH = 6 SUFFIX = '' # include the '.' FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX) From 37907f2620cc18c50d8e8c8506b2aa9f67598b61 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 21 Jun 2017 09:51:20 -0400 Subject: [PATCH 378/597] Fix broken pygpu.test(). 
--- pygpu/tests/main.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pygpu/tests/main.py b/pygpu/tests/main.py index 57499a5c8c..6d0c7c960f 100644 --- a/pygpu/tests/main.py +++ b/pygpu/tests/main.py @@ -1,8 +1,9 @@ import os +import nose.plugins.builtin from nose.config import Config from nose.plugins.manager import PluginManager -from numpy.testing.nosetester import import_nose, NoseTester +from numpy.testing.nosetester import NoseTester from numpy.testing.noseclasses import KnownFailure, NumpyTestProgram @@ -50,9 +51,6 @@ def prepare_test_args(self, verbose=1, extra_argv=None, coverage=False, Takes the same arguments as `test`. """ - # fail with nice error message if nose is not present - nose = import_nose() - # compile argv argv = self._test_argv(verbose, extra_argv) From e15d70be34d07fb1c28b031b752dffe87c3360ba Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 21 Jun 2017 11:07:21 -0400 Subject: [PATCH 379/597] Changes for release 0.6.7 --- doc/conf.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index cc98bac7d9..d3178534bc 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -59,7 +59,7 @@ # The short X.Y version. version = '0.6' # The full version, including alpha/beta/rc tags. -release = '0.6.6' +release = '0.6.7' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/setup.py b/setup.py index f1e8ec6639..10ae097e25 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ MAJOR = 0 MINOR = 6 -PATCH = 6 +PATCH = 7 SUFFIX = '' # include the '.' FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX) From d66fb24cdb359f13428cb17f8bbbf34fa3e97f79 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Thu, 22 Jun 2017 08:49:40 -0400 Subject: [PATCH 380/597] Fix windows compilation issue: add inline definition for MSVC 2008 in utils.h. 
--- src/gpuarray/util.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/gpuarray/util.h b/src/gpuarray/util.h index a8a58ca4d3..26e43d8aa0 100644 --- a/src/gpuarray/util.h +++ b/src/gpuarray/util.h @@ -15,6 +15,13 @@ extern "C" { #include #include +/* MSVC 2008 does not support "inline". */ +#ifdef _MSC_VER +#ifndef inline +#define inline __inline +#endif +#endif + /** * Registers a type with the kernel machinery. * From c09aba7cb9b3236e3540fefbbd2e6ec1b0bb993c Mon Sep 17 00:00:00 2001 From: notoraptor Date: Thu, 22 Jun 2017 10:36:54 -0400 Subject: [PATCH 381/597] Factorize code for Windows compatibility. --- src/CMakeLists.txt | 1 + src/gpuarray/array.h | 6 ------ src/gpuarray/config.h | 1 + src/gpuarray/util.h | 7 ------- src/gpuarray/wincompat/util.h | 20 ++++++++++++++++++++ src/gpuarray_buffer_opencl.c | 4 ---- src/util/error.h | 7 ------- 7 files changed, 22 insertions(+), 24 deletions(-) create mode 100644 src/gpuarray/wincompat/util.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a45db024ff..bbb1a36b0f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -132,6 +132,7 @@ install(FILES ${headers} DESTINATION include/gpuarray) if(NOT UNIX) install(FILES gpuarray/wincompat/stdint.h DESTINATION include/gpuarray/wincompat) + install(FILES gpuarray/wincompat/util.h DESTINATION include/gpuarray/wincompat) endif() install(TARGETS gpuarray gpuarray-static diff --git a/src/gpuarray/array.h b/src/gpuarray/array.h index a7aa899559..7c659964cd 100644 --- a/src/gpuarray/array.h +++ b/src/gpuarray/array.h @@ -8,12 +8,6 @@ #include #include -#ifdef _MSC_VER -#ifndef inline -#define inline __inline -#endif -#endif - #ifdef __cplusplus extern "C" { #endif diff --git a/src/gpuarray/config.h b/src/gpuarray/config.h index 571f81cfe6..a5c3cbdb4c 100644 --- a/src/gpuarray/config.h +++ b/src/gpuarray/config.h @@ -32,6 +32,7 @@ #endif #define ssize_t intptr_t #define SSIZE_MAX INTPTR_MAX +#include #else #include #include diff --git 
a/src/gpuarray/util.h b/src/gpuarray/util.h index 26e43d8aa0..a8a58ca4d3 100644 --- a/src/gpuarray/util.h +++ b/src/gpuarray/util.h @@ -15,13 +15,6 @@ extern "C" { #include #include -/* MSVC 2008 does not support "inline". */ -#ifdef _MSC_VER -#ifndef inline -#define inline __inline -#endif -#endif - /** * Registers a type with the kernel machinery. * diff --git a/src/gpuarray/wincompat/util.h b/src/gpuarray/wincompat/util.h new file mode 100644 index 0000000000..94cf4e4a98 --- /dev/null +++ b/src/gpuarray/wincompat/util.h @@ -0,0 +1,20 @@ +#ifndef WINCOMPAT_UTIL +#define WINCOMPAT_UTIL + +#ifdef _MSC_VER + /* MSVC 2008 does not support "inline". */ + #ifndef inline + #define inline __inline + #endif + #ifndef snprintf + #define snprintf _snprintf + #endif + #ifndef strdup + #define strdup _strdup + #endif + #ifndef alloca + #define alloca _alloca + #endif +#endif + +#endif diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 7c45208717..5d38aa90f5 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -15,10 +15,6 @@ #include "loaders/libclblas.h" #include "loaders/libclblast.h" -#ifdef _MSC_VER -#define strdup _strdup -#endif - #define _unused(x) ((void)x) #define SSIZE_MIN (-(SSIZE_MAX-1)) diff --git a/src/util/error.h b/src/util/error.h index b7a50fc6a8..fc1ecb1663 100644 --- a/src/util/error.h +++ b/src/util/error.h @@ -6,13 +6,6 @@ #include -/* MSVC 2008 does not support "inline". 
*/ -#ifdef _MSC_VER -#ifndef inline -#define inline __inline -#endif -#endif - /* 1024 - 4 for the int that goes after */ #define ERROR_MSGBUF_LEN 1020 From 488f2c0bfdc08515ad5433fc7461e063bddc3fa9 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Thu, 22 Jun 2017 13:12:53 -0400 Subject: [PATCH 382/597] Update private_config.h.in --- src/private_config.h.in | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/private_config.h.in b/src/private_config.h.in index f58a03edae..597ea3e735 100644 --- a/src/private_config.h.in +++ b/src/private_config.h.in @@ -19,15 +19,6 @@ extern "C" { } #endif -#ifdef _MSC_VER -/* God damn Microsoft ... */ -#define snprintf _snprintf -#define strdup _strdup -/* MS VC++ 2008 does not support inline */ -#define inline __inline -#define alloca _alloca -#endif - #ifdef _MSC_VER #define SPREFIX "I" #else From 05110add0da055ef4f684adf62e9afdd1f8fdd7c Mon Sep 17 00:00:00 2001 From: notoraptor Date: Mon, 26 Jun 2017 08:06:43 -0400 Subject: [PATCH 383/597] Put "inline" definition in public config. Let private definitions in private config. 
--- src/CMakeLists.txt | 1 - src/gpuarray/config.h | 4 +++- src/gpuarray/wincompat/util.h | 20 -------------------- src/private_config.h.in | 7 +++++++ 4 files changed, 10 insertions(+), 22 deletions(-) delete mode 100644 src/gpuarray/wincompat/util.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bbb1a36b0f..a45db024ff 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -132,7 +132,6 @@ install(FILES ${headers} DESTINATION include/gpuarray) if(NOT UNIX) install(FILES gpuarray/wincompat/stdint.h DESTINATION include/gpuarray/wincompat) - install(FILES gpuarray/wincompat/util.h DESTINATION include/gpuarray/wincompat) endif() install(TARGETS gpuarray gpuarray-static diff --git a/src/gpuarray/config.h b/src/gpuarray/config.h index a5c3cbdb4c..fb452cc2be 100644 --- a/src/gpuarray/config.h +++ b/src/gpuarray/config.h @@ -25,6 +25,9 @@ #ifdef _MSC_VER #include +#ifndef inline +#define inline __inline +#endif #if _MSC_VER < 1600 #include #else @@ -32,7 +35,6 @@ #endif #define ssize_t intptr_t #define SSIZE_MAX INTPTR_MAX -#include #else #include #include diff --git a/src/gpuarray/wincompat/util.h b/src/gpuarray/wincompat/util.h deleted file mode 100644 index 94cf4e4a98..0000000000 --- a/src/gpuarray/wincompat/util.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef WINCOMPAT_UTIL -#define WINCOMPAT_UTIL - -#ifdef _MSC_VER - /* MSVC 2008 does not support "inline". */ - #ifndef inline - #define inline __inline - #endif - #ifndef snprintf - #define snprintf _snprintf - #endif - #ifndef strdup - #define strdup _strdup - #endif - #ifndef alloca - #define alloca _alloca - #endif -#endif - -#endif diff --git a/src/private_config.h.in b/src/private_config.h.in index 597ea3e735..ff03831203 100644 --- a/src/private_config.h.in +++ b/src/private_config.h.in @@ -19,6 +19,13 @@ extern "C" { } #endif +#ifdef _MSC_VER +/* God damn Microsoft ... 
*/ +#define snprintf _snprintf +#define strdup _strdup +#define alloca _alloca +#endif + #ifdef _MSC_VER #define SPREFIX "I" #else From cdeab7b611ee0d8239c8e8d8d303de1ba9677d16 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 26 Jun 2017 12:40:45 -0400 Subject: [PATCH 384/597] Make the build happen on appveyor. --- .appveyor.yml | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 .appveyor.yml diff --git a/.appveyor.yml b/.appveyor.yml new file mode 100644 index 0000000000..8aad386df7 --- /dev/null +++ b/.appveyor.yml @@ -0,0 +1,39 @@ +version: '0.6.7.{build}' + +image: Visual Studio 2015 + +init: + - git config --global core.autocrlf input + - cmd: cmake --version + - cmd: msbuild /version + +platform: + - x64 + +clone_folder: C:\projects\libgpuarray + +configuration: + - Release + - Debug + +environment: + matrix: + - PYTHON: "C:\\Python27" + VS_PATH: "C:\\Users\\appveyor\\AppData\\Local\\Programs\\Common\\Microsoft\\Visual C++ for Python\\9.0" + - PYTHON: "C:\\Python35" + VS_PATH: "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC" + +build_script: + - echo "Python:" "%PYTHON%" + - echo "Config:" "%CONFIGURATION%" + - echo "VS path:" "%VS_PATH%" + - cd "%VS_PATH%" + - vcvarsall x64 + - set + - cd C:\projects\libgpuarray + - md %CONFIGURATION% + - cd %CONFIGURATION% + - cmake .. -DCMAKE_BUILD_TYPE=%CONFIGURATION% -G "NMake Makefiles" + - cmake --build . --config %CONFIGURATION% + +build: script From 6debab1e9ed23a268ab4255a79eacf22994c3422 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 26 Jun 2017 14:31:49 -0400 Subject: [PATCH 385/597] New version location. 
--- release.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/release.txt b/release.txt index 72e850cccc..a90f758d39 100644 --- a/release.txt +++ b/release.txt @@ -2,6 +2,7 @@ Release process: - Make sure you are on the proper release branch - Update the version in setup.py - Update the version in doc/conf.py +- Update the version in .appveyor.yml - Commit the changes with message "Changes for release X.Y.Z" git commit -m "Changes for release X.Y.Z" - Make a git tag From 345b3f3b9371c2c158795df852d8aef98b3a3170 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 26 Jun 2017 14:47:06 -0400 Subject: [PATCH 386/597] Don't increment build number for PRs. --- .appveyor.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.appveyor.yml b/.appveyor.yml index 8aad386df7..bb00b631c3 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,5 +1,8 @@ version: '0.6.7.{build}' +pull_requests: + do_not_increment_build_number: true + image: Visual Studio 2015 init: From eb3127b86af0486bc275431b8d7d102019cab5ec Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 27 Jun 2017 10:53:33 -0400 Subject: [PATCH 387/597] Changes for release 0.6.8 --- .appveyor.yml | 2 +- doc/conf.py | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index bb00b631c3..26d5064e23 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,4 +1,4 @@ -version: '0.6.7.{build}' +version: '0.6.8.{build}' pull_requests: do_not_increment_build_number: true diff --git a/doc/conf.py b/doc/conf.py index d3178534bc..58dba54436 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -59,7 +59,7 @@ # The short X.Y version. version = '0.6' # The full version, including alpha/beta/rc tags. -release = '0.6.7' +release = '0.6.8' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/setup.py b/setup.py index 10ae097e25..f890ed7189 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ MAJOR = 0 MINOR = 6 -PATCH = 7 +PATCH = 8 SUFFIX = '' # include the '.' FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX) From 113f91974fb0cd6b8f3123201e0c3923d86eff50 Mon Sep 17 00:00:00 2001 From: Deepali Chourasia Date: Thu, 6 Jul 2017 08:27:07 -0500 Subject: [PATCH 388/597] Added fix to detect 'check' version using pkg-config. Also modified the fall-back code to - check for the availability of the required libraries: lpthread, lrt and lm - link to these libraries. --- tests/CMakeLists.txt | 50 ++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 74cf17f7b2..ada91c409a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,33 +1,47 @@ include(CheckCSourceCompiles) +include(CheckLibraryExists) find_package(PkgConfig) pkg_search_module(CHECK check) -if(NOT CHECK_FOUND) - +if(CHECK_FOUND) + if(CHECK_VERSION VERSION_LESS 0.10.0) + MESSAGE( "Check version older than 0.10.0" ) + set(CHECK_FOUND 0) + endif() +else() find_path(CHECK_INCLUDE_DIRS check.h) find_library(CHECK_LIBRARIES NAMES check) - if(CHECK_INCLUDE_DIRS AND CHECK_LIBRARIES) set(CHECK_CFLAGS) set(CHECK_LIBRARY_DIRS) set(CHECK_FOUND 1) endif() - -endif() - -if(CHECK_FOUND) - set(CMAKE_REQUIRED_FLAGS ${CHECK_C_FLAGS} ${CHECK_LDFLAGS_OTHERS}) - set(CMAKE_REQUIRED_INCLUDES ${CHECK_INCLUDE_DIRS}) - set(CMAKE_REQUIRED_LIBRARIES ${CHECK_LIBRARIES}) - CHECK_C_SOURCE_COMPILES( - "#include - int main() { - ck_assert_ptr_ne(NULL, NULL); - }" - CHECK_FUNCS) - if (NOT CHECK_FUNCS) - set(CHECK_FOUND 0) + if(CHECK_FOUND) + set(CMAKE_REQUIRED_FLAGS ${CHECK_C_FLAGS} ${CHECK_LDFLAGS_OTHERS}) + set(CMAKE_REQUIRED_INCLUDES ${CHECK_INCLUDE_DIRS}) + CHECK_LIBRARY_EXISTS(pthread pthread_create "" HAVE_PTHREAD) + if (HAVE_PTHREAD) + set(CHECK_LIBRARIES ${CHECK_LIBRARIES} pthread) + endif 
(HAVE_PTHREAD) + CHECK_LIBRARY_EXISTS(rt nanosleep "" HAVE_LIBRT) + if (HAVE_LIBRT) + set(CHECK_LIBRARIES ${CHECK_LIBRARIES} rt) + endif (HAVE_LIBRT) + CHECK_LIBRARY_EXISTS(m cos "" HAVE_LIBM) + if (HAVE_LIBM) + set(CHECK_LIBRARIES ${CHECK_LIBRARIES} m) + endif (HAVE_LIBM) + set(CMAKE_REQUIRED_LIBRARIES ${CHECK_LIBRARIES}) + CHECK_C_SOURCE_COMPILES( + "#include + int main() { + ck_assert_ptr_ne(NULL, NULL); + }" + CHECK_FUNCS) + if (NOT CHECK_FUNCS) + set(CHECK_FOUND 0) + endif() endif() endif() From 9456cf1f8ea7f301b940e9dd736d61768fe15142 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Fri, 7 Jul 2017 11:40:22 -0400 Subject: [PATCH 389/597] Loop over supported CUDA versions to find installed CUDA on Windows and Mac. --- src/gpuarray_buffer_cuda.c | 39 +++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 8f6e2cdb6f..ba169eefb7 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -132,6 +132,7 @@ static int setup_lib(void) { const char *ver; CUresult err; int res, tmp; + int search_version = 0; if (!setup_done) { res = load_libcuda(global_err); @@ -147,13 +148,49 @@ static int setup_lib(void) { return error_set(global_err, GA_IMPL_ERROR, "cuDriverGetVersion failed"); major = tmp / 1000; minor = (tmp / 10) % 10; + #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) || defined(__APPLE__) + /* We will dynamically search the right CUDA version only on Windows and Macintosh systems, + and only if user has not explicitely specified GPUARRAY_CUDA_VERSION. */ + search_version = 1; + #endif } else { major = ver[0] - '0'; minor = ver[1] - '0'; } + /* NB: next line will cause problems if a CUDA 10.0 (or 9.11) is released in the future. 
*/ if (major > 9 || major < 0 || minor > 9 || minor < 0) return error_fmt(global_err, GA_VALUE_ERROR, "Invalid cuda version: %d.%d", major, minor); - res = load_libnvrtc(major, minor, global_err); + if (!search_version) { + res = load_libnvrtc(major, minor, global_err); + } else { + /* First case in next array is reserved to eventually receive the version returned by cuDriverGetVersion(). */ + int versions[] = {-1, 80, 75}; + int versions_length = sizeof(versions) / sizeof(int); + int current_version = major * 10 + minor; + int i = 0; + for (i = 1; i < versions_length && versions[i] != current_version; ++i); + if (i == versions_length) { + /* Current version not found in the list of versions. We add it at top of the list. */ + versions[0] = current_version; + /* We will iterate on versions from the first. */ + i = 0; + } else { + /* Current version found in the list of known versions. No need to add it to the list. */ + i = 1; + }; + do { + major = versions[i] / 10; + minor = versions[i] % 10; + res = load_libnvrtc(major, minor, global_err); + ++i; + } while(res != GA_NO_ERROR && i < versions_length); + #ifdef DEBUG + if (res == GA_NO_ERROR) + fprintf(stderr, "Detected CUDA %d.%d.\n", major, minor); + else + fprintf(stderr, "Unable to detect a CUDA version.\n"); + #endif + } if (res != GA_NO_ERROR) return res; setup_done = 1; From c36923f4481b1525993959709374a0702670a760 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Fri, 7 Jul 2017 11:47:32 -0400 Subject: [PATCH 390/597] Add a debug checking for loaded cublas version. 
--- src/loaders/libcublas.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/loaders/libcublas.c b/src/loaders/libcublas.c index 57c8c0e295..95719bb6d2 100644 --- a/src/loaders/libcublas.c +++ b/src/loaders/libcublas.c @@ -44,6 +44,9 @@ int load_libcublas(int major, int minor, error *e) { static const char DIGITS[] = "0123456789"; char libname[] = "cublas64_??.dll"; + #ifdef DEBUG + fprintf(stderr, "Loading cuBLAS %d.%d.\n", major, minor); + #endif libname[9] = DIGITS[major]; libname[10] = DIGITS[minor]; From 63d262c1fc68be01092e3dd78874dfe45e922ba3 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Fri, 7 Jul 2017 12:06:38 -0400 Subject: [PATCH 391/597] Fix Windows CI test error. --- src/loaders/libcublas.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/loaders/libcublas.c b/src/loaders/libcublas.c index 95719bb6d2..ab4918e4ea 100644 --- a/src/loaders/libcublas.c +++ b/src/loaders/libcublas.c @@ -1,4 +1,8 @@ #include +#ifdef DEBUG +/* For fprintf and stderr. */ +#include +#endif #include "libcublas.h" #include "dyn_load.h" From b78169e5528a9c46a9b8dd632c3bc9d3bfc88dce Mon Sep 17 00:00:00 2001 From: notoraptor Date: Mon, 10 Jul 2017 10:17:10 -0400 Subject: [PATCH 392/597] Remove support for GPUARRAY_CUDA_VERSION. Try to load detected CUDA by default, else look for supported versions. Add support for >9 versions (e.g. CUDA 10.1). 
--- src/gpuarray_buffer_cuda.c | 58 +++++++++----------------------------- src/loaders/libcublas.c | 14 ++++----- src/loaders/libnvrtc.c | 21 ++++++++------ 3 files changed, 32 insertions(+), 61 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index ba169eefb7..3fad624a62 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -132,7 +132,6 @@ static int setup_lib(void) { const char *ver; CUresult err; int res, tmp; - int search_version = 0; if (!setup_done) { res = load_libcuda(global_err); @@ -141,55 +140,24 @@ static int setup_lib(void) { err = cuInit(0); if (err != CUDA_SUCCESS) return error_cuda(global_err, "cuInit", err); - ver = getenv("GPUARRAY_CUDA_VERSION"); - if (ver == NULL || strlen(ver) != 2) { - err = cuDriverGetVersion(&tmp); - if (err != CUDA_SUCCESS) - return error_set(global_err, GA_IMPL_ERROR, "cuDriverGetVersion failed"); - major = tmp / 1000; - minor = (tmp / 10) % 10; - #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) || defined(__APPLE__) - /* We will dynamically search the right CUDA version only on Windows and Macintosh systems, - and only if user has not explicitely specified GPUARRAY_CUDA_VERSION. */ - search_version = 1; - #endif - } else { - major = ver[0] - '0'; - minor = ver[1] - '0'; - } - /* NB: next line will cause problems if a CUDA 10.0 (or 9.11) is released in the future. */ - if (major > 9 || major < 0 || minor > 9 || minor < 0) - return error_fmt(global_err, GA_VALUE_ERROR, "Invalid cuda version: %d.%d", major, minor); - if (!search_version) { - res = load_libnvrtc(major, minor, global_err); - } else { - /* First case in next array is reserved to eventually receive the version returned by cuDriverGetVersion(). 
*/ - int versions[] = {-1, 80, 75}; - int versions_length = sizeof(versions) / sizeof(int); - int current_version = major * 10 + minor; + err = cuDriverGetVersion(&tmp); + if (err != CUDA_SUCCESS) + return error_set(global_err, GA_IMPL_ERROR, "cuDriverGetVersion failed"); + major = tmp / 1000; + minor = (tmp / 10) % 10; + /* Let's try to load a nvrtc corresponding to detected CUDA version. */ + res = load_libnvrtc(major, minor, global_err); + if (res != GA_NO_ERROR) { + /* Else, let's try to find a nvrtc corresponding to supported CUDA versions. */ + int versions[][2] = {{8, 0}, {7, 5}, {7, 0}}; + int versions_length = sizeof(versions) / (2 * sizeof(int)); int i = 0; - for (i = 1; i < versions_length && versions[i] != current_version; ++i); - if (i == versions_length) { - /* Current version not found in the list of versions. We add it at top of the list. */ - versions[0] = current_version; - /* We will iterate on versions from the first. */ - i = 0; - } else { - /* Current version found in the list of known versions. No need to add it to the list. 
*/ - i = 1; - }; do { - major = versions[i] / 10; - minor = versions[i] % 10; + major = versions[i][0]; + minor = versions[i][1]; res = load_libnvrtc(major, minor, global_err); ++i; } while(res != GA_NO_ERROR && i < versions_length); - #ifdef DEBUG - if (res == GA_NO_ERROR) - fprintf(stderr, "Detected CUDA %d.%d.\n", major, minor); - else - fprintf(stderr, "Unable to detect a CUDA version.\n"); - #endif } if (res != GA_NO_ERROR) return res; diff --git a/src/loaders/libcublas.c b/src/loaders/libcublas.c index ab4918e4ea..f0280e3787 100644 --- a/src/loaders/libcublas.c +++ b/src/loaders/libcublas.c @@ -45,24 +45,22 @@ int load_libcublas(int major, int minor, error *e) { #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) { - static const char DIGITS[] = "0123456789"; - char libname[] = "cublas64_??.dll"; + const char* libname_pattern = "cublas64_%d%d.dll"; + char libname[64]; #ifdef DEBUG fprintf(stderr, "Loading cuBLAS %d.%d.\n", major, minor); #endif - libname[9] = DIGITS[major]; - libname[10] = DIGITS[minor]; + sprintf(libname, libname_pattern, major, minor); lib = ga_load_library(libname, e); } #else /* Unix */ #ifdef __APPLE__ { - static const char DIGITS[] = "0123456789"; - char libname[] = "/Developer/NVIDIA/CUDA-?.?/lib/libcublas.dylib"; - libname[23] = DIGITS[major]; - libname[25] = DIGITS[minor]; + const char* libname_pattern = "/Developer/NVIDIA/CUDA-%d.%d/lib/libcublas.dylib"; + char libname[128]; + sprintf(libname, libname_pattern, major, minor); lib = ga_load_library(libname, e); } #else diff --git a/src/loaders/libnvrtc.c b/src/loaders/libnvrtc.c index fa5cfb2434..d413b28c70 100644 --- a/src/loaders/libnvrtc.c +++ b/src/loaders/libnvrtc.c @@ -1,4 +1,8 @@ #include +#ifdef DEBUG +/* For fprintf and stderr. 
*/ +#include +#endif #include "libcuda.h" #include "libnvrtc.h" @@ -27,22 +31,23 @@ int load_libnvrtc(int major, int minor, error *e) { #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) { - static const char DIGITS[] = "0123456789"; - char libname[] = "nvrtc64_??.dll"; + const char* libname_pattern = "nvrtc64_%d%d.dll"; + char libname[64]; - libname[8] = DIGITS[major]; - libname[9] = DIGITS[minor]; + #ifdef DEBUG + fprintf(stderr, "Loading nvrtc %d.%d.\n", major, minor); + #endif + sprintf(libname, libname_pattern, major, minor); lib = ga_load_library(libname, e); } #else /* Unix */ #ifdef __APPLE__ { - static const char DIGITS[] = "0123456789"; /* Try the usual fullpath first */ - char libname[] = "/Developer/NVIDIA/CUDA-?.?/lib/libnvrtc.dylib"; - libname[23] = DIGITS[major]; - libname[25] = DIGITS[minor]; + const char* libname_pattern = "/Developer/NVIDIA/CUDA-%d.%d/lib/libnvrtc.dylib"; + char libname[128]; + sprintf(libname, libname_pattern, major, minor); lib = ga_load_library(libname, e); } #else From 99bb1078a5fb773058ecfc369da55935026a6f14 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Mon, 10 Jul 2017 13:53:44 -0400 Subject: [PATCH 393/597] Use sNprintf to build libnames and check errors. --- src/gpuarray_buffer_cuda.c | 2 +- src/loaders/libcublas.c | 24 +++++++++++++----------- src/loaders/libnvrtc.c | 23 +++++++++++++---------- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 3fad624a62..bfecf94ff1 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -150,7 +150,7 @@ static int setup_lib(void) { if (res != GA_NO_ERROR) { /* Else, let's try to find a nvrtc corresponding to supported CUDA versions. 
*/ int versions[][2] = {{8, 0}, {7, 5}, {7, 0}}; - int versions_length = sizeof(versions) / (2 * sizeof(int)); + int versions_length = sizeof(versions) / sizeof(versions[0]); int i = 0; do { major = versions[i][0]; diff --git a/src/loaders/libcublas.c b/src/loaders/libcublas.c index f0280e3787..bd04cc3197 100644 --- a/src/loaders/libcublas.c +++ b/src/loaders/libcublas.c @@ -1,8 +1,5 @@ -#include -#ifdef DEBUG -/* For fprintf and stderr. */ -#include -#endif +/* To be able to use snprintf with any compiler including MSVC2008. */ +#include #include "libcublas.h" #include "dyn_load.h" @@ -45,22 +42,27 @@ int load_libcublas(int major, int minor, error *e) { #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) { - const char* libname_pattern = "cublas64_%d%d.dll"; char libname[64]; - + int n; #ifdef DEBUG fprintf(stderr, "Loading cuBLAS %d.%d.\n", major, minor); #endif - sprintf(libname, libname_pattern, major, minor); - + n = snprintf(libname, 64, "cublas64_%d%d.dll", major, minor); + if (n < 0 || n >= 64) + return error_set(e, GA_SYS_ERROR, "cublas library name too long."); lib = ga_load_library(libname, e); } #else /* Unix */ #ifdef __APPLE__ { - const char* libname_pattern = "/Developer/NVIDIA/CUDA-%d.%d/lib/libcublas.dylib"; char libname[128]; - sprintf(libname, libname_pattern, major, minor); + int n; + #ifdef DEBUG + fprintf(stderr, "Loading cuBLAS %d.%d.\n", major, minor); + #endif + n = snprintf(libname, 128, "/Developer/NVIDIA/CUDA-%d.%d/lib/libcublas.dylib", major, minor); + if (n < 0 || n >= 128) + return error_set(e, GA_SYS_ERROR, "cublas library path too long."); lib = ga_load_library(libname, e); } #else diff --git a/src/loaders/libnvrtc.c b/src/loaders/libnvrtc.c index d413b28c70..9c7d101dff 100644 --- a/src/loaders/libnvrtc.c +++ b/src/loaders/libnvrtc.c @@ -1,8 +1,5 @@ -#include -#ifdef DEBUG -/* For fprintf and stderr. */ -#include -#endif +/* To be able to use snprintf with any compiler including MSVC2008. 
*/ +#include #include "libcuda.h" #include "libnvrtc.h" @@ -31,13 +28,14 @@ int load_libnvrtc(int major, int minor, error *e) { #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) { - const char* libname_pattern = "nvrtc64_%d%d.dll"; char libname[64]; - + int n; #ifdef DEBUG fprintf(stderr, "Loading nvrtc %d.%d.\n", major, minor); #endif - sprintf(libname, libname_pattern, major, minor); + n = snprintf(libname, 64, "nvrtc64_%d%d.dll", major, minor); + if (n < 0 || n >= 64) + return error_set(e, GA_SYS_ERROR, "nvrtc library name too long."); lib = ga_load_library(libname, e); } @@ -45,9 +43,14 @@ int load_libnvrtc(int major, int minor, error *e) { #ifdef __APPLE__ { /* Try the usual fullpath first */ - const char* libname_pattern = "/Developer/NVIDIA/CUDA-%d.%d/lib/libnvrtc.dylib"; char libname[128]; - sprintf(libname, libname_pattern, major, minor); + int n; + #ifdef DEBUG + fprintf(stderr, "Loading nvrtc %d.%d.\n", major, minor); + #endif + n = snprintf(libname, 128, "/Developer/NVIDIA/CUDA-%d.%d/lib/libnvrtc.dylib", major, minor); + if (n < 0 || n >= 128) + return error_set(e, GA_SYS_ERROR, "nvrtc library path too long."); lib = ga_load_library(libname, e); } #else From e97823a7bd5ba22bc653a5c21d4051444b388950 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 10 Jul 2017 14:25:14 -0400 Subject: [PATCH 394/597] Fix last style issues. 
--- src/gpuarray_buffer_cuda.c | 8 +++++--- src/loaders/libcublas.c | 12 ++++++------ src/loaders/libnvrtc.c | 13 ++++++------- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index bfecf94ff1..1dc60dd388 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -129,7 +129,6 @@ static int setup_done = 0; static int major = -1; static int minor = -1; static int setup_lib(void) { - const char *ver; CUresult err; int res, tmp; @@ -152,12 +151,15 @@ static int setup_lib(void) { int versions[][2] = {{8, 0}, {7, 5}, {7, 0}}; int versions_length = sizeof(versions) / sizeof(versions[0]); int i = 0; + /* Skip versions that are higher or equal to the driver version */ + while (versions[i][0] > major || + (versions[i][0] == major && versions[i][1] >= minor)) i++; do { major = versions[i][0]; minor = versions[i][1]; res = load_libnvrtc(major, minor, global_err); - ++i; - } while(res != GA_NO_ERROR && i < versions_length); + i++; + } while (res != GA_NO_ERROR && i < versions_length); } if (res != GA_NO_ERROR) return res; diff --git a/src/loaders/libcublas.c b/src/loaders/libcublas.c index bd04cc3197..b810f10bc2 100644 --- a/src/loaders/libcublas.c +++ b/src/loaders/libcublas.c @@ -47,9 +47,9 @@ int load_libcublas(int major, int minor, error *e) { #ifdef DEBUG fprintf(stderr, "Loading cuBLAS %d.%d.\n", major, minor); #endif - n = snprintf(libname, 64, "cublas64_%d%d.dll", major, minor); - if (n < 0 || n >= 64) - return error_set(e, GA_SYS_ERROR, "cublas library name too long."); + n = snprintf(libname, sizeof(libname), "cublas64_%d%d.dll", major, minor); + if (n < 0 || n >= sizeof(libname)) + return error_set(e, GA_SYS_ERROR, "snprintf"); lib = ga_load_library(libname, e); } #else /* Unix */ @@ -60,9 +60,9 @@ int load_libcublas(int major, int minor, error *e) { #ifdef DEBUG fprintf(stderr, "Loading cuBLAS %d.%d.\n", major, minor); #endif - n = snprintf(libname, 128, 
"/Developer/NVIDIA/CUDA-%d.%d/lib/libcublas.dylib", major, minor); - if (n < 0 || n >= 128) - return error_set(e, GA_SYS_ERROR, "cublas library path too long."); + n = snprintf(libname, sizeof(libname), "/Developer/NVIDIA/CUDA-%d.%d/lib/libcublas.dylib", major, minor); + if (n < 0 || n >= sizeof(libname)) + return error_set(e, GA_SYS_ERROR, "snprintf"); lib = ga_load_library(libname, e); } #else diff --git a/src/loaders/libnvrtc.c b/src/loaders/libnvrtc.c index 9c7d101dff..ef052b79c3 100644 --- a/src/loaders/libnvrtc.c +++ b/src/loaders/libnvrtc.c @@ -33,24 +33,23 @@ int load_libnvrtc(int major, int minor, error *e) { #ifdef DEBUG fprintf(stderr, "Loading nvrtc %d.%d.\n", major, minor); #endif - n = snprintf(libname, 64, "nvrtc64_%d%d.dll", major, minor); - if (n < 0 || n >= 64) - return error_set(e, GA_SYS_ERROR, "nvrtc library name too long."); + n = snprintf(libname, sizeof(libname), "nvrtc64_%d%d.dll", major, minor); + if (n < 0 || n >= sizeof(libname)) + return error_set(e, GA_SYS_ERROR, "snprintf"); lib = ga_load_library(libname, e); } #else /* Unix */ #ifdef __APPLE__ { - /* Try the usual fullpath first */ char libname[128]; int n; #ifdef DEBUG fprintf(stderr, "Loading nvrtc %d.%d.\n", major, minor); #endif - n = snprintf(libname, 128, "/Developer/NVIDIA/CUDA-%d.%d/lib/libnvrtc.dylib", major, minor); - if (n < 0 || n >= 128) - return error_set(e, GA_SYS_ERROR, "nvrtc library path too long."); + n = snprintf(libname, sizeof(libname), "/Developer/NVIDIA/CUDA-%d.%d/lib/libnvrtc.dylib", major, minor); + if (n < 0 || n >= sizeof(libname)) + return error_set(e, GA_SYS_ERROR, "snprintf"); lib = ga_load_library(libname, e); } #else From 0624de224cf065c1d42f8dcbae226da4cdf7ccc8 Mon Sep 17 00:00:00 2001 From: Deepali Chourasia Date: Tue, 18 Jul 2017 11:43:09 +0530 Subject: [PATCH 395/597] Add make uninstall for cmake. 
--- CMakeLists.txt | 9 +++++++++ cmake_uninstall.cmake.in | 21 +++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 cmake_uninstall.cmake.in diff --git a/CMakeLists.txt b/CMakeLists.txt index d6a96e7339..6de35048f7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,3 +28,12 @@ endif() add_subdirectory(src) add_subdirectory(tests) + +# uninstall target +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/cmake_uninstall.cmake.in" + "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" + IMMEDIATE @ONLY) + +add_custom_target(uninstall + COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) diff --git a/cmake_uninstall.cmake.in b/cmake_uninstall.cmake.in new file mode 100644 index 0000000000..2037e36539 --- /dev/null +++ b/cmake_uninstall.cmake.in @@ -0,0 +1,21 @@ +if(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") + message(FATAL_ERROR "Cannot find install manifest: @CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") +endif(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") + +file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files) +string(REGEX REPLACE "\n" ";" files "${files}") +foreach(file ${files}) + message(STATUS "Uninstalling $ENV{DESTDIR}${file}") + if(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") + exec_program( + "@CMAKE_COMMAND@" ARGS "-E remove \"$ENV{DESTDIR}${file}\"" + OUTPUT_VARIABLE rm_out + RETURN_VALUE rm_retval + ) + if(NOT "${rm_retval}" STREQUAL 0) + message(FATAL_ERROR "Problem when removing $ENV{DESTDIR}${file}") + endif(NOT "${rm_retval}" STREQUAL 0) + else(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") + message(STATUS "File $ENV{DESTDIR}${file} does not exist.") + endif(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") +endforeach(file) From e47d52634971101cc01d0d4a63b1edc097fc2757 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 18 Jul 2017 18:20:09 -0400 Subject: [PATCH 396/597] Rename 
kernel_key to disk_key. --- src/gpuarray_buffer_cuda.c | 42 +++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 1dc60dd388..273cb99866 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -47,7 +47,7 @@ static int cuda_records(gpudata *, int, CUstream); static int detect_arch(const char *prefix, char *ret, error *e); static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size); -typedef struct _kernel_key { +typedef struct _disk_key { uint8_t version; uint8_t debug; uint8_t major; @@ -55,13 +55,13 @@ typedef struct _kernel_key { uint32_t reserved; char bin_id[64]; strb src; -} kernel_key; +} disk_key; -/* Size of the kernel_key that we can memcopy to duplicate */ -#define KERNEL_KEY_MM (sizeof(kernel_key) - sizeof(strb)) +/* Size of the disk_key that we can memcopy to duplicate */ +#define DISK_KEY_MM (sizeof(disk_key) - sizeof(strb)) static void key_free(cache_key_t _k) { - kernel_key *k = (kernel_key *)_k; + disk_key *k = (disk_key *)_k; strb_clear(&k->src); free(k); } @@ -75,41 +75,41 @@ static uint32_t strb_hash(strb *k) { return XXH32(k->s, k->l, 42); } -static int key_eq(kernel_key *k1, kernel_key *k2) { - return (memcmp(k1, k2, KERNEL_KEY_MM) == 0 && +static int key_eq(disk_key *k1, disk_key *k2) { + return (memcmp(k1, k2, DISK_KEY_MM) == 0 && strb_eq(&k1->src, &k2->src)); } -static int key_hash(kernel_key *k) { +static int key_hash(disk_key *k) { XXH32_state_t state; XXH32_reset(&state, 42); - XXH32_update(&state, k, KERNEL_KEY_MM); + XXH32_update(&state, k, DISK_KEY_MM); XXH32_update(&state, k->src.s, k->src.l); return XXH32_digest(&state); } -static int key_write(strb *res, kernel_key *k) { - strb_appendn(res, (const char *)k, KERNEL_KEY_MM); +static int key_write(strb *res, disk_key *k) { + strb_appendn(res, (const char *)k, DISK_KEY_MM); strb_appendb(res, &k->src); return strb_error(res); } -static 
kernel_key *key_read(const strb *b) { - kernel_key *k; - if (b->l < KERNEL_KEY_MM) return NULL; +static disk_key *key_read(const strb *b) { + disk_key *k; + if (b->l < DISK_KEY_MM) return NULL; k = calloc(1, sizeof(*k)); if (k == NULL) return NULL; - memcpy(k, b->s, KERNEL_KEY_MM); + memcpy(k, b->s, DISK_KEY_MM); if (k->version != 0) { free(k); return NULL; } - if (strb_ensure(&k->src, b->l - KERNEL_KEY_MM) != 0) { + if (strb_ensure(&k->src, b->l - DISK_KEY_MM) != 0) { strb_clear(&k->src); free(k); return NULL; } - strb_appendn(&k->src, b->s + KERNEL_KEY_MM, b->l - KERNEL_KEY_MM); + strb_appendn(&k->src, b->s + DISK_KEY_MM, b->l - DISK_KEY_MM); return k; } @@ -1190,8 +1190,8 @@ static int make_bin(cuda_context *ctx, const strb *ptx, strb *bin, strb *log) { static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { strb ptx = STRB_STATIC_INIT; strb *cbin; - kernel_key k; - kernel_key *pk; + disk_key k; + disk_key *pk; memset(&k, 0, sizeof(k)); k.version = 0; @@ -1217,14 +1217,14 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { GA_CHECK(make_bin(ctx, &ptx, bin, log)); if (ctx->disk_cache) { - pk = calloc(sizeof(kernel_key), 1); + pk = calloc(sizeof(disk_key), 1); if (pk == NULL) { error_sys(ctx->err, "calloc"); fprintf(stderr, "Error adding kernel to disk cache: %s\n", ctx->err->msg); return GA_NO_ERROR; } - memcpy(pk, &k, KERNEL_KEY_MM); + memcpy(pk, &k, DISK_KEY_MM); strb_appendb(&pk->src, src); if (strb_error(&pk->src)) { error_sys(ctx->err, "strb_appendb"); From fee0445ab20d9ac9d287518d9aadb8a6d6cced88 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 18 Jul 2017 18:39:03 -0400 Subject: [PATCH 397/597] Fix the in-memory kernel cache to not ignore function names. 
--- src/gpuarray_buffer_cuda.c | 86 +++++++++++++++++++++++++------------- 1 file changed, 58 insertions(+), 28 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 273cb99866..071ec9700d 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -57,10 +57,15 @@ typedef struct _disk_key { strb src; } disk_key; +typedef struct _kernel_key { + const char *fname; + strb src; +} kernel_key; + /* Size of the disk_key that we can memcopy to duplicate */ #define DISK_KEY_MM (sizeof(disk_key) - sizeof(strb)) -static void key_free(cache_key_t _k) { +static void disk_free(cache_key_t _k) { disk_key *k = (disk_key *)_k; strb_clear(&k->src); free(k); @@ -71,16 +76,31 @@ static int strb_eq(strb *k1, strb *k2) { memcmp(k1->s, k2->s, k1->l) == 0); } -static uint32_t strb_hash(strb *k) { - return XXH32(k->s, k->l, 42); +static int kernel_eq(kernel_key *k1, kernel_key *k2) { + return (strcmp(k1->fname, k2->fname) == 0 && + strb_eq(&k1->src, &k2->src)); +} + +static uint32_t kernel_hash(kernel_key *k) { + XXH32_state_t state; + XXH32_reset(&state, 42); + XXH32_update(&state, k->fname, strlen(k->fname)); + XXH32_update(&state, k->src.s, k->src.l); + return XXH32_digest(&state); } -static int key_eq(disk_key *k1, disk_key *k2) { +static void kernel_free(kernel_key *k) { + free((void *)k->fname); + strb_clear(&k->src); + free(k); +} + +static int disk_eq(disk_key *k1, disk_key *k2) { return (memcmp(k1, k2, DISK_KEY_MM) == 0 && strb_eq(&k1->src, &k2->src)); } -static int key_hash(disk_key *k) { +static int disk_hash(disk_key *k) { XXH32_state_t state; XXH32_reset(&state, 42); XXH32_update(&state, k, DISK_KEY_MM); @@ -88,13 +108,13 @@ static int key_hash(disk_key *k) { return XXH32_digest(&state); } -static int key_write(strb *res, disk_key *k) { +static int disk_write(strb *res, disk_key *k) { strb_appendn(res, (const char *)k, DISK_KEY_MM); strb_appendb(res, &k->src); return strb_error(res); } -static disk_key *key_read(const strb 
*b) { +static disk_key *disk_read(const strb *b) { disk_key *k; if (b->l < DISK_KEY_MM) return NULL; k = calloc(1, sizeof(*k)); @@ -238,9 +258,9 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { } res->kernel_cache = cache_twoq(64, 128, 64, 8, - (cache_eq_fn)strb_eq, - (cache_hash_fn)strb_hash, - (cache_freek_fn)strb_free, + (cache_eq_fn)kernel_eq, + (cache_hash_fn)kernel_hash, + (cache_freek_fn)kernel_free, (cache_freev_fn)cuda_freekernel, global_err); if (res->kernel_cache == NULL) { error_cuda(global_err, "cuStreamCreate", err); @@ -250,9 +270,9 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { cache_path = getenv("GPUARRAY_CACHE_PATH"); if (cache_path != NULL) { mem_cache = cache_lru(64, 8, - (cache_eq_fn)key_eq, - (cache_hash_fn)key_hash, - (cache_freek_fn)key_free, + (cache_eq_fn)disk_eq, + (cache_hash_fn)disk_hash, + (cache_freek_fn)disk_free, (cache_freev_fn)strb_free, global_err); if (mem_cache == NULL) { @@ -261,11 +281,11 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { goto fail_disk_cache; } res->disk_cache = cache_disk(cache_path, mem_cache, - (kwrite_fn)key_write, + (kwrite_fn)disk_write, (vwrite_fn)kernel_write, - (kread_fn)key_read, + (kread_fn)disk_read, (vread_fn)kernel_read, - res->err); + global_err); if (res->disk_cache == NULL) { fprintf(stderr, "Error initializing disk cache, disabling: %s\n", global_err->msg); @@ -1230,7 +1250,7 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { error_sys(ctx->err, "strb_appendb"); fprintf(stderr, "Error adding kernel to disk cache %s\n", ctx->err->msg); - key_free((cache_key_t)pk); + disk_free((cache_key_t)pk); return GA_NO_ERROR; } cbin = strb_alloc(bin->l); @@ -1238,7 +1258,7 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { error_sys(ctx->err, "strb_alloc"); fprintf(stderr, "Error adding kernel to disk cache: %s\n", ctx->err->msg); - key_free((cache_key_t)pk); + disk_free((cache_key_t)pk); return GA_NO_ERROR; } 
strb_appendb(cbin, bin); @@ -1246,7 +1266,7 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { error_sys(ctx->err, "strb_appendb"); fprintf(stderr, "Error adding kernel to disk cache %s\n", ctx->err->msg); - key_free((cache_key_t)pk); + disk_free((cache_key_t)pk); strb_free(cbin); return GA_NO_ERROR; } @@ -1284,8 +1304,9 @@ static int cuda_newkernel(gpukernel **k, gpucontext *c, unsigned int count, strb src = STRB_STATIC_INIT; strb bin = STRB_STATIC_INIT; strb log = STRB_STATIC_INIT; - strb *psrc; gpukernel *res; + kernel_key k_key; + kernel_key *p_key; CUdevice dev; CUresult err; unsigned int i; @@ -1350,7 +1371,10 @@ static int cuda_newkernel(gpukernel **k, gpucontext *c, unsigned int count, return error_sys(ctx->err, "strb"); } - res = (gpukernel *)cache_get(ctx->kernel_cache, &src); + k_key.fname = fname; + k_key.src = src; + + res = (gpukernel *)cache_get(ctx->kernel_cache, &k_key); if (res != NULL) { res->refcnt++; strb_clear(&src); @@ -1434,13 +1458,19 @@ static int cuda_newkernel(gpukernel **k, gpucontext *c, unsigned int count, ctx->refcnt++; cuda_exit(ctx); TAG_KER(res); - psrc = memdup(&src, sizeof(strb)); - if (psrc != NULL) { - /* One of the refs is for the cache */ - res->refcnt++; - /* If this fails, it will free the key and remove a ref from the - kernel. */ - cache_add(ctx->kernel_cache, psrc, res); + p_key = memdup(&k_key, sizeof(kernel_key)); + if (p_key != NULL) { + p_key->fname = strdup(fname); + if (p_key->fname != NULL) { + /* One of the refs is for the cache */ + res->refcnt++; + /* If this fails, it will free the key and remove a ref from the + kernel. */ + cache_add(ctx->kernel_cache, p_key, res); + } else { + free(p_key); + strb_clear(&src); + } } else { strb_clear(&src); } From bbe1916a87f2d8e4322b60decf4114e46af1797f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 19 Jul 2017 10:03:43 -0400 Subject: [PATCH 398/597] Fix possible segfault by returning NULL on error.
--- src/gpuarray_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index 3f143d1da9..42531de380 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -162,7 +162,7 @@ gpukernel *gpukernel_init(gpucontext *ctx, unsigned int count, const char *fname, unsigned int numargs, const int *typecodes, int flags, int *ret, char **err_str) { - gpukernel *res; + gpukernel *res = NULL; int err; err = ctx->ops->kernel_alloc(&res, ctx, count, strings, lengths, fname, numargs, typecodes, flags, err_str); From d159801b8187e23c8272bf7c7e7bc4ba434cfda3 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 19 Jul 2017 10:04:20 -0400 Subject: [PATCH 399/597] Fix warnings by over-eager compiler. --- src/gpuarray_elemwise.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index dc1d837d3b..8921141cff 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -740,12 +740,12 @@ void GpuElemwise_free(GpuElemwise *ge) { } int GpuElemwise_call(GpuElemwise *ge, void **args, int flags) { - size_t n; - size_t *dims; - ssize_t **strides; - unsigned int nd; - int contig; - int call32; + size_t n = 0; + size_t *dims = NULL; + ssize_t **strides = NULL; + unsigned int nd = 0; + int contig = 0; + int call32 = 0; int err; err = check_contig(ge, args, &n, &contig); From 8d11429ccb21a68dc6d52219b71f939de4b5264b Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Wed, 19 Jul 2017 12:19:49 -0400 Subject: [PATCH 400/597] Fix #481. Enable the __int128 code path only on machines where it's defined. 
--- src/util/integerfactoring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/integerfactoring.c b/src/util/integerfactoring.c index cb785bb080..c398cdc211 100644 --- a/src/util/integerfactoring.c +++ b/src/util/integerfactoring.c @@ -266,7 +266,7 @@ static uint64_t gaIMulMod (uint64_t a, uint64_t b, uint64_t m){ ); return r; -#elif (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#elif ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) && defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ >= 16 /* Hardcore GCC 4.6+ optimization jazz */ return ((unsigned __int128)a * (unsigned __int128)b) % m; #else From 2ee706477384874cd48c0befc6625c2b33c71d81 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Wed, 19 Jul 2017 12:21:49 -0400 Subject: [PATCH 401/597] Muzzle -Wsign-compare warning. --- src/gpuarray_buffer_opencl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 5d38aa90f5..edcba5eee7 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -757,7 +757,7 @@ static int cl_memset(gpudata *dst, size_t offset, int data) { offset, n, val); } /* If this assert fires, increase the size of local_kern above. */ - assert(r <= sizeof(local_kern)); + assert(r <= (int)sizeof(local_kern)); _unused(r); sz = strlen(local_kern); From 430dc93a0f882ecd7ed30f3d523334e3406f29c5 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Wed, 19 Jul 2017 12:29:45 -0400 Subject: [PATCH 402/597] Muzzle -Wshift-count-overflow warning. This involves the s |= s >> 32 business when size_t is 32 bits. Change roundup2() to use unsigned long long unconditionally. We can't use uint64_t because MSVC doesn't have it, but unsigned long long is 64 bits on all platforms we care about. 
--- src/cache/lru.c | 5 ++--- src/cache/twoq.c | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/cache/lru.c b/src/cache/lru.c index cf1b9555db..a17cf5cf6c 100644 --- a/src/cache/lru.c +++ b/src/cache/lru.c @@ -114,15 +114,14 @@ struct _hash { size_t size; }; -static inline size_t roundup2(size_t s) { +static inline unsigned long long roundup2(unsigned long long s) { s--; s |= s >> 1; s |= s >> 2; s |= s >> 4; s |= s >> 8; s |= s >> 16; - if (sizeof(size_t) >= 8) - s |= s >> 32; + s |= s >> 32; s++; return s; } diff --git a/src/cache/twoq.c b/src/cache/twoq.c index 4f14e557b7..ea33be0b63 100644 --- a/src/cache/twoq.c +++ b/src/cache/twoq.c @@ -124,15 +124,14 @@ struct _hash { size_t size; }; -static inline size_t roundup2(size_t s) { +static inline unsigned long long roundup2(unsigned long long s) { s--; s |= s >> 1; s |= s >> 2; s |= s >> 4; s |= s >> 8; s |= s >> 16; - if (sizeof(size_t) >= 8) - s |= s >> 32; + s |= s >> 32; s++; return s; } From 97c40a2a8b5a051ab195f24d11b3df075f5ffb5f Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Wed, 19 Jul 2017 12:43:39 -0400 Subject: [PATCH 403/597] Muzzle -O2 -Wmaybe-uninitialized. This results from the inlining of call_basic() into GpuElemwise_call(), and the compiler's inability to verify that 1) The various "uninitialized" values will be used only if err == GA_NO_ERROR 2) The above-mentioned values are indeed initialized when err == GA_NO_ERROR. 
--- src/gpuarray_elemwise.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index dc1d837d3b..8921141cff 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -740,12 +740,12 @@ void GpuElemwise_free(GpuElemwise *ge) { } int GpuElemwise_call(GpuElemwise *ge, void **args, int flags) { - size_t n; - size_t *dims; - ssize_t **strides; - unsigned int nd; - int contig; - int call32; + size_t n = 0; + size_t *dims = NULL; + ssize_t **strides = NULL; + unsigned int nd = 0; + int contig = 0; + int call32 = 0; int err; err = check_contig(ge, args, &n, &contig); From e2a43c5f8f583a9d8b64400099d305b25b7347c2 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 20 Jul 2017 16:48:58 -0400 Subject: [PATCH 404/597] Avoid a segfault if the copy errors out. --- src/gpuarray_array.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index fccb6c569d..4b4a76e9a8 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -643,7 +643,8 @@ int GpuArray_reshape(GpuArray *res, const GpuArray *a, unsigned int nd, err = GpuArray_reshape_inplace(res, nd, newdims, ord); if (err == GA_COPY_ERROR && !nocopy) { GpuArray_clear(res); - GpuArray_copy(res, a, ord); + err = GpuArray_copy(res, a, ord); + if (err != GA_NO_ERROR) return err; err = GpuArray_reshape_inplace(res, nd, newdims, ord); } if (err != GA_NO_ERROR) GpuArray_clear(res); From 04234d23bde95bb9865b391f49119184bc803037 Mon Sep 17 00:00:00 2001 From: Simon Lefrancois Date: Thu, 20 Jul 2017 11:58:11 -0400 Subject: [PATCH 405/597] add pr mac --- .jenkins_pr_mac.sh | 55 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100755 .jenkins_pr_mac.sh diff --git a/.jenkins_pr_mac.sh b/.jenkins_pr_mac.sh new file mode 100755 index 0000000000..0c91673696 --- /dev/null +++ b/.jenkins_pr_mac.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# Script for
Jenkins continuous integration testing of libgpuarray on mac + +# Print commands as they are executed +set -x + +# Set path for conda and cmake +export PATH="/Users/jenkins/miniconda2/bin:/usr/local/bin:$PATH" + +# CUDA +export PATH=/usr/local/cuda/bin:${PATH} +export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:${DYLD_LIBRARY_PATH} +export CPLUS_INCLUDE_PATH=/usr/local/cuda/include:${CPLUS_INCLUDE_PATH} + +# Can also set to "Debug", "Release" to go faster +: ${GPUARRAY_CONFIG:="Release"} +# Set these to " " to disable (empty doesn't work) +: ${DEVICES_CUDA:="cuda"} # for multiple devices use "cuda0 cuda1" +: ${DEVICES_OPENCL:=" "} + +git rev-parse HEAD + +# Build libgpuarray and run C tests +rm -rf build +mkdir build +(cd build && cmake .. -DCMAKE_BUILD_TYPE=${GPUARRAY_CONFIG} && make) + +# Test on different devices +for dev in ${DEVICES_CUDA}; do + echo "Testing libgpuarray for DEVICE=${dev}" + (cd build && DEVICE=${dev} make test) +done +for dev in ${DEVICES_OPENCL}; do + echo "Testing libgpuarray for DEVICE=${dev}" + (cd build && DEVICE=${dev} make test) +done + +export PYTHONPATH=`pwd`/lib/python:$PYTHONPATH +export DYLD_LIBRARY_PATH=`pwd`/lib:${DYLD_LIBRARY_PATH} +export CPLUS_INCLUDE_PATH=`pwd`/src:${CPLUS_INCLUDE_PATH} + +# Build the pygpu modules +python setup.py build_ext --inplace -L`pwd`/lib -I`pwd`/src + +# Test it +test=pygpu_pr_mac +for dev in ${DEVICES_CUDA}; do + echo "Testing pygpu for DEVICE=${dev}" + DEVICE=${dev} time nosetests --with-xunit --xunit-file=${test}${dev}tests.xml pygpu/tests +done +for dev in ${DEVICES_OPENCL}; do + echo "Testing pygpu for DEVICE=${dev}" + DEVICE=${dev} time nosetests --with-xunit --xunit-file=${test}${dev}tests.xml pygpu/tests -e test_blas.py +done From f3253d62394642c3cc99815c065b6dc7483f67b7 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sun, 23 Jul 2017 01:10:16 -0400 Subject: [PATCH 406/597] Muzzle -Wdeclaration-after-statement in tests/main.c. 
--- tests/main.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/main.c b/tests/main.c index eba94b4f13..1012373521 100644 --- a/tests/main.c +++ b/tests/main.c @@ -15,6 +15,10 @@ extern Suite *get_suite(void); int main(int argc, char *argv[]) { + int number_failed; + Suite *s; + SRunner *sr; + #ifdef TEST_COLLECTIVES MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &comm_ndev); @@ -29,9 +33,8 @@ int main(int argc, char *argv[]) dev_name = argv[comm_rank + 1]; // Set a gpu for this process. #endif // TEST_COLLECTIVES - int number_failed; - Suite *s = get_suite(); - SRunner *sr = srunner_create(s); + s = get_suite(); + sr = srunner_create(s); #ifdef TEST_COLLECTIVES // Check by default forks to another (non mpi registered) process in order to // run tests. Using MPI inside tests means we must disable this. From 3a0d3bcf7a702e666f4cc894cdd0000bca81ecee Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sun, 23 Jul 2017 01:13:58 -0400 Subject: [PATCH 407/597] Muzzle -Wdeclaration-after-statement in tests/communicator.c --- tests/communicator.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/communicator.c b/tests/communicator.c index d70aac1cd9..696e04fc86 100644 --- a/tests/communicator.c +++ b/tests/communicator.c @@ -21,11 +21,13 @@ extern void teardown(void); */ void setup_comm(void) { - setup(); int err; + gpucommCliqueId comm_id; + + setup(); MPI_Barrier(MPI_COMM_WORLD); - gpucommCliqueId comm_id; + err = gpucomm_gen_clique_id(ctx, &comm_id); // Has successfully got a unique comm id. 
ck_assert_int_eq(err, GA_NO_ERROR); From 68c1ecd93b0274d7f71e3bf2f451f8b2f1534459 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sun, 23 Jul 2017 01:25:00 -0400 Subject: [PATCH 408/597] Muzzle -Wdeclaration-after-statement in check*_collectives.c --- tests/check_buffer_collectives.c | 122 +++++++++++++++++++------------ tests/check_collectives.c | 38 +++++----- 2 files changed, 95 insertions(+), 65 deletions(-) diff --git a/tests/check_buffer_collectives.c b/tests/check_buffer_collectives.c index 76f4e1bdb3..10c1f41a7c 100644 --- a/tests/check_buffer_collectives.c +++ b/tests/check_buffer_collectives.c @@ -28,9 +28,9 @@ extern void teardown_comm(void); #define ABS_DIFF(a, b) fabs((double)(b - a)) #define MAX_ABS_DIFF(A, B, N, res) \ do { \ - res = 0; \ double locdelta; \ int loci; \ + res = 0; \ for (loci = 0; loci < N; ++loci) { \ locdelta = ABS_DIFF(A[loci], B[loci]); \ if (locdelta > res) \ @@ -42,8 +42,8 @@ typedef unsigned long ulong; #define PRINTV(ar, N, t) \ do { \ - printf("%s\n", STR(ar)); \ int li; \ + printf("%s\n", STR(ar)); \ for (li = 0; li < (N); ++li) { \ printf(STR(t) " ", ar[li]); \ } \ @@ -81,18 +81,21 @@ END_TEST #define INIT_ARRAYS(insize, outsize) \ int err; \ - void* Av = calloc((insize), sizeof(char)); \ + void* Av, * RESv, * EXPv; \ + gpudata* Adev, *RESdev; \ + \ + Av = calloc((insize), sizeof(char)); \ if (Av == NULL) \ ck_abort_msg("system memory allocation failed"); \ - void* RESv = calloc((outsize), sizeof(char)); \ + RESv = calloc((outsize), sizeof(char)); \ if (RESv == NULL) \ ck_abort_msg("system memory allocation failed"); \ - void* EXPv = calloc((outsize), sizeof(char)); \ + EXPv = calloc((outsize), sizeof(char)); \ if (EXPv == NULL) \ ck_abort_msg("system memory allocation failed"); \ - gpudata* Adev = gpudata_alloc(ctx, (insize), NULL, 0, &err); \ + Adev = gpudata_alloc(ctx, (insize), NULL, 0, &err); \ ck_assert_ptr_ne(Adev, NULL); \ - gpudata* RESdev = gpudata_alloc(ctx, (outsize), NULL, 0, &err); \ + RESdev = 
gpudata_alloc(ctx, (outsize), NULL, 0, &err); \ ck_assert_ptr_ne(RESdev, NULL); #define DESTROY_ARRAYS() \ @@ -104,13 +107,15 @@ END_TEST #define TEST_REDUCE(systype, gatype, mpitype, coloptype, epsilon, print) \ START_TEST(test_gpucomm_reduce_##gatype##_##coloptype) { \ + systype* A, * RES, * EXP; \ + int i, count; \ INIT_ARRAYS(SIZE, SIZE) \ \ - systype* A = (systype*)Av; \ - systype* RES = (systype*)RESv; \ - systype* EXP = (systype*)EXPv; \ + A = (systype*)Av; \ + RES = (systype*)RESv; \ + EXP = (systype*)EXPv; \ \ - int i, count = SIZE / sizeof(systype); \ + count = SIZE / sizeof(systype); \ for (i = 0; i < count; ++i) \ A[i] = comm_rank + 2; \ err = gpudata_write(Adev, 0, A, SIZE); \ @@ -128,9 +133,9 @@ END_TEST "openmpi error: cannot produced expected"); \ \ if (comm_rank == ROOT_RANK) { \ + systype res; \ err = gpudata_read(RES, RESdev, 0, SIZE); \ ck_assert_int_eq(err, GA_NO_ERROR); \ - systype res; \ MAX_ABS_DIFF(RES, EXP, count, res); \ if (!(res <= epsilon)) { \ print(RES, count); \ @@ -197,13 +202,16 @@ TEST_REDUCE_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, GA_SUM, 0, #define TEST_ALL_REDUCE(systype, gatype, mpitype, coloptype, epsilon, print) \ START_TEST(test_gpucomm_all_reduce_##gatype##_##coloptype) { \ + systype* A, * RES, * EXP; \ + systype res; \ + int i, count; \ INIT_ARRAYS(SIZE, SIZE) \ \ - systype* A = (systype*)Av; \ - systype* RES = (systype*)RESv; \ - systype* EXP = (systype*)EXPv; \ + A = (systype*)Av; \ + RES = (systype*)RESv; \ + EXP = (systype*)EXPv; \ \ - int i, count = SIZE / sizeof(systype); \ + count = SIZE / sizeof(systype); \ for (i = 0; i < count; ++i) \ A[i] = comm_rank + 2; \ err = gpudata_write(Adev, 0, A, SIZE); \ @@ -222,7 +230,6 @@ TEST_REDUCE_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, GA_SUM, 0, \ err = gpudata_read(RES, RESdev, 0, SIZE); \ ck_assert_int_eq(err, GA_NO_ERROR); \ - systype res; \ MAX_ABS_DIFF(RES, EXP, count, res); \ if (!(res <= epsilon)) { \ print(RES, count); \ @@ -294,26 +301,31 @@ 
TEST_ALL_REDUCE_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, GA_SUM, 0, 0, #define TEST_REDUCE_SCATTER(systype, gatype, mpitype, coloptype, epsilon, \ print) \ START_TEST(test_gpucomm_reduce_scatter_##gatype##_##coloptype) { \ + systype* A, * RES, * EXP; \ + systype res; \ + int i, count; \ + int recvcount; \ + int* recvcounts; \ INIT_ARRAYS(SIZE, SIZE / comm_ndev) \ \ - systype* A = (systype*)Av; \ - systype* RES = (systype*)RESv; \ - systype* EXP = (systype*)EXPv; \ + A = (systype*)Av; \ + RES = (systype*)RESv; \ + EXP = (systype*)EXPv; \ \ - int i, count = SIZE / sizeof(systype); \ + count = SIZE / sizeof(systype); \ for (i = 0; i < count; ++i) \ A[i] = comm_rank + 2; \ err = gpudata_write(Adev, 0, A, SIZE); \ ck_assert_int_eq(err, GA_NO_ERROR); \ \ - int recvcount = count / comm_ndev; \ + recvcount = count / comm_ndev; \ err = gpucomm_reduce_scatter(Adev, 0, RESdev, 0, recvcount, GA_##gatype, \ GA_##coloptype, comm); \ ck_assert_int_eq(err, GA_NO_ERROR); \ gpudata_sync(RESdev); \ gpudata_sync(Adev); \ \ - int* recvcounts = (int*)malloc(comm_ndev * sizeof(int)); \ + recvcounts = (int*)malloc(comm_ndev * sizeof(int)); \ if (recvcounts == NULL) \ ck_abort_msg("system memory allocation failed"); \ for (i = 0; i < comm_ndev; ++i) \ @@ -326,7 +338,6 @@ TEST_ALL_REDUCE_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, GA_SUM, 0, 0, \ err = gpudata_read(RES, RESdev, 0, SIZE / comm_ndev); \ ck_assert_int_eq(err, GA_NO_ERROR); \ - systype res; \ MAX_ABS_DIFF(RES, EXP, recvcount, res); \ if (!(res <= epsilon)) { \ print(RES, recvcount); \ @@ -396,12 +407,15 @@ TEST_REDUCE_SCATTER_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, GA_SUM, 0, 0, #define TEST_BROADCAST(systype, gatype, mpitype, epsilon, print) \ START_TEST(test_gpucomm_broadcast_##gatype) { \ + systype* RES, * EXP; \ + systype res; \ + int i, count; \ INIT_ARRAYS(SIZE, SIZE) \ \ - systype* RES = (systype*)RESv; \ - systype* EXP = (systype*)EXPv; \ + RES = (systype*)RESv; \ + EXP = (systype*)EXPv; \ \ - int i, count = 
SIZE / sizeof(systype); \ + count = SIZE / sizeof(systype); \ for (i = 0; i < count; ++i) { \ RES[i] = comm_rank + 1; \ EXP[i] = RES[i]; \ @@ -419,7 +433,6 @@ TEST_REDUCE_SCATTER_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, GA_SUM, 0, 0, \ err = gpudata_read(RES, RESdev, 0, SIZE); \ ck_assert_int_eq(err, GA_NO_ERROR); \ - systype res; \ MAX_ABS_DIFF(RES, EXP, count, res); \ if (!(res <= epsilon)) { \ print(RES, count); \ @@ -463,14 +476,17 @@ TEST_BROADCAST_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, 0, #define TEST_ALL_GATHER(systype, gatype, mpitype, epsilon, print) \ START_TEST(test_gpucomm_all_gather_##gatype) { \ + systype* A, * RES, * EXP; \ + systype res; \ + int i, count, sendcount; \ INIT_ARRAYS(SIZE / comm_ndev, SIZE) \ \ - systype* A = (systype*)Av; \ - systype* RES = (systype*)RESv; \ - systype* EXP = (systype*)EXPv; \ + A = (systype*)Av; \ + RES = (systype*)RESv; \ + EXP = (systype*)EXPv; \ \ - int i, count = SIZE / sizeof(systype); \ - int sendcount = count / comm_ndev; \ + count = SIZE / sizeof(systype); \ + sendcount = count / comm_ndev; \ for (i = 0; i < sendcount; ++i) \ A[i] = comm_rank + 1; \ err = gpudata_write(Adev, 0, A, SIZE / comm_ndev); \ @@ -489,7 +505,6 @@ TEST_BROADCAST_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, 0, \ err = gpudata_read(RES, RESdev, 0, SIZE); \ ck_assert_int_eq(err, GA_NO_ERROR); \ - systype res; \ MAX_ABS_DIFF(RES, EXP, count, res); \ if (!(res <= epsilon)) { \ print(RES, count); \ @@ -536,14 +551,27 @@ TEST_ALL_GATHER_FAIL(elemcount, (size_t)INT_MAX + 1, GA_INT, 0, 0, GA_XLARGE_ERROR) Suite* get_suite(void) { - Suite* s = suite_create("buffer_collectives_API"); - - TCase* helps = tcase_create("test_helpers"); + Suite* s; + TCase* helps; + TCase* reds; + TCase* redf; + TCase* areds; + TCase* aredf; + TCase* redscs; + TCase* redscf; + TCase* bcasts; + TCase* bcastf; + TCase* agats; + TCase* agatf; + + s = suite_create("buffer_collectives_API"); + + helps = tcase_create("test_helpers"); 
tcase_add_unchecked_fixture(helps, setup_comm, teardown_comm); tcase_add_test(helps, test_gpucomm_get_count); tcase_add_test(helps, test_gpucomm_get_rank); - TCase* reds = tcase_create("test_reduce"); + reds = tcase_create("test_reduce"); tcase_add_unchecked_fixture(reds, setup_comm, teardown_comm); tcase_add_test(reds, test_gpucomm_reduce_INT_SUM); tcase_add_test(reds, test_gpucomm_reduce_INT_PROD); @@ -570,14 +598,14 @@ Suite* get_suite(void) { tcase_add_test(reds, test_gpucomm_reduce_ULONG_MAX); tcase_add_test(reds, test_gpucomm_reduce_ULONG_MIN); - TCase* redf = tcase_create("test_reduce_fail"); + redf = tcase_create("test_reduce_fail"); tcase_add_unchecked_fixture(redf, setup_comm, teardown_comm); tcase_add_test(redf, test_gpucomm_reduce_fail_datatype); tcase_add_test(redf, test_gpucomm_reduce_fail_optype); tcase_add_test(redf, test_gpucomm_reduce_fail_src_offset); tcase_add_test(redf, test_gpucomm_reduce_fail_elemcount); - TCase* areds = tcase_create("test_all_reduce"); + areds = tcase_create("test_all_reduce"); tcase_add_unchecked_fixture(areds, setup_comm, teardown_comm); tcase_add_test(areds, test_gpucomm_all_reduce_INT_SUM); tcase_add_test(areds, test_gpucomm_all_reduce_INT_PROD); @@ -604,7 +632,7 @@ Suite* get_suite(void) { tcase_add_test(areds, test_gpucomm_all_reduce_ULONG_MAX); tcase_add_test(areds, test_gpucomm_all_reduce_ULONG_MIN); - TCase* aredf = tcase_create("test_all_reduce_fail"); + aredf = tcase_create("test_all_reduce_fail"); tcase_add_unchecked_fixture(aredf, setup_comm, teardown_comm); tcase_add_test(aredf, test_gpucomm_all_reduce_fail_datatype); tcase_add_test(aredf, test_gpucomm_all_reduce_fail_optype); @@ -612,7 +640,7 @@ Suite* get_suite(void) { tcase_add_test(aredf, test_gpucomm_all_reduce_fail_dest_offset); tcase_add_test(aredf, test_gpucomm_all_reduce_fail_elemcount); - TCase* redscs = tcase_create("test_reduce_scatter"); + redscs = tcase_create("test_reduce_scatter"); tcase_add_unchecked_fixture(redscs, setup_comm, teardown_comm); 
tcase_add_test(redscs, test_gpucomm_reduce_scatter_INT_SUM); tcase_add_test(redscs, test_gpucomm_reduce_scatter_INT_PROD); @@ -639,7 +667,7 @@ Suite* get_suite(void) { tcase_add_test(redscs, test_gpucomm_reduce_scatter_ULONG_MAX); tcase_add_test(redscs, test_gpucomm_reduce_scatter_ULONG_MIN); - TCase* redscf = tcase_create("test_reduce_scatter_fail"); + redscf = tcase_create("test_reduce_scatter_fail"); tcase_add_unchecked_fixture(redscf, setup_comm, teardown_comm); tcase_add_test(redscf, test_gpucomm_reduce_scatter_fail_datatype); tcase_add_test(redscf, test_gpucomm_reduce_scatter_fail_optype); @@ -647,7 +675,7 @@ Suite* get_suite(void) { tcase_add_test(redscf, test_gpucomm_reduce_scatter_fail_dest_offset); tcase_add_test(redscf, test_gpucomm_reduce_scatter_fail_elemcount); - TCase* bcasts = tcase_create("test_broadcast"); + bcasts = tcase_create("test_broadcast"); tcase_add_unchecked_fixture(bcasts, setup_comm, teardown_comm); tcase_add_test(bcasts, test_gpucomm_broadcast_INT); tcase_add_test(bcasts, test_gpucomm_broadcast_BYTE); @@ -656,13 +684,13 @@ Suite* get_suite(void) { tcase_add_test(bcasts, test_gpucomm_broadcast_LONG); tcase_add_test(bcasts, test_gpucomm_broadcast_ULONG); - TCase* bcastf = tcase_create("test_broadcast_fail"); + bcastf = tcase_create("test_broadcast_fail"); tcase_add_unchecked_fixture(bcastf, setup_comm, teardown_comm); tcase_add_test(bcastf, test_gpucomm_broadcast_fail_datatype); tcase_add_test(bcastf, test_gpucomm_broadcast_fail_src_offset); tcase_add_test(bcastf, test_gpucomm_broadcast_fail_elemcount); - TCase* agats = tcase_create("test_all_gather"); + agats = tcase_create("test_all_gather"); tcase_add_unchecked_fixture(agats, setup_comm, teardown_comm); tcase_add_test(agats, test_gpucomm_all_gather_INT); tcase_add_test(agats, test_gpucomm_all_gather_BYTE); @@ -671,7 +699,7 @@ Suite* get_suite(void) { tcase_add_test(agats, test_gpucomm_all_gather_LONG); tcase_add_test(agats, test_gpucomm_all_gather_ULONG); - TCase* agatf = 
tcase_create("test_all_gather_fail"); + agatf = tcase_create("test_all_gather_fail"); tcase_add_unchecked_fixture(agatf, setup_comm, teardown_comm); tcase_add_test(agatf, test_gpucomm_all_gather_fail_datatype); tcase_add_test(agatf, test_gpucomm_all_gather_fail_src_offset); diff --git a/tests/check_collectives.c b/tests/check_collectives.c index 492f920d0f..4868f48fc9 100644 --- a/tests/check_collectives.c +++ b/tests/check_collectives.c @@ -29,8 +29,8 @@ extern void teardown_comm(void); #define _STR(x) #x #define COUNT_ERRORS(A, B, M, N, res) \ do { \ - res = 0; \ int loci, locj; \ + res = 0; \ for (loci = 0; loci < (M); ++loci) { \ for (locj = 0; locj < (N); ++locj) { \ if ((A)[loci][locj] != (B)[loci][locj]) \ @@ -45,37 +45,38 @@ extern void teardown_comm(void); #define INIT_ARRAYS(inrows, incols, outrows, outcols) \ int(*A)[(incols)]; \ + int(*RES)[(outcols)]; \ + int(*EXP)[(outcols)]; \ + size_t indims[ND]; \ + size_t outdims[ND]; \ + const ssize_t instrds[ND] = {sizeof(*A), sizeof(int)}; \ + const ssize_t outstrds[ND] = {sizeof(*RES), sizeof(int)}; \ + int err; \ + size_t i, j, outsize; \ + GpuArray Adev; \ + GpuArray RESdev; \ + \ A = (int(*)[(incols)])calloc((inrows), sizeof(*A)); \ if (A == NULL) \ ck_abort_msg("system memory allocation failed"); \ - int(*RES)[(outcols)]; \ RES = (int(*)[(outcols)])calloc((outrows), sizeof(*RES)); \ if (RES == NULL) \ ck_abort_msg("system memory allocation failed"); \ - int(*EXP)[(outcols)]; \ EXP = (int(*)[(outcols)])calloc((outrows), sizeof(*EXP)); \ if (EXP == NULL) \ ck_abort_msg("system memory allocation failed"); \ - size_t indims[ND]; \ indims[0] = (inrows); \ indims[1] = (incols); \ - size_t outdims[ND]; \ outdims[0] = (outrows); \ outdims[1] = (outcols); \ - const ssize_t instrds[ND] = {sizeof(*A), sizeof(int)}; \ - const ssize_t outstrds[ND] = {sizeof(*RES), sizeof(int)}; \ - size_t outsize = outdims[0] * outstrds[0]; \ + outsize = outdims[0] * outstrds[0]; \ \ - size_t i, j; \ for (i = 0; i < indims[0]; ++i) \ 
for (j = 0; j < indims[1]; ++j) \ A[i][j] = comm_rank + 2; \ \ - int err; \ - GpuArray Adev; \ err = GpuArray_copy_from_host(&Adev, ctx, A, GA_INT, ND, indims, instrds); \ ck_assert_int_eq(err, GA_NO_ERROR); \ - GpuArray RESdev; \ err = GpuArray_empty(&RESdev, ctx, GA_INT, ND, outdims, GA_C_ORDER); \ ck_assert_int_eq(err, GA_NO_ERROR); @@ -91,6 +92,7 @@ extern void teardown_comm(void); * aligned`. */ START_TEST(test_GpuArray_reduce) { + int res; INIT_ARRAYS(ROWS, COLS, ROWS, COLS); if (comm_rank == ROOT_RANK) { @@ -111,7 +113,6 @@ START_TEST(test_GpuArray_reduce) { if (comm_rank == ROOT_RANK) { err = GpuArray_read(RES, outsize, &RESdev); ck_assert_int_eq(err, GA_NO_ERROR); - int res; COUNT_ERRORS(RES, EXP, ROWS, COLS, res); ck_assert_msg(res == 0, "GpuArray_reduce with %s op produced errors in %d places", @@ -128,6 +129,7 @@ END_TEST * aligned`. */ START_TEST(test_GpuArray_all_reduce) { + int res; INIT_ARRAYS(ROWS, COLS, ROWS, COLS); err = GpuArray_all_reduce(&Adev, &RESdev, GA_SUM, comm); @@ -140,7 +142,6 @@ START_TEST(test_GpuArray_all_reduce) { err = GpuArray_read(RES, outsize, &RESdev); ck_assert_int_eq(err, GA_NO_ERROR); - int res; COUNT_ERRORS(RES, EXP, ROWS, COLS, res); ck_assert_msg(res == 0, "GpuArray_all_reduce with %s op produced errors in %d places", @@ -155,6 +156,8 @@ END_TEST * aligned`. */ START_TEST(test_GpuArray_reduce_scatter) { + int res; + int* recvcounts; // In order for C contiguous arrays to be combined/split successfully they // should // split along the smallest axis (the one with the bigger stride). 
@@ -165,7 +168,7 @@ START_TEST(test_GpuArray_reduce_scatter) { GpuArray_sync(&RESdev); GpuArray_sync(&Adev); - int* recvcounts = (int*)malloc(comm_ndev * sizeof(int)); + recvcounts = (int*)malloc(comm_ndev * sizeof(int)); if (recvcounts == NULL) ck_abort_msg("system memory allocation failed"); for (i = 0; i < (size_t)comm_ndev; ++i) @@ -177,7 +180,6 @@ START_TEST(test_GpuArray_reduce_scatter) { err = GpuArray_read(RES, outsize, &RESdev); ck_assert_int_eq(err, GA_NO_ERROR); - int res; COUNT_ERRORS(RES, EXP, ROWS / comm_ndev, COLS, res); ck_assert_msg( res == 0, @@ -192,6 +194,7 @@ END_TEST * \note Untested for `not aligned`. */ START_TEST(test_GpuArray_broadcast) { + int res; INIT_ARRAYS(ROWS, COLS, ROWS, COLS); for (i = 0; i < indims[0]; ++i) @@ -207,7 +210,6 @@ START_TEST(test_GpuArray_broadcast) { err = GpuArray_read(RES, outsize, &Adev); ck_assert_int_eq(err, GA_NO_ERROR); - int res; COUNT_ERRORS(RES, EXP, ROWS, COLS, res); ck_assert_msg(res == 0, "GpuArray_broadcast produced errors in %d places", res); @@ -221,6 +223,7 @@ END_TEST * aligned`. */ START_TEST(test_GpuArray_all_gather) { + int res; // In order for C contiguous arrays to be combined/split successfully they // should // split along the smallest axis (the one with the bigger stride). 
@@ -237,7 +240,6 @@ START_TEST(test_GpuArray_all_gather) { err = GpuArray_read(RES, outsize, &RESdev); ck_assert_int_eq(err, GA_NO_ERROR); - int res; COUNT_ERRORS(RES, EXP, ROWS, COLS, res); ck_assert_msg(res == 0, "GpuArray_all_gather produced errors in %d places", res); From 30942da780ed60d9024aac00a4042839af601820 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sun, 23 Jul 2017 01:44:20 -0400 Subject: [PATCH 409/597] Muzzle -Wdeclaration-after-statement in tests/check_elemwise.c --- tests/check_elemwise.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/tests/check_elemwise.c b/tests/check_elemwise.c index af19b57c15..2c8f75092d 100644 --- a/tests/check_elemwise.c +++ b/tests/check_elemwise.c @@ -80,19 +80,18 @@ START_TEST(test_contig_f16) { GpuElemwise *ge; static uint16_t data1[3]; + static uint16_t data2[3]; + uint16_t data3[3] = {0}; + size_t dims[1]; + gpuelemwise_arg args[3] = {{0}}; + void *rargs[3]; + data1[0] = F16[1]; data1[1] = F16[2]; data1[2] = F16[3]; - static uint16_t data2[3]; data2[0] = F16[4]; data2[1] = F16[5]; data2[2] = F16[6]; - uint16_t data3[3] = {0}; - - size_t dims[1]; - - gpuelemwise_arg args[3] = {{0}}; - void *rargs[3]; dims[0] = 3; @@ -243,19 +242,19 @@ START_TEST(test_basic_f16) { GpuElemwise *ge; static uint16_t data1[3]; + static uint16_t data2[3]; + uint16_t data3[3] = {0}; + size_t dims[2]; + gpuelemwise_arg args[3] = {{0}}; + void *rargs[3]; + data1[0] = F16[1]; data1[1] = F16[2]; data1[2] = F16[3]; - static uint16_t data2[3]; data2[0] = F16[4]; data2[1] = F16[5]; data2[2] = F16[6]; - uint16_t data3[3] = {0}; - - size_t dims[2]; - gpuelemwise_arg args[3] = {{0}}; - void *rargs[3]; dims[0] = 1; dims[1] = 3; From 6af86285015b0e0d78f6c2245c056ae4aa141762 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sun, 23 Jul 2017 01:48:17 -0400 Subject: [PATCH 410/597] Muzzle -Wdeclaration-after-statement in tests/check_reduction.c --- tests/check_reduction.c | 49 
++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 5138e5c02d..8844a585c8 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -68,13 +68,14 @@ static double pcgRand01(void){ */ START_TEST(test_reduction){ - pcgSeed(1); - /** * We test here a reduction of some random 3D tensor on the first and * third dimensions. */ + GpuArray gaSrc; + GpuArray gaMax; + GpuArray gaArgmax; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; @@ -93,6 +94,7 @@ START_TEST(test_reduction){ * Initialize source data. */ + pcgSeed(1); for(i=0;i Date: Sun, 23 Jul 2017 02:11:37 -0400 Subject: [PATCH 411/597] Upgrade warning flags on non-MSVC compilers. We go from -Wall to -Wall -Wextra -Wno-unused-parameter -Werror=format-security - Wdeclaration-after-statement -std=gnu89. --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6de35048f7..d721e46f85 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/CMakeModules/") if(MSVC) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W3") else() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -Wno-unused-parameter -Werror=format-security -Wdeclaration-after-statement -std=gnu89") endif() enable_testing() From f77f3a22f161526dc14aeb08d9f7ad5cd11a5fec Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 24 Jul 2017 15:15:01 -0400 Subject: [PATCH 412/597] Changes for release 0.6.9 --- .appveyor.yml | 2 +- doc/conf.py | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 26d5064e23..f419bc5480 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,4 +1,4 @@ -version: '0.6.8.{build}' +version: '0.6.9.{build}' pull_requests: 
do_not_increment_build_number: true diff --git a/doc/conf.py b/doc/conf.py index 58dba54436..e8ee873f82 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -59,7 +59,7 @@ # The short X.Y version. version = '0.6' # The full version, including alpha/beta/rc tags. -release = '0.6.8' +release = '0.6.9' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/setup.py b/setup.py index f890ed7189..c8f4b5e0b0 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ MAJOR = 0 MINOR = 6 -PATCH = 8 +PATCH = 9 SUFFIX = '' # include the '.' FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX) From 49b858f3a315f65fbd311f4dbed394e9d1ccc49d Mon Sep 17 00:00:00 2001 From: Simon Lefrancois Date: Tue, 25 Jul 2017 09:07:14 -0400 Subject: [PATCH 413/597] add windows pr --- .jenkins_pr_win.bat | 61 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 .jenkins_pr_win.bat diff --git a/.jenkins_pr_win.bat b/.jenkins_pr_win.bat new file mode 100644 index 0000000000..cbfd9bf130 --- /dev/null +++ b/.jenkins_pr_win.bat @@ -0,0 +1,61 @@ +REM Set path for conda python and cmake +set PATH=%PATH%;C:\ProgramData\Miniconda2;C:\Program Files\CMake\bin + +# Can also set to "Debug", "Release" to go faster +set GPUARRAY_CONFIG="Release" +# Set these to " " to disable (empty doesn't work) +set DEVICES_CUDA="cuda" # for multiple devices use "cuda0 cuda1" +set DEVICES_OPENCL="" + +git rev-parse HEAD + +# Build libgpuarray and run C tests +mkdir build +cd build +cmake .. -DCMAKE_BUILD_TYPE=%GPUARRAY_CONFIG% -G "NMake Makefiles" +nmake +cd .. 
+ +REM Export paths +set LIBDIR=%WORKSPACE%\local +set PATH=%PATH%;%LIBDIR%\lib;C:\lib\cuda\bin + +REM Clean up previous installs (to make sure no old files are left) +rmdir %LIBDIR% /s/q +mkdir %LIBDIR% + +# Test on different devices +(for %%dev in (%DEVICES_CUDA%) do ( + echo "Testing libgpuarray for DEVICE=%%dev" + cd build + set DEVICE=%%dev + make test + cd .. +)) + +(for %%dev in (%DEVICES_OPENCL%) do ( + echo "Testing libgpuarray for DEVICE=%%dev" + cd build + set DEVICE=%%dev + make test + cd .. +)) + +REM Set conda python path +set PATH=%PATH%;C:\ProgramData\Miniconda2;C:\ProgramData\Miniconda2\Library\mingw-w64\bin;C:\ProgramData\Miniconda2\Library\usr\bin;C:\ProgramData\Miniconda2\Library\bin;C:\ProgramData\Miniconda2\Scripts + +REM Build the pygpu modules +python setup.py build_ext --inplace + +# Test it +set test=pygpu +(for %%dev in (%DEVICES_CUDA%) do ( + echo "Testing pygpu for DEVICE=%%dev" + set DEVICE=%%dev + nosetests --with-xunit --xunit-file=%test%_%DEVICE%_tests.xml pygpu\tests +)) +(for %%dev in (%DEVICES_OPENCL%) do ( + echo "Testing pygpu for DEVICE=%%dev" + set DEVICE=%%dev + nosetests --with-xunit --xunit-file=%test%_%DEVICE%_tests.xml pygpu\tests +)) From 668f05def16446b2139666b0c8154a4c530691fc Mon Sep 17 00:00:00 2001 From: Pascal Lamblin Date: Tue, 25 Jul 2017 19:33:32 -0400 Subject: [PATCH 414/597] Add flag to implicitly left-pad with broadcastable dimensions --- src/gpuarray/elemwise.h | 5 +++++ src/gpuarray_elemwise.c | 36 +++++++++++++++++++++++++++--------- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/src/gpuarray/elemwise.h b/src/gpuarray/elemwise.h index bef99c2589..173ec0422c 100644 --- a/src/gpuarray/elemwise.h +++ b/src/gpuarray/elemwise.h @@ -156,6 +156,11 @@ GPUARRAY_PUBLIC int GpuElemwise_call(GpuElemwise *ge, void **args, int flags); */ #define GE_NOCOLLAPSE 0x0200 +/** + * Allow implicit left-padding of shape with dimensions of size 1. 
+ */ +#define GE_PADSHAPE 0x0400 + /** * @} */ diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index 8921141cff..5411949b20 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -276,14 +276,21 @@ static int check_basic(GpuElemwise *ge, void **args, int flags, GpuArray *a = NULL, *v; unsigned int i, j, p, num_arrays = 0, nd = 0, nnd; int call32 = 1; + unsigned int nd_i = 0; + size_t v_dim_j = 0; /* Go through the list and grab some info */ for (i = 0; i < ge->n; i++) { if (is_array(ge->args[i])) { + nd_i = ((GpuArray *)args[i])->nd; if (num_arrays == 0) - nd = ((GpuArray *)args[i])->nd; - else if (((GpuArray *)args[i])->nd != nd) - return error_fmt(ctx->err, GA_VALUE_ERROR, "Arg %u has differing nd = %u", i, ((GpuArray *)args[i])->nd); + nd = nd_i; + else if (nd_i != nd) { + if (flags & GE_PADSHAPE) + nd = nd_i > nd ? nd_i : nd; + else + return error_fmt(ctx->err, GA_VALUE_ERROR, "Arg %u has differing nd = %u", i, nd_i); + } ++num_arrays; if (a == NULL && is_output(ge->args[i])) a = (GpuArray *)args[i]; @@ -301,7 +308,7 @@ static int check_basic(GpuElemwise *ge, void **args, int flags, return error_sys(ctx->err, "ge_grow"); } - /* Now we know that all array arguments have the same number of + /* Now we know that all array arguments have at most nd dimensions and that the expected output size is the size of a */ /* And copy their initial values in */ @@ -309,7 +316,11 @@ static int check_basic(GpuElemwise *ge, void **args, int flags, p = 0; for (i = 0; i < ge->n; i++) { if (is_array(ge->args[i])) { - memcpy(ge->strides[p], ((GpuArray *)args[i])->strides, nd*sizeof(ssize_t)); + /* Left-pad strides with zero on implicitly broadcasted dimensions */ + memset(ge->strides[p], 0, nd*sizeof(ssize_t)); + nd_i = ((GpuArray *)args[i])->nd; + memcpy((char *)(ge->strides[p]) + (nd - nd_i)*sizeof(ssize_t), + ((GpuArray *)args[i])->strides, nd_i*sizeof(ssize_t)); p++; } } @@ -326,16 +337,23 @@ static int check_basic(GpuElemwise *ge, void **args, int 
flags, for (i = 0; i < ge->n; i++) { if (is_array(ge->args[i])) { v = (GpuArray *)args[i]; - if (ge->dims[j] != v->dimensions[j]) { + nd_i = v->nd; + /* Pad shape with 1 if needed for implicitly broadcasted dimensions + and shift if needed */ + if (j < nd - nd_i) + v_dim_j = 1; + else + v_dim_j = v->dimensions[j - (nd - nd_i)]; + if (ge->dims[j] != v_dim_j) { /* We can't broadcast outputs */ if (ISCLR(flags, GE_BROADCAST) || is_output(ge->args[i]) || - v->dimensions[j] != 1) { - return error_fmt(ctx->err, GA_VALUE_ERROR, "Mismatched dimension %u for input %u (expected %" SPREFIX "u got %" SPREFIX "u)", j, i, ge->dims[j], v->dimensions[j]); + v_dim_j != 1) { + return error_fmt(ctx->err, GA_VALUE_ERROR, "Mismatched dimension %u for input %u (expected %" SPREFIX "u got %" SPREFIX "u)", j, i, ge->dims[j], v_dim_j); } } /* If the dimension is 1 set the strides to 0 regardless since it won't change anything in the non-broadcast case. */ - if (v->dimensions[j] == 1) { + if (v_dim_j == 1) { ge->strides[p][j] = 0; } call32 &= v->offset < ADDR32_MAX; From 21f7e07a90bfd5415e9b1a17a17b08dfb9e719c8 Mon Sep 17 00:00:00 2001 From: Jenkins Date: Wed, 26 Jul 2017 16:11:21 -0400 Subject: [PATCH 415/597] mac pr remove time nosetests --- .jenkins_pr_mac.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.jenkins_pr_mac.sh b/.jenkins_pr_mac.sh index 0c91673696..b927f567d5 100755 --- a/.jenkins_pr_mac.sh +++ b/.jenkins_pr_mac.sh @@ -22,7 +22,7 @@ export CPLUS_INCLUDE_PATH=/usr/local/cuda/include:${CPLUS_INCLUDE_PATH} git rev-parse HEAD # Build libgpuarray and run C tests -rm -rf build +rm -rf build lib mkdir build (cd build && cmake .. 
-DCMAKE_BUILD_TYPE=${GPUARRAY_CONFIG} && make) @@ -41,15 +41,15 @@ export DYLD_LIBRARY_PATH=`pwd`/lib:${DYLD_LIBRARY_PATH} export CPLUS_INCLUDE_PATH=`pwd`/src:${CPLUS_INCLUDE_PATH} # Build the pygpu modules -python setup.py build_ext --inplace -L`pwd`/lib -I`pwd`/src +python setup.py build_ext --inplace -I`pwd`/src -L`pwd`/lib # Test it test=pygpu_pr_mac for dev in ${DEVICES_CUDA}; do echo "Testing pygpu for DEVICE=${dev}" - DEVICE=${dev} time nosetests --with-xunit --xunit-file=${test}${dev}tests.xml pygpu/tests + DEVICE=${dev} nosetests --with-xunit --xunit-file=${test}_${dev}tests.xml pygpu/tests done for dev in ${DEVICES_OPENCL}; do echo "Testing pygpu for DEVICE=${dev}" - DEVICE=${dev} time nosetests --with-xunit --xunit-file=${test}${dev}tests.xml pygpu/tests -e test_blas.py + DEVICE=${dev} nosetests --with-xunit --xunit-file=${test}_${dev}tests.xml pygpu/tests -e test_blas.py done From 1d37464ff57f296f60739898ec514f70de94cb7c Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 27 Jul 2017 15:05:03 -0400 Subject: [PATCH 416/597] Don't free memory allocated by the library. 
--- pygpu/gpuarray.pxd | 2 ++ pygpu/gpuarray.pyx | 17 ++++------------- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd index 2db9e4c270..f65e31b1f2 100644 --- a/pygpu/gpuarray.pxd +++ b/pygpu/gpuarray.pxd @@ -177,6 +177,8 @@ cdef extern from "gpuarray/array.h": int GpuArray_setarray(_GpuArray *v, _GpuArray *a) int GpuArray_reshape(_GpuArray *res, _GpuArray *a, unsigned int nd, const size_t *newdims, ga_order ord, int nocopy) + int GpuArray_reshape_inplace(_GpuArray *a, unsigned int nd, + const size_t *newdims, ga_order ord) int GpuArray_transpose(_GpuArray *res, _GpuArray *a, const unsigned int *new_axes) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 8c9267fe0e..9bfaf3ec3e 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -2137,7 +2137,7 @@ cdef class GpuArray: cdef size_t *newdims cdef unsigned int nd cdef unsigned int i - cdef GpuArray res + cdef int err nd = len(newshape) newdims = calloc(nd, sizeof(size_t)) if newdims == NULL: @@ -2145,20 +2145,11 @@ cdef class GpuArray: try: for i in range(nd): newdims[i] = newshape[i] - res = new_GpuArray(GpuArray, self.context, None) - array_reshape(res, self, nd, newdims, GA_C_ORDER, 1) + err = GpuArray_reshape_inplace(&self.ga, nd, newdims, GA_C_ORDER) + if err != GA_NO_ERROR: + raise get_exc(err), GpuArray_error(&self.ga, err) finally: free(newdims) - # This is safe becase the reshape above is a nocopy one - free(self.ga.dimensions) - free(self.ga.strides) - self.ga.dimensions = res.ga.dimensions - self.ga.strides = res.ga.strides - self.ga.nd = res.ga.nd - res.ga.dimensions = NULL - res.ga.strides = NULL - res.ga.nd = 0 - array_clear(res) property T: def __get__(self): From 7c9999ddb6590a6478e039c4dbd72af7caceea16 Mon Sep 17 00:00:00 2001 From: Simon Lefrancois Date: Thu, 27 Jul 2017 16:09:29 -0400 Subject: [PATCH 417/597] correct win pr --- .jenkins_pr_win.bat | 70 +++++++++++++++++---------------------------- 1 file changed, 26 
insertions(+), 44 deletions(-) diff --git a/.jenkins_pr_win.bat b/.jenkins_pr_win.bat index cbfd9bf130..0907a28ad2 100644 --- a/.jenkins_pr_win.bat +++ b/.jenkins_pr_win.bat @@ -1,61 +1,43 @@ -REM Set path for conda python and cmake -set PATH=%PATH%;C:\ProgramData\Miniconda2;C:\Program Files\CMake\bin +REM Set path for cuda, conda python and cmake +set PATH=%PATH%;C:\ProgramData\Miniconda2;C:\Program Files\CMake\bin;C:\lib\cuda\bin -# Can also set to "Debug", "Release" to go faster +REM Can also set to "Debug", "Release" to go faster set GPUARRAY_CONFIG="Release" -# Set these to " " to disable (empty doesn't work) -set DEVICES_CUDA="cuda" # for multiple devices use "cuda0 cuda1" -set DEVICES_OPENCL="" +REM Use spaces to seperate devices +set DEVICES_CUDA=cuda +set DEVICES_OPENCL= git rev-parse HEAD -# Build libgpuarray and run C tests +REM Clean up previous installs (to make sure no old files are left) +rmdir %WORKSPACE%\lib /s/q +mkdir %WORKSPACE%\lib +rmdir build /s/q mkdir build + +REM Build libgpuarray and run C tests cd build cmake .. -DCMAKE_BUILD_TYPE=%GPUARRAY_CONFIG% -G "NMake Makefiles" nmake cd .. -REM Export paths -set LIBDIR=%WORKSPACE%\local -set PATH=%PATH%;%LIBDIR%\lib;C:\lib\cuda\bin +set PATH=%PATH%;%WORKSPACE%\lib -REM Clean up previous installs (to make sure no old files are left) -rmdir %LIBDIR% /s/q -mkdir %LIBDIR% - -# Test on different devices -(for %%dev in (%DEVICES_CUDA%) do ( - echo "Testing libgpuarray for DEVICE=%%dev" - cd build - set DEVICE=%%dev - make test - cd .. -)) - -(for %%dev in (%DEVICES_OPENCL%) do ( - echo "Testing libgpuarray for DEVICE=%%dev" - cd build - set DEVICE=%%dev - make test - cd .. 
-)) - -REM Set conda python path -set PATH=%PATH%;C:\ProgramData\Miniconda2;C:\ProgramData\Miniconda2\Library\mingw-w64\bin;C:\ProgramData\Miniconda2\Library\usr\bin;C:\ProgramData\Miniconda2\Library\bin;C:\ProgramData\Miniconda2\Scripts +REM Add conda gcc toolchain path +set PATH=%PATH%;C:\ProgramData\Miniconda2\Library\mingw-w64\bin;C:\ProgramData\Miniconda2\Library\usr\bin;C:\ProgramData\Miniconda2\Library\bin;C:\ProgramData\Miniconda2\Scripts REM Build the pygpu modules python setup.py build_ext --inplace -# Test it +REM Test pygpu set test=pygpu -(for %%dev in (%DEVICES_CUDA%) do ( - echo "Testing pygpu for DEVICE=%%dev" - set DEVICE=%%dev - nosetests --with-xunit --xunit-file=%test%_%DEVICE%_tests.xml pygpu\tests -)) -(for %%dev in (%DEVICES_OPENCL%) do ( - echo "Testing pygpu for DEVICE=%%dev" - set DEVICE=%%dev - nosetests --with-xunit --xunit-file=%test%_%DEVICE%_tests.xml pygpu\tests -)) +for %%d in (%DEVICES_CUDA%) do ( + echo "Testing pygpu for DEVICE=%%d" + set DEVICE=%%d + nosetests --with-xunit --xunit-file=%test%_%DEVICE%_tests.xml pygpu\tests +) +for %%d in (%DEVICES_OPENCL%) do ( + echo "Testing pygpu for DEVICE=%%d" + set DEVICE=%%d + nosetests --with-xunit --xunit-file=%test%_%DEVICE%_tests.xml pygpu\tests -e test_blas.py +) From e5b21870be0387b5826bb33fb0a6e115a208cadf Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 31 Jul 2017 16:11:54 -0400 Subject: [PATCH 418/597] Add minimal test for padshape --- tests/check_elemwise.c | 69 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/tests/check_elemwise.c b/tests/check_elemwise.c index 2c8f75092d..9656bf870a 100644 --- a/tests/check_elemwise.c +++ b/tests/check_elemwise.c @@ -321,6 +321,7 @@ START_TEST(test_basic_offset) { /* Simulate indexing */ a.offset = 12; a.dimensions[1] = 3; + GpuArray_fix_flags(&a); ga_assert_ok(GpuArray_write(&a, data1, sizeof(data1))); @@ -563,6 +564,73 @@ START_TEST(test_basic_broadcast) { } END_TEST 
+START_TEST(test_basic_padshape) { + GpuArray a; + GpuArray b; + GpuArray c; + + GpuElemwise *ge; + + static const uint32_t data1[3] = {1, 2, 3}; + static const uint32_t data2[2] = {4, 5}; + uint32_t data3[6] = {0}; + + size_t dims[2]; + + gpuelemwise_arg args[3] = {{0}}; + void *rargs[3]; + + dims[0] = 3; + + ga_assert_ok(GpuArray_empty(&a, ctx, GA_UINT, 1, dims, GA_C_ORDER)); + ga_assert_ok(GpuArray_write(&a, data1, sizeof(data1))); + + dims[0] = 2; + dims[1] = 1; + + ga_assert_ok(GpuArray_empty(&b, ctx, GA_UINT, 2, dims, GA_F_ORDER)); + ga_assert_ok(GpuArray_write(&b, data2, sizeof(data2))); + + dims[0] = 2; + dims[1] = 3; + + ga_assert_ok(GpuArray_empty(&c, ctx, GA_UINT, 2, dims, GA_C_ORDER)); + + args[0].name = "a"; + args[0].typecode = GA_UINT; + args[0].flags = GE_READ; + + args[1].name = "b"; + args[1].typecode = GA_UINT; + args[1].flags = GE_READ; + + args[2].name = "c"; + args[2].typecode = GA_UINT; + args[2].flags = GE_WRITE; + + ge = GpuElemwise_new(ctx, "", "c = a + b", 3, args, 2, 0); + + ck_assert_ptr_ne(ge, NULL); + + rargs[0] = &a; + rargs[1] = &b; + rargs[2] = &c; + + ck_assert_int_eq(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE), GA_VALUE_ERROR); + + ga_assert_ok(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE | GE_BROADCAST | GE_PADSHAPE)); + + ga_assert_ok(GpuArray_read(data3, sizeof(data3), &c)); + + ck_assert_int_eq(data3[0], 5); + ck_assert_int_eq(data3[1], 6); + ck_assert_int_eq(data3[2], 7); + ck_assert_int_eq(data3[3], 6); + ck_assert_int_eq(data3[4], 7); + ck_assert_int_eq(data3[5], 8); +} +END_TEST + START_TEST(test_basic_collapse) { GpuArray a; GpuArray b; @@ -755,6 +823,7 @@ Suite *get_suite(void) { tcase_add_test(tc, test_basic_offset); tcase_add_test(tc, test_basic_remove1); tcase_add_test(tc, test_basic_broadcast); + tcase_add_test(tc, test_basic_padshape); tcase_add_test(tc, test_basic_collapse); tcase_add_test(tc, test_basic_neg_strides); tcase_add_test(tc, test_basic_0); From 2c6374e9aac3945901005ac03ac3a4b199acef93 Mon Sep 17 00:00:00 
2001 From: Arnaud Bergeron Date: Mon, 31 Jul 2017 16:17:43 -0400 Subject: [PATCH 419/597] Add support for the padshape keyword in python. --- pygpu/_elemwise.pyx | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pygpu/_elemwise.pyx b/pygpu/_elemwise.pyx index 1875503c0c..713e241e8a 100644 --- a/pygpu/_elemwise.pyx +++ b/pygpu/_elemwise.pyx @@ -40,6 +40,7 @@ cdef extern from "gpuarray/elemwise.h": cdef int GE_BROADCAST cdef int GE_NOCOLLAPSE + cdef int GE_PADSHAPE cdef class arg: @@ -193,9 +194,19 @@ cdef class GpuElemwise: def __call__(self, *args, **kwargs): cdef unsigned int i cdef int err + cdef int flags + + flags = 0 + if kwargs.pop('broadcast', True): + flags |= GE_BROADCAST + if kwargs.pop('padshape', True): + flags |= GE_PADSHAPE + + if len(kwargs) != 0: + raise TypeError("Unknown keyword argument: %s" % list(kwargs.keys())[0]) for i, arg in enumerate(args): self._setarg(i, arg) - err = GpuElemwise_call(self.ge, self.callbuf, GE_BROADCAST if kwargs.get('broadcast', True) else 0) + err = GpuElemwise_call(self.ge, self.callbuf, flags) if err != GA_NO_ERROR: raise get_exc(err)("Could not call GpuElemwise") From 9c5811bdd3168cadf373d205fb099c755e5f183e Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 1 Aug 2017 12:15:51 -0400 Subject: [PATCH 420/597] Stop comparing to None or Ellipsis with '==' (or equivalent) --- pygpu/gpuarray.pyx | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 8c9267fe0e..05a09ce818 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -40,6 +40,15 @@ cdef bytes _s(s): return s raise TypeError("Expected a string") +cdef size_t countis(l, object val): + cdef size_t count + cdef size_t i + count = 0 + for i in range(len(l)): + if l[i] is val: + count += 1 + return count + def cl_wrap_ctx(size_t ptr): """ cl_wrap_ctx(ptr) @@ -244,7 +253,7 @@ cdef int strides_ok(GpuArray a, strides): if a.ga.dimensions[i] == 0: 
return 1 - max_axis_offset = strides[i] * (a.ga.dimensions[i] - 1) + max_axis_offset = (strides[i]) * (a.ga.dimensions[i] - 1) if max_axis_offset > 0: if upper + max_axis_offset > size: return 0 @@ -1951,7 +1960,7 @@ cdef class GpuArray: key = tuple(key) # Need to massage Ellipsis here, to avoid packing it into a tuple. - if key.count(Ellipsis) > 1: + if countis(key, Ellipsis) > 1: raise IndexError, "cannot use more than one Ellipsis" # The following code replaces an Ellipsis found in the key by @@ -1966,7 +1975,7 @@ cdef class GpuArray: else: # Need number of axes minus missing dimensions extra slice(None) # objects, not counting None entries and the Ellipsis itself - num_slcs = self.ga.nd - (len(key) - key.count(None) - 1) + num_slcs = self.ga.nd - (len(key) - countis(key, None) - 1) fill_slices = (slice(None),) * num_slcs key = key[:ell_idx] + fill_slices + key[ell_idx + 1:] @@ -1983,7 +1992,7 @@ cdef class GpuArray: # Slice into array, then reshape, accommodating for None entries in key sliced = self.__cgetitem__(getitem_idcs) - if key.count(None) == 0: + if countis(key, None) == 0: # Avoid unnecessary reshaping if there was no None return sliced else: @@ -2085,7 +2094,7 @@ cdef class GpuArray: idx = tuple(idx) - if idx.count(Ellipsis) > 1: + if countis(idx, Ellipsis) > 1: raise IndexError, "cannot use more than one Ellipsis" # Remove None entries, they should be ignored (as in Numpy) @@ -2434,7 +2443,7 @@ cdef class GpuKernel: """ __call__(*args, n=None, gs=None, ls=None, shared=0) """ - if n == None and (ls == None or gs == None): + if n is None and (ls is None or gs is None): raise ValueError, "Must specify size (n) or both gs and ls" self.do_call(n, gs, ls, args, shared) From 8a66700c38dac47539688a57bb767386be228887 Mon Sep 17 00:00:00 2001 From: Simon Lefrancois Date: Wed, 2 Aug 2017 16:02:27 -0400 Subject: [PATCH 421/597] win use conda git --- .jenkins_pr_win.bat | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.jenkins_pr_win.bat 
b/.jenkins_pr_win.bat index 0907a28ad2..4a3a4477d0 100644 --- a/.jenkins_pr_win.bat +++ b/.jenkins_pr_win.bat @@ -1,5 +1,7 @@ REM Set path for cuda, conda python and cmake -set PATH=%PATH%;C:\ProgramData\Miniconda2;C:\Program Files\CMake\bin;C:\lib\cuda\bin +REM Set conda python, cudnn, cmake path +set PATH=%PATH%;C:\ProgramData\Miniconda2;C:\ProgramData\Miniconda2\Library\usr\bin;C:\ProgramData\Miniconda2\Library\bin;C:\ProgramData\Miniconda2\Scripts +set PATH=%PATH%;%CUDNNPATH%\bin;C:\Program Files\CMake\bin REM Can also set to "Debug", "Release" to go faster set GPUARRAY_CONFIG="Release" @@ -24,7 +26,7 @@ cd .. set PATH=%PATH%;%WORKSPACE%\lib REM Add conda gcc toolchain path -set PATH=%PATH%;C:\ProgramData\Miniconda2\Library\mingw-w64\bin;C:\ProgramData\Miniconda2\Library\usr\bin;C:\ProgramData\Miniconda2\Library\bin;C:\ProgramData\Miniconda2\Scripts +set PATH=%PATH%;C:\ProgramData\Miniconda2\Library\mingw-w64\bin REM Build the pygpu modules python setup.py build_ext --inplace From e1eb2c4226a18680acb612ba08ff4d6981b5ff67 Mon Sep 17 00:00:00 2001 From: Simon Lefrancois Date: Wed, 2 Aug 2017 16:02:40 -0400 Subject: [PATCH 422/597] update windows install doc --- doc/installation.rst | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/doc/installation.rst b/doc/installation.rst index 880decfa47..6b81b4671d 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -170,13 +170,14 @@ If you prefer a command-line approach, cmake is available as a console program with the same options as the Unix variant. You can select the nmake builder by passing ``-G "NMake Makefiles"`` to cmake. -Since there is no standard install location on Windows, there is no -install step. It is up to you to copy the headers and libraries to an -appropriate place. - -If you don't have Visual Studio installed, you can get the free -Express version from `here `_ in the -downloads section (select the "for windows" edition). 
+There is no standard install location on Windows, but you can specify a custom +location by passing ``-DCMAKE_INSTALL_PREFIX=%LIBDIR%`` to cmake. You can then +install using ``cmake --build . --target install`` after ``nmake``. + +If you don't have Visual Studio installed, you can get the free `Visual Studio +Community edition `_, +which has compilation tools for python 3.5 and up. For python 2.7, install +`Microsoft Visual C++ Compiler for Python 2.7 `_. .. warning:: While you may get the library to compile using cygwin, this is not From b4d2e194f1c9c3551eea37f3408886d47d4c2a47 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 2 Aug 2017 16:38:26 -0400 Subject: [PATCH 423/597] Remove GA_POINTER --- src/gen_types.py | 1 - src/gpuarray/types.h | 1 - src/gpuarray_buffer_opencl.c | 2 -- 3 files changed, 4 deletions(-) diff --git a/src/gen_types.py b/src/gen_types.py index 3e14c9a4f2..0e87fe23f3 100644 --- a/src/gen_types.py +++ b/src/gen_types.py @@ -165,7 +165,6 @@ def add_type(name, sz): * List of all built-in types. */ enum GPUARRAY_TYPES { - GA_POINTER = -2, GA_BUFFER = -1, % for i, v in sorted(TYPEMAP.items()): GA_${v[1].upper()} = ${i}, diff --git a/src/gpuarray/types.h b/src/gpuarray/types.h index afd0df16e4..2fac29bb37 100644 --- a/src/gpuarray/types.h +++ b/src/gpuarray/types.h @@ -43,7 +43,6 @@ typedef struct _gpuarray_type { * List of all built-in types. 
*/ enum GPUARRAY_TYPES { - GA_POINTER = -2, GA_BUFFER = -1, GA_BOOL = 0, GA_BYTE = 1, diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index edcba5eee7..5f8db3857e 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -990,8 +990,6 @@ static int cl_setkernelarg(gpukernel *k, unsigned int i, void *a) { cl_ulong temp; cl_long stemp; switch (k->types[i]) { - case GA_POINTER: - return error_set(ctx->err, GA_DEVSUP_ERROR, "Cannot set raw pointers as kernel arguments"); case GA_BUFFER: btmp = (gpudata *)a; CL_CHECK(ctx->err, clSetKernelArg(k->k, i, sizeof(cl_mem), &btmp->buf)); From 9f6cebd7895b8c06392af3d23aac9fbac73dcda4 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 2 Aug 2017 16:45:03 -0400 Subject: [PATCH 424/597] Remove GA_CTX_PROP_MAXLSIZE and GA_CTX_PROP_MAXGSIZE --- src/gpuarray/buffer.h | 15 ++------------- src/gpuarray_buffer_cuda.c | 8 -------- src/gpuarray_buffer_opencl.c | 25 ------------------------- src/gpuarray_kernel.c | 2 +- src/gpuarray_reduction.c | 2 +- 5 files changed, 4 insertions(+), 48 deletions(-) diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index be314b3e8e..b23cb2a9f3 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -550,13 +550,7 @@ GPUARRAY_PUBLIC gpucontext *gpukernel_context(gpukernel *k); */ #define GA_CTX_PROP_DEVNAME 1 -/** - * Get the maximum block size (also known as local size) for a kernel - * call in the context. - * - * Type: `size_t` - */ -#define GA_CTX_PROP_MAXLSIZE 2 +/* UNUSED: 2 */ /** * Get the local memory size available for a call in the context. @@ -576,12 +570,7 @@ GPUARRAY_PUBLIC gpucontext *gpukernel_context(gpukernel *k); */ #define GA_CTX_PROP_NUMPROCS 4 -/** - * Get the maximum group size for a kernel call in this context. - * - * Type: `size_t` - */ -#define GA_CTX_PROP_MAXGSIZE 5 +/* UNUSED: 5 */ /** * Get the vector of blas ops for the context. 
diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 071ec9700d..9f617e73d2 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1676,10 +1676,6 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, *((size_t *)res) = largest_size(ctx); return GA_NO_ERROR; - case GA_CTX_PROP_MAXLSIZE: - GETPROP(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, size_t); - return GA_NO_ERROR; - case GA_CTX_PROP_LMEMSIZE: GETPROP(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, size_t); return GA_NO_ERROR; @@ -1688,10 +1684,6 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, GETPROP(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, unsigned int); return GA_NO_ERROR; - case GA_CTX_PROP_MAXGSIZE: - GETPROP(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, size_t); - return GA_NO_ERROR; - case GA_CTX_PROP_BLAS_OPS: GA_CHECK(load_libcublas(major, minor, ctx->err)); *((gpuarray_blas_ops **)res) = &cublas_ops; diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 5f8db3857e..4a5c14e81e 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -1182,14 +1182,6 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, /* For the moment, PCI Bus ID is not supported for OpenCL. 
*/ return error_set(ctx->err, GA_DEVSUP_ERROR, "Can't get PCI bus ID on OpenCL"); - case GA_CTX_PROP_MAXLSIZE: - CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, - sizeof(id), &id, NULL)); - CL_GET_PROP(ctx->err, clGetDeviceInfo, id, CL_DEVICE_MAX_WORK_ITEM_SIZES, psz); - *((size_t *)res) = psz[0]; - free(psz); - return GA_NO_ERROR; - case GA_CTX_PROP_LMEMSIZE: CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id, NULL)); @@ -1206,23 +1198,6 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, *((unsigned int *)res) = ui; return GA_NO_ERROR; - case GA_CTX_PROP_MAXGSIZE: - CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, - sizeof(id), &id, NULL)); - CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_ADDRESS_BITS, sizeof(ui), - &ui, NULL)); - CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_GROUP_SIZE, - sizeof(sz), &sz, NULL)); - if (ui == 32) { - sz = 4294967295UL/sz; - } else if (ui == 64) { - sz = 18446744073709551615ULL/sz; - } else { - assert(0 && "This should not be reached!"); - } - *((size_t *)res) = sz; - return GA_NO_ERROR; - case GA_CTX_PROP_BLAS_OPS: { int e; diff --git a/src/gpuarray_kernel.c b/src/gpuarray_kernel.c index 88b1cbaadf..0779afea69 100644 --- a/src/gpuarray_kernel.c +++ b/src/gpuarray_kernel.c @@ -54,7 +54,7 @@ int GpuKernel_sched(GpuKernel *k, size_t n, size_t *gs, size_t *ls) { err = gpukernel_property(k->k, GA_CTX_PROP_NUMPROCS, &numprocs); if (err != GA_NO_ERROR) return err; - err = gpukernel_property(k->k, GA_CTX_PROP_MAXGSIZE, &max_g); + err = gpukernel_property(k->k, GA_CTX_PROP_MAXGSIZE0, &max_g); if (err != GA_NO_ERROR) return err; diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index b1a185e3b7..af8c78ff0e 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -697,8 +697,8 @@ static int maxandargmaxSchedule (maxandargmax_ctx* ctx){ gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE0, &maxL0); 
gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE1, &maxL1); gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE2, &maxL2); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE, &maxG); gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE0, &maxG0); + maxG = maxG0; gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE1, &maxG1); gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE2, &maxG2); From 6a9497535f84bce26528558bd9438e875623dc66 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 2 Aug 2017 17:00:17 -0400 Subject: [PATCH 425/597] Remove BLAS_OPS and COMM_OPS --- pygpu/gpuarray.pyx | 2 +- pygpu/reduction.py | 4 ++-- src/gpuarray/buffer.h | 18 ++---------------- src/gpuarray_buffer.c | 5 ----- src/gpuarray_buffer_cuda.c | 32 +++++++++++++++---------------- src/gpuarray_buffer_opencl.c | 37 ++++++++++++++---------------------- 6 files changed, 35 insertions(+), 63 deletions(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index e1dad34a90..7da0466d1f 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -2307,7 +2307,7 @@ cdef class GpuKernel: k(param1, param2, gs=gs, ls=ls) If you choose to use this interface, make sure to stay within the - limits of `k.maxlsize` and `ctx.maxgsize` or the call will fail. + limits of `k.maxlsize` or the call will fail. 
Parameters ---------- diff --git a/pygpu/reduction.py b/pygpu/reduction.py index 7edd5417c9..2c16508ab6 100644 --- a/pygpu/reduction.py +++ b/pygpu/reduction.py @@ -187,7 +187,7 @@ def __init__(self, context, dtype_out, neutral, reduce_expr, redux, self.init_local_size = min(context.lmemsize // self.out_arg.dtype.itemsize, - context.maxlsize) + context.maxlsize0) # this is to prep the cache if init_nd is not None: @@ -253,7 +253,7 @@ def __call__(self, *args, **kwargs): if gs == 0: gs = 1 n /= gs - if gs > self.context.maxgsize: + if gs > self.context.maxgsize0: raise ValueError("Array too big to be reduced along the " "selected axes") diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index b23cb2a9f3..db1b09b0d2 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -572,16 +572,7 @@ GPUARRAY_PUBLIC gpucontext *gpukernel_context(gpukernel *k); /* UNUSED: 5 */ -/** - * Get the vector of blas ops for the context. - * - * This may differ from one context to the other in the same backend - * depending of the availability and performance of various BLAS - * libraries. - * - * Type: `const gpuarray_blas_ops *` - */ -#define GA_CTX_PROP_BLAS_OPS 6 +/* UNUSED: 6 */ /** * Get the compatibility ID for the binaries generated with this context. @@ -669,12 +660,7 @@ GPUARRAY_PUBLIC gpucontext *gpukernel_context(gpukernel *k); */ #define GA_CTX_PROP_MAXLSIZE2 17 -/** - * Get the vector of collective ops for the context. - * - * Type: `const gpuarray_comm_ops *` - */ -#define GA_CTX_PROP_COMM_OPS 18 +/* UNUSED: 18 */ /** * Get the device PCI Bus ID for the context. 
diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index 42531de380..5dc74c7ad6 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -44,11 +44,6 @@ gpucontext *gpucontext_init(const char *name, int dev, int flags, int *ret) { res = ops->buffer_init(dev, flags); if (res == NULL) FAIL(NULL, global_err); res->ops = ops; - if (gpucontext_property(res, GA_CTX_PROP_BLAS_OPS, (void *)&res->blas_ops) != GA_NO_ERROR) - res->blas_ops = NULL; - res->blas_handle = NULL; - if (gpucontext_property(res, GA_CTX_PROP_COMM_OPS, (void *)&res->comm_ops) != GA_NO_ERROR) - res->comm_ops = NULL; res->extcopy_cache = NULL; return res; } diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 9f617e73d2..a8cce9039a 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -37,6 +37,9 @@ STATIC_ASSERT(sizeof(GpuArrayIpcMemHandle) == sizeof(CUipcMemHandle), cuda_ipcme */ #define FRAG_SIZE (64) +extern gpuarray_blas_ops cublas_ops; +extern gpuarray_comm_ops nccl_ops; + const gpuarray_buffer_ops cuda_ops; static void cuda_freekernel(gpukernel *); @@ -334,7 +337,6 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { static void deallocate(gpudata *); static void cuda_free_ctx(cuda_context *ctx) { - gpuarray_blas_ops *blas_ops; gpudata *next, *curr; CUdevice dev; @@ -343,9 +345,7 @@ static void cuda_free_ctx(cuda_context *ctx) { if (ctx->refcnt == 0) { assert(ctx->enter == 0 && "Context was active when freed!"); if (ctx->blas_handle != NULL) { - cuda_property((gpucontext *)ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS, - &blas_ops); - blas_ops->teardown((gpucontext *)ctx); + ctx->blas_ops->teardown((gpucontext *)ctx); } cuMemFreeHost((void *)ctx->errbuf->ptr); deallocate(ctx->errbuf); @@ -556,6 +556,17 @@ static cuda_context *do_init(CUdevice dev, int flags, error *e) { error_set(e, global_err->code, global_err->msg); return NULL; } + + res->blas_handle = NULL; + /* If we can't load cublas, then we have no blas */ + if 
(!load_libcublas(major, minor, res->err)) { + res->blas_ops = &cublas_ops; + } else { + res->blas_ops = NULL; + } + + res->comm_ops = &nccl_ops; + /* Don't leave the context on the thread stack */ cuCtxPopCurrent(NULL); @@ -592,6 +603,7 @@ static gpucontext *cuda_init(int ord, int flags) { return (gpucontext *)do_init(dev, flags, global_err); } } + static void cuda_deinit(gpucontext *c) { cuda_free_ctx((cuda_context *)c); } @@ -1614,9 +1626,6 @@ static int cuda_transfer(gpudata *dst, size_t dstoff, return GA_NO_ERROR; } -extern gpuarray_blas_ops cublas_ops; -extern gpuarray_comm_ops nccl_ops; - static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, void *res) { cuda_context *ctx = NULL; @@ -1684,15 +1693,6 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, GETPROP(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, unsigned int); return GA_NO_ERROR; - case GA_CTX_PROP_BLAS_OPS: - GA_CHECK(load_libcublas(major, minor, ctx->err)); - *((gpuarray_blas_ops **)res) = &cublas_ops; - return GA_NO_ERROR; - - case GA_CTX_PROP_COMM_OPS: - *((gpuarray_comm_ops**)res) = &nccl_ops; - return GA_NO_ERROR; - case GA_CTX_PROP_BIN_ID: *((const char **)res) = ctx->bin_id; return GA_NO_ERROR; diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 4a5c14e81e..ee85aa2fed 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -18,6 +18,9 @@ #define _unused(x) ((void)x) #define SSIZE_MIN (-(SSIZE_MAX-1)) +extern gpuarray_blas_ops clblas_ops; +extern gpuarray_blas_ops clblast_ops; + const gpuarray_buffer_ops opencl_ops; static int cl_property(gpucontext *c, gpudata *b, gpukernel *k, int p, void *r); @@ -195,6 +198,17 @@ cl_ctx *cl_make_ctx(cl_context ctx, int flags) { if (res->preamble == NULL) goto fail; + res->blas_handle = NULL; + if (load_libclblas(res->err) == GA_NO_ERROR) { + res->blas_ops = &clblas_ops; + } else if (load_libclblast(res->err) == GA_NO_ERROR) { + res->blas_ops = 
&clblast_ops; + } else { + res->blas_ops = NULL; + } + + res->comm_ops = NULL; + return res; fail: @@ -1137,9 +1151,6 @@ static int cl_transfer(gpudata *dst, size_t dstoff, return error_set(dst->ctx->err, GA_UNSUPPORTED_ERROR, "Operation not supported"); } -extern gpuarray_blas_ops clblas_ops; -extern gpuarray_blas_ops clblast_ops; - static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, void *res) { cl_ctx *ctx = NULL; @@ -1198,26 +1209,6 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, *((unsigned int *)res) = ui; return GA_NO_ERROR; - case GA_CTX_PROP_BLAS_OPS: - { - int e; - if ((e = load_libclblas(ctx->err)) == GA_NO_ERROR) { - *((gpuarray_blas_ops **)res) = &clblas_ops; - return e; - } - if ((e = load_libclblast(ctx->err)) == GA_NO_ERROR) { - *((gpuarray_blas_ops **)res) = &clblast_ops; - return e; - } - return e; - } - - case GA_CTX_PROP_COMM_OPS: - // TODO Complete in the future whenif a multi-gpu collectives API for - // opencl appears - *((void **)res) = NULL; - return error_set(ctx->err, GA_DEVSUP_ERROR, "Collectives operations not supported on OpenCL"); - case GA_CTX_PROP_BIN_ID: *((const char **)res) = ctx->bin_id; return GA_NO_ERROR; From 7c61742a53847c7d2b8f930232b10fd3ee928adb Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 2 Aug 2017 17:09:55 -0400 Subject: [PATCH 426/597] Rename PCIBUSID to UNIQUE_ID --- src/gpuarray/buffer.h | 4 ++-- src/gpuarray_buffer_cuda.c | 2 +- src/gpuarray_buffer_opencl.c | 5 ++--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index db1b09b0d2..dd1d131d2d 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -663,11 +663,11 @@ GPUARRAY_PUBLIC gpucontext *gpukernel_context(gpukernel *k); /* UNUSED: 18 */ /** - * Get the device PCI Bus ID for the context. + * Get a unique ID for the device behind the context. 
* * Type: `char [16]` */ -#define GA_CTX_PROP_PCIBUSID 19 +#define GA_CTX_PROP_UNIQUE_ID 19 /** * Get the largest single block of memory that can be allocted. diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index a8cce9039a..4e3cf42ce3 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1674,7 +1674,7 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, cuda_exit(ctx); return GA_NO_ERROR; - case GA_CTX_PROP_PCIBUSID: + case GA_CTX_PROP_UNIQUE_ID: cuda_enter(ctx); CUDA_EXIT_ON_ERROR(ctx, cuCtxGetDevice(&id)); CUDA_EXIT_ON_ERROR(ctx, cuDeviceGetPCIBusId((char *)res, 13, id)); diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index ee85aa2fed..d41822aaf5 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -1189,9 +1189,8 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, NULL)); return GA_NO_ERROR; - case GA_CTX_PROP_PCIBUSID: - /* For the moment, PCI Bus ID is not supported for OpenCL. */ - return error_set(ctx->err, GA_DEVSUP_ERROR, "Can't get PCI bus ID on OpenCL"); + case GA_CTX_PROP_UNIQUE_ID: + return error_set(ctx->err, GA_DEVSUP_ERROR, "Can't get unique ID on OpenCL"); case GA_CTX_PROP_LMEMSIZE: CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, From a3be798988dcc13c29e621235341b0c0589d89a6 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 2 Aug 2017 17:16:46 -0400 Subject: [PATCH 427/597] Prevent the creation of GpuArrays with GA_SIZE or GA_SSIZE as the type. 
--- src/gpuarray_array.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index 4b4a76e9a8..6cf7daee58 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -103,6 +103,9 @@ int GpuArray_empty(GpuArray *a, gpucontext *ctx, int typecode, unsigned int i; int res = GA_NO_ERROR; + if (typecode == GA_SIZE || typecode == GA_SSIZE) + return error_set(ctx->err, GA_VALUE_ERROR, "Cannot create array with size type"); + if (ord == GA_ANY_ORDER) ord = GA_C_ORDER; @@ -189,8 +192,10 @@ int GpuArray_fromdata(GpuArray *a, gpudata *data, size_t offset, int typecode, unsigned int nd, const size_t *dims, const ssize_t *strides, int writeable) { gpucontext *ctx = gpudata_context(data); - if (gpuarray_get_type(typecode)->typecode != typecode) - return error_set(ctx->err, GA_VALUE_ERROR, "typecode mismatch"); + + if (typecode == GA_SIZE || typecode == GA_SSIZE) + return error_set(ctx->err, GA_VALUE_ERROR, "Cannot create array with size type"); + assert(data != NULL); a->data = data; gpudata_retain(a->data); @@ -222,6 +227,9 @@ int GpuArray_copy_from_host(GpuArray *a, gpucontext *ctx, void *buf, int err; unsigned int i; + if (typecode == GA_SIZE || typecode == GA_SSIZE) + return error_set(ctx->err, GA_VALUE_ERROR, "Cannot create array with size type"); + for (i = 0; i < nd; i++) { if (dims[i] == 0) { size = 0; From c445656009680c7fb906e535edf74d786e07a358 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 2 Aug 2017 17:27:19 -0400 Subject: [PATCH 428/597] Catch up in pygpu. 
--- pygpu/gpuarray.pxd | 4 +--- pygpu/gpuarray.pyx | 18 ++---------------- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd index f65e31b1f2..0b885671c7 100644 --- a/pygpu/gpuarray.pxd +++ b/pygpu/gpuarray.pxd @@ -90,11 +90,9 @@ cdef extern from "gpuarray/buffer.h": int GA_CTX_DISABLE_ALLOCATION_CACHE int GA_CTX_PROP_DEVNAME - int GA_CTX_PROP_PCIBUSID - int GA_CTX_PROP_MAXLSIZE + int GA_CTX_PROP_UNIQUE_ID int GA_CTX_PROP_LMEMSIZE int GA_CTX_PROP_NUMPROCS - int GA_CTX_PROP_MAXGSIZE int GA_CTX_PROP_BIN_ID int GA_CTX_PROP_TOTAL_GMEM int GA_CTX_PROP_FREE_GMEM diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 7da0466d1f..109425f227 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -1091,21 +1091,14 @@ cdef class GpuContext: ctx_property(self, GA_CTX_PROP_DEVNAME, tmp) return tmp.decode('ascii') - property pcibusid: + property unique_id: "Device PCI Bus ID for this context" def __get__(self): cdef char tmp[16] - ctx_property(self, GA_CTX_PROP_PCIBUSID, tmp) + ctx_property(self, GA_CTX_PROP_UNIQUE_ID, tmp) return tmp.decode('ascii') - property maxlsize: - "Maximum size of thread block (local size) for this context" - def __get__(self): - cdef size_t res - ctx_property(self, GA_CTX_PROP_MAXLSIZE, &res) - return res - property lmemsize: "Size of the local (shared) memory, in bytes, for this context" def __get__(self): @@ -1120,13 +1113,6 @@ cdef class GpuContext: ctx_property(self, GA_CTX_PROP_NUMPROCS, &res) return res - property maxgsize: - "Maximum group size for kernel calls" - def __get__(self): - cdef size_t res - ctx_property(self, GA_CTX_PROP_MAXGSIZE, &res) - return res - property bin_id: "Binary compatibility id" def __get__(self): From 039eee28d69c40b63a00e269aac8ced6ffd4ac55 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 2 Aug 2017 17:46:25 -0400 Subject: [PATCH 429/597] Reduce the warning level since -Wextra has too many false positives. 
--- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d721e46f85..8b887fb844 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/CMakeModules/") if(MSVC) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W3") else() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -Wno-unused-parameter -Werror=format-security -Wdeclaration-after-statement -std=gnu89") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-unused-parameter -Werror=format-security -Wdeclaration-after-statement -std=gnu89") endif() enable_testing() From 3cc15688be905d6f4bbbf1767b3b696068c2b963 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 2 Aug 2017 17:48:58 -0400 Subject: [PATCH 430/597] Swap order of include directories so that local headers go first. --- tests/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ada91c409a..97bf15a307 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -48,6 +48,9 @@ endif() if(CHECK_FOUND) enable_testing() +include_directories("${CMAKE_SOURCE_DIR}/src") +include_directories("${CMAKE_CURRENT_SOURCE_DIR}") + include_directories(${CHECK_INCLUDE_DIRS}) link_directories(${CHECK_LIBRARY_DIRS}) @@ -59,9 +62,6 @@ foreach(flag ${CHECK_LDFLAGS_OTHER}) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${flag}") endforeach() -include_directories("${CMAKE_SOURCE_DIR}/src") -include_directories("${CMAKE_CURRENT_SOURCE_DIR}") - add_executable(check_types main.c check_types.c) target_link_libraries(check_types ${CHECK_LIBRARIES} gpuarray) add_test(test_types "${CMAKE_CURRENT_BINARY_DIR}/check_types") From 14ba413de7e1bc542c91dc944bf5a709d44dcacd Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 3 Aug 2017 14:42:36 -0400 Subject: [PATCH 431/597] Add a better way to pass parameters at context creation time. 
--- src/gpuarray/buffer.h | 88 ++++++++++++------------------------ src/gpuarray_buffer.c | 85 ++++++++++++++++++++++++++++++---- src/gpuarray_buffer_cuda.c | 62 ++++++++++++++++--------- src/gpuarray_buffer_opencl.c | 13 +++--- src/private.h | 15 +++++- src/private_cuda.h | 4 +- src/private_opencl.h | 2 +- 7 files changed, 171 insertions(+), 98 deletions(-) diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index dd1d131d2d..82523dab28 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -71,77 +71,47 @@ GPUARRAY_PUBLIC int gpu_get_device_count(const char* name, unsigned int platform, unsigned int* devcount); +typedef struct _gpucontext_props gpucontext_props; +GPUARRAY_PUBLIC int gpucontext_props_new(gpucontext_props **res); -/** - * Create a context on the specified device. - * - * \warning This function is not thread-safe. - * - * \param name the backend name. - * \param dev the device number. The precise meaning of the device - * number is backend-dependent - * \param flags see \ref context_flags "Context flags" - * \param ret error return location. Will be ignored if set to NULL. - * - * \returns An opaque pointer to the created context or NULL if an - * error occured. - */ -GPUARRAY_PUBLIC gpucontext *gpucontext_init(const char *name, int dev, - int flags, int *ret); +GPUARRAY_PUBLIC int gpucontext_props_cuda_dev(gpucontext_props *p, int devno); -/** - * \defgroup context_flags Context flags - * @{ - */ +GPUARRAY_PUBLIC int gpucontext_props_opencl_dev(gpucontext_props *p, + int platno, int devno); -/** - * Let the backend decide on optimal parameters, using backend-defined - * heuristics and defaults. - * - * This is the default (0) value. - */ -#define GA_CTX_DEFAULT 0x00 +#define GA_CTX_SCHED_AUTO 0 +#define GA_CTX_SCHED_SINGLE 1 +#define GA_CTX_SCHED_MULTI 2 +GPUARRAY_PUBLIC int gpucontext_props_sched(gpucontext_props *p, int sched); -/** - * Optimize parameters for multi-thread performance. 
- * - * May decrease overall performance in single-thread scenarios. - */ -#define GA_CTX_MULTI_THREAD 0x01 +GPUARRAY_PUBLIC int gpucontext_props_set_single_stream(gpucontext_props *p); -/** - * Optimize parameters for single-thread performance. - * - * May decrease overall performace in multithread scenarios. - */ -#define GA_CTX_SINGLE_THREAD 0x02 +GPUARRAY_PUBLIC int gpucontext_props_kernel_cache(gpucontext_props *p, + const char *path); + +GPUARRAY_PUBLIC int gpucontext_props_alloc_cache(gpucontext_props *p, + size_t initial, size_t max); + +GPUARRAY_PUBLIC void gpucontext_props_del(gpucontext_props *p); + +/* TODO: add new props */ /** - * Allocate a single stream per context, performing all operations in order. + * Create a context on the specified device. * - * This will remove any attempt at exploiting parallelism in the - * underlying device by performing unrelated operations concurrently - * and/or out of order. + * \warning This function is not thread-safe. * - * This can help performance by removing the small cost paid for each - * operation to keep everything coherent in the face of parallelism. - * It can also hinder performance by not exploiting concurrency. - */ -#define GA_CTX_SINGLE_STREAM 0x4 - -/** - * Disable allocations cache (if any). + * \param res a pointer to a location that will be allocated + * \param name the backend name. + * \param dev the device number. The precise meaning of the device + * number is backend-dependent + * \param props a properties object for the context. Can be NULL for defaults. * - * This will usually decrease performance by quite a bit, but will - * enable better debugging of kernels that perform out of bounds - * access. - */ -#define GA_CTX_DISABLE_ALLOCATION_CACHE 0x10 - -/** - * @} + * \returns GA_NO_ERROR or an error code if an error occurred. */ +GPUARRAY_PUBLIC int gpucontext_init(gpucontext **res, const char *name, + gpucontext_props *props); /** * Dereference a context. 
diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index 5dc74c7ad6..84ea4b59a9 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -37,15 +37,84 @@ int gpu_get_device_count(const char* name, unsigned int platform, return ops->get_device_count(platform, devcount); } -gpucontext *gpucontext_init(const char *name, int dev, int flags, int *ret) { - gpucontext *res; +int gpucontext_props_new(gpucontext_props **res) { + gpucontext_props *r = calloc(1, sizeof(gpucontext_props)); + if (r == NULL) return error_sys(global_err, "calloc"); + r->dev = -1; + r->sched = GA_CTX_SCHED_AUTO; + r->flags = 0; + r->kernel_cache_path = NULL; + r->initial_cache_size = 0; + r->max_cache_size = (size_t)-1; + *res = r; + return GA_NO_ERROR; +} + +int gpucontext_props_cuda_dev(gpucontext_props *p, int devno) { + p->dev = devno; + return GA_NO_ERROR; +} + +int gpucontext_props_opencl_dev(gpucontext_props *p, int platno, int devno) { + p->dev = platno << 16 || devno; + return GA_NO_ERROR; +} + +int gpucontext_props_sched(gpucontext_props *p, int sched) { + if (sched == GA_CTX_SCHED_MULTI) + FLSET(p->flags, GA_CTX_MULTI_THREAD); + else + FLCLR(p->flags, GA_CTX_MULTI_THREAD); + + switch (sched) { + case GA_CTX_SCHED_MULTI: + case GA_CTX_SCHED_AUTO: + case GA_CTX_SCHED_SINGLE: + p->sched = sched; + return GA_NO_ERROR; + default: + return error_fmt(global_err, GA_INVALID_ERROR, "Invalid value for sched: %d", sched); + } +} + +int gpucontext_props_set_single_treams(gpucontext_props *p) { + p->flags |= GA_CTX_SINGLE_STREAM; + return GA_NO_ERROR; +} + +int gpucontext_props_kernel_cache(gpucontext_props *p, const char *path) { + p->kernel_cache_path = path; + return GA_NO_ERROR; +} + +int gpucontext_props_alloc_cache(gpucontext_props *p, size_t initial, size_t max) { + if (initial > max) + return error_set(global_err, GA_VALUE_ERROR, "Initial size can't be bigger than max size"); + p->initial_cache_size = initial; + p->max_cache_size = max; + return GA_NO_ERROR; +} + +void 
gpucontext_props_del(gpucontext_props *p) { + free(p); +} + +int gpucontext_init(gpucontext **res, const char *name, gpucontext_props *p) { const gpuarray_buffer_ops *ops = gpuarray_get_ops(name); - if (ops == NULL) FAIL(NULL, global_err); - res = ops->buffer_init(dev, flags); - if (res == NULL) FAIL(NULL, global_err); - res->ops = ops; - res->extcopy_cache = NULL; - return res; + gpucontext *r; + if (ops == NULL) { + gpucontext_props_del(p); + return global_err->code; + } + if (p == NULL && gpucontext_props_new(&p) != GA_NO_ERROR) + return global_err->code; + r = ops->buffer_init(p); + gpucontext_props_del(p); + if (r == NULL) return global_err->code; + r->ops = ops; + r->extcopy_cache = NULL; + *res = r; + return GA_NO_ERROR; } void gpucontext_deref(gpucontext *ctx) { diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 4e3cf42ce3..f894e72c3e 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -46,6 +46,7 @@ static void cuda_freekernel(gpukernel *); static int cuda_property(gpucontext *, gpudata *, gpukernel *, int, void *); static int cuda_waits(gpudata *, int, CUstream); static int cuda_records(gpudata *, int, CUstream); +static gpudata *cuda_alloc(gpucontext *c, size_t size, void *data, int flags); static int detect_arch(const char *prefix, char *ret, error *e); static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size); @@ -209,11 +210,11 @@ static int cuda_get_device_count(unsigned int platform, return GA_NO_ERROR; } -cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { +cuda_context *cuda_make_ctx(CUcontext ctx, gpucontext_props *p) { cuda_context *res; cache *mem_cache; - char *cache_path; - void *p; + const char *cache_path; + void *pp; CUresult err; int e; @@ -229,7 +230,8 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { res->ctx = ctx; res->ops = &cuda_ops; res->refcnt = 1; - res->flags = flags; + res->flags = p->flags; + res->max_cache_size = p->max_cache_size; res->enter = 0; 
res->major = major; res->minor = minor; @@ -270,7 +272,9 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { goto fail_cache; } - cache_path = getenv("GPUARRAY_CACHE_PATH"); + cache_path = p->kernel_cache_path; + if (cache_path == NULL) + cache_path = getenv("GPUARRAY_CACHE_PATH"); if (cache_path != NULL) { mem_cache = cache_lru(64, 8, (cache_eq_fn)disk_eq, @@ -300,24 +304,26 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { res->disk_cache = NULL; } - err = cuMemAllocHost(&p, 16); + err = cuMemAllocHost(&pp, 16); if (err != CUDA_SUCCESS) { error_cuda(global_err, "cuMemAllocHost", err); goto fail_errbuf; } - memset(p, 0, 16); + memset(pp, 0, 16); /* Need to tag for new_gpudata */ TAG_CTX(res); - res->errbuf = new_gpudata(res, (CUdeviceptr)p, 16); + res->errbuf = new_gpudata(res, (CUdeviceptr)pp, 16); if (res->errbuf == NULL) { /* Copy the error from the context since we are getting rid of it */ error_set(global_err, res->err->code, res->err->msg); goto fail_end; } res->errbuf->flags |= CUDA_MAPPED_PTR; + /* Prime the cache */ + cuda_alloc((gpucontext *)res, p->initial_cache_size, NULL, 0); return res; fail_end: - cuMemFreeHost(p); + cuMemFreeHost(pp); fail_errbuf: if (res->disk_cache) cache_destroy(res->disk_cache); @@ -515,19 +521,26 @@ static const char CUDA_PREAMBLE[] = /* XXX: add complex, quads, longlong */ /* XXX: add vector types */ -static cuda_context *do_init(CUdevice dev, int flags, error *e) { +static cuda_context *do_init(CUdevice dev, gpucontext_props *p, error *e) { cuda_context *res; CUcontext ctx; CUresult err; - unsigned int fl = CU_CTX_SCHED_AUTO; + unsigned int fl = 0; unsigned int cur_fl; int act; int i; - if (flags & GA_CTX_SINGLE_THREAD) + switch (p->sched) { + case GA_CTX_SCHED_AUTO: + fl = CU_CTX_SCHED_AUTO; + break; + case GA_CTX_SCHED_SINGLE: fl = CU_CTX_SCHED_SPIN; - if (flags & GA_CTX_MULTI_THREAD) + break; + case GA_CTX_SCHED_MULTI: fl = CU_CTX_SCHED_BLOCKING_SYNC; + break; + } err = cuDeviceGetAttribute(&i, 
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev); CHKFAIL(e, "cuDeviceGetAttribute", NULL); if (i != 1) { @@ -549,7 +562,7 @@ static cuda_context *do_init(CUdevice dev, int flags, error *e) { CHKFAIL(e, "cuDevicePrimaryCtxRetain", NULL); err = cuCtxPushCurrent(ctx); CHKFAIL(e, "cuCtxPushCurrent", NULL); - res = cuda_make_ctx(ctx, flags); + res = cuda_make_ctx(ctx, p); if (res == NULL) { cuDevicePrimaryCtxRelease(dev); if (e != global_err) @@ -573,7 +586,7 @@ static cuda_context *do_init(CUdevice dev, int flags, error *e) { return res; } -static gpucontext *cuda_init(int ord, int flags) { +static gpucontext *cuda_init(gpucontext_props *p) { CUdevice dev; cuda_context *res; CUresult err; @@ -584,23 +597,23 @@ static gpucontext *cuda_init(int ord, int flags) { return NULL; } - if (ord == -1) { + if (p->dev == -1) { int i, c; err = cuDeviceGetCount(&c); CHKFAIL(global_err, "cuDeviceGetCount", NULL); for (i = 0; i < c; i++) { err = cuDeviceGet(&dev, i); CHKFAIL(global_err, "cuDeviceGet", NULL); - res = do_init(dev, flags, global_err); + res = do_init(dev, p, global_err); if (res != NULL) return (gpucontext *)res; } error_set(global_err, GA_NODEV_ERROR, "No cuda device available"); return NULL; } else { - err = cuDeviceGet(&dev, ord); + err = cuDeviceGet(&dev, p->dev); CHKFAIL(global_err, "cuDeviceGet", NULL); - return (gpucontext *)do_init(dev, flags, global_err); + return (gpucontext *)do_init(dev, p, global_err); } } @@ -654,8 +667,11 @@ static int allocate(cuda_context *ctx, gpudata **res, gpudata **prev, *prev = NULL; - if (!(ctx->flags & GA_CTX_DISABLE_ALLOCATION_CACHE)) + if (ctx->max_cache_size != 0) { if (size < BLOCK_SIZE) size = BLOCK_SIZE; + if (ctx->cache_size + size > ctx->max_cache_size) + return error_set(ctx->err, GA_VALUE_ERROR, "Maximum cache size reached"); + } cuda_enter(ctx); @@ -674,6 +690,8 @@ static int allocate(cuda_context *ctx, gpudata **res, gpudata **prev, return ctx->err->code; } + ctx->cache_size += size; + (*res)->flags |= CUDA_HEAD_ALLOC; /* 
Now that the block is allocated, enter it in the freelist */ @@ -758,7 +776,7 @@ static gpudata *cuda_alloc(gpucontext *c, size_t size, void *data, int flags) { * to a multiple of FRAG_SIZE. This also ensures that if we split a * block, the next block starts properly aligned for any data type. */ - if (!(ctx->flags & GA_CTX_DISABLE_ALLOCATION_CACHE)) { + if (ctx->max_cache_size != 0) { asize = roundup(size, FRAG_SIZE); find_best(ctx, &res, &prev, asize); } else { @@ -843,7 +861,7 @@ static void cuda_free(gpudata *d) { } else if (d->flags & CUDA_IPC_MEMORY) { cuIpcCloseMemHandle(d->ptr); deallocate(d); - } else if (ctx->flags & GA_CTX_DISABLE_ALLOCATION_CACHE) { + } else if (ctx->max_cache_size == 0) { /* Just free the pointer */ cuMemFree(d->ptr); deallocate(d); diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index d41822aaf5..f3fd3cd527 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -105,7 +105,7 @@ static cl_device_id get_dev(cl_context ctx, error *e) { return res; } -cl_ctx *cl_make_ctx(cl_context ctx, int flags) { +cl_ctx *cl_make_ctx(cl_context ctx, gpucontext_props *p) { cl_ctx *res; cl_device_id id; cl_command_queue_properties qprop; @@ -159,7 +159,7 @@ cl_ctx *cl_make_ctx(cl_context ctx, int flags) { res->preamble = NULL; res->q = clCreateCommandQueue( ctx, id, - ISSET(flags, GA_CTX_SINGLE_STREAM) ? 0 : qprop&CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, + ISSET(p->flags, GA_CTX_SINGLE_STREAM) ? 
0 : qprop&CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err); if (res->q == NULL) { error_cl(global_err, "clCreateCommandQueue", err); @@ -415,7 +415,7 @@ errcb(const char *errinfo, const void *pi, size_t cb, void *u) { fprintf(stderr, "%s\n", errinfo); } -static gpucontext *cl_init(int devno, int flags) { +static gpucontext *cl_init(gpucontext_props *pp) { cl_device_id *ds; cl_device_id d; cl_platform_id *ps; @@ -429,10 +429,11 @@ static gpucontext *cl_init(int devno, int flags) { cl_ctx *res; cl_int err; int platno; + int devno; int e; - platno = devno >> 16; - devno &= 0xFFFF; + platno = pp->dev >> 16; + devno = pp->dev & 0xFFFF; e = setup_lib(global_err); if (e != GA_NO_ERROR) @@ -487,7 +488,7 @@ static gpucontext *cl_init(int devno, int flags) { return NULL; } - res = cl_make_ctx(ctx, flags); + res = cl_make_ctx(ctx, pp); clReleaseContext(ctx); return (gpucontext *)res; } diff --git a/src/private.h b/src/private.h index 6cf0506baa..062af008f9 100644 --- a/src/private.h +++ b/src/private.h @@ -53,6 +53,19 @@ typedef struct _gpuarray_comm_ops gpuarray_comm_ops; char bin_id[64]; \ char tag[8] +/* These will go away eventually but are kept to ease the transition for now */ +#define GA_CTX_SINGLE_STREAM 0x01 +#define GA_CTX_MULTI_THREAD 0x02 + +struct _gpucontext_props { + int dev; + int sched; + int flags; + const char *kernel_cache_path; + size_t max_cache_size; + size_t initial_cache_size; +}; + struct _gpucontext { GPUCONTEXT_HEAD; void *ctx_ptr; @@ -77,7 +90,7 @@ typedef struct _partial_gpucomm { struct _gpuarray_buffer_ops { int (*get_platform_count)(unsigned int* platcount); int (*get_device_count)(unsigned int platform, unsigned int* devcount); - gpucontext *(*buffer_init)(int dev, int flags); + gpucontext *(*buffer_init)(gpucontext_props *props); void (*buffer_deinit)(gpucontext *ctx); gpudata *(*buffer_alloc)(gpucontext *ctx, size_t sz, void *data, int flags); void (*buffer_retain)(gpudata *b); diff --git a/src/private_cuda.h b/src/private_cuda.h index 
8903f8a5a8..f60b961d37 100644 --- a/src/private_cuda.h +++ b/src/private_cuda.h @@ -73,6 +73,8 @@ typedef struct _cuda_context { CUstream s; CUstream mem_s; gpudata *freeblocks; + size_t cache_size; + size_t max_cache_size; cache *kernel_cache; cache *disk_cache; // This is per-context to avoid lock contention unsigned int enter; @@ -102,7 +104,7 @@ STATIC_ASSERT(sizeof(cuda_context) <= sizeof(gpucontext), #define ARCH_PREFIX "compute_" -cuda_context *cuda_make_ctx(CUcontext ctx, int flags); +cuda_context *cuda_make_ctx(CUcontext ctx, gpucontext_props *p); CUstream cuda_get_stream(cuda_context *ctx); void cuda_enter(cuda_context *ctx); void cuda_exit(cuda_context *ctx); diff --git a/src/private_opencl.h b/src/private_opencl.h index b3aed92a25..cc3fac7566 100644 --- a/src/private_opencl.h +++ b/src/private_opencl.h @@ -104,7 +104,7 @@ struct _gpukernel { #endif }; -cl_ctx *cl_make_ctx(cl_context ctx, int flags); +cl_ctx *cl_make_ctx(cl_context ctx, gpucontext_props *p); cl_command_queue cl_get_stream(gpucontext *ctx); gpudata *cl_make_buf(gpucontext *c, cl_mem buf); cl_mem cl_get_buf(gpudata *g); From 6499147dc46594101ac6f039525fabb0bdb48070 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 3 Aug 2017 14:43:08 -0400 Subject: [PATCH 432/597] Adapt the tests to deal with changes. 
--- tests/device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/device.c b/tests/device.c index 6c8382fe36..802bd0bcd6 100644 --- a/tests/device.c +++ b/tests/device.c @@ -5,6 +5,7 @@ #include <check.h> #include "gpuarray/buffer.h" +#include "gpuarray/error.h" char* dev_name = NULL; @@ -59,7 +60,7 @@ void setup(void) { int dev = get_env_dev(&name); if (dev == -1) ck_abort_msg("Bad test device"); - ctx = gpucontext_init(name, dev, 0, NULL); + ck_assert_int_eq(gpucontext_init(&ctx, name, NULL), GA_NO_ERROR); ck_assert_ptr_ne(ctx, NULL); } From 71e0dcc80b61d15f6a4ce754f5f23f337adb229d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 3 Aug 2017 15:11:18 -0400 Subject: [PATCH 433/597] Adapt pygpu to the changes. --- pygpu/gpuarray.pxd | 24 +++++++++++++------ pygpu/gpuarray.pyx | 60 +++++++++++++++++++++++++++------------------- 2 files changed, 53 insertions(+), 31 deletions(-) diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd index 0b885671c7..76f2e6f933 100644 --- a/pygpu/gpuarray.pxd +++ b/pygpu/gpuarray.pxd @@ -65,6 +65,8 @@ cdef extern from "gpuarray/error.h": GA_UNALIGNED_ERROR, GA_COPY_ERROR, GA_COMM_ERROR cdef extern from "gpuarray/buffer.h": + ctypedef struct gpucontext_props: + pass ctypedef struct gpucontext: pass ctypedef struct gpudata: @@ -74,7 +76,17 @@ cdef extern from "gpuarray/buffer.h": int gpu_get_platform_count(const char* name, unsigned int* platcount) int gpu_get_device_count(const char* name, unsigned int platform, unsigned int* devcount) - gpucontext *gpucontext_init(const char *name, int devno, int flags, int *ret) + + int gpucontext_props_new(gpucontext_props **res) + int gpucontext_props_cuda_dev(gpucontext_props *p, int devno) + int gpucontext_props_opencl_dev(gpucontext_props *p, int platno, int devno) + int gpucontext_props_sched(gpucontext_props *p, int sched) + int gpucontext_props_set_single_stream(gpucontext_props *p) + int gpucontext_props_kernel_cache(gpucontext_props *p, const char *path) + int
gpucontext_props_alloc_cache(gpucontext_props *p, size_t initial, size_t max) + void gpucontext_props_del(gpucontext_props *p) + + int gpucontext_init(gpucontext **res, const char *name, gpucontext_props *p) void gpucontext_deref(gpucontext *ctx) char *gpucontext_error(gpucontext *ctx, int err) int gpudata_property(gpudata *ctx, int prop_id, void *res) @@ -83,11 +95,9 @@ cdef extern from "gpuarray/buffer.h": gpucontext *gpudata_context(gpudata *) gpucontext *gpukernel_context(gpukernel *) - int GA_CTX_DEFAULT - int GA_CTX_MULTI_THREAD - int GA_CTX_SINGLE_THREAD - int GA_CTX_SINGLE_STREAM - int GA_CTX_DISABLE_ALLOCATION_CACHE + int GA_CTX_SCHED_AUTO + int GA_CTX_SCHED_SINGLE + int GA_CTX_SCHED_MULTI int GA_CTX_PROP_DEVNAME int GA_CTX_PROP_UNIQUE_ID @@ -279,7 +289,7 @@ cdef api GpuContext pygpu_default_context() cdef api bint pygpu_GpuArray_Check(object o) -cdef api GpuContext pygpu_init(object dev, int flags) +cdef api GpuContext pygpu_init(object dev, gpucontext_props *p) cdef api GpuArray pygpu_zeros(unsigned int nd, const size_t *dims, int typecode, ga_order order, diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 109425f227..a7d54545a7 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -572,13 +572,17 @@ def count_devices(kind, unsigned int platform): raise get_exc(err), gpucontext_error(NULL, err) return devcount -cdef GpuContext pygpu_init(dev, int flags): +cdef GpuContext pygpu_init(dev, gpucontext_props *p): + cdef int err + cdef GpuContext res + if dev.startswith('cuda'): kind = b"cuda" if dev[4:] == '': devnum = -1 else: devnum = int(dev[4:]) + gpucontext_props_cuda_dev(p, devnum) elif dev.startswith('opencl'): kind = b"opencl" devspec = dev[6:].split(':') @@ -587,10 +591,16 @@ cdef GpuContext pygpu_init(dev, int flags): if not devspec[0].isdigit() or not devspec[1].isdigit(): raise ValueError, "OpenCL name incorrect. 
Should be opencl<int>:<int> instead got: " + dev else: - devnum = int(devspec[0]) << 16 | int(devspec[1]) + gpucontext_props_opencl_dev(p, int(devspec[0]), int(devspec[1])) else: raise ValueError, "Unknown device format:" + dev - return GpuContext(kind, devnum, flags) + + res = GpuContext.__new__(GpuContext) + res.kind = kind + err = gpucontext_init(&res.ctx, res.kind, p) + if err != GA_NO_ERROR: + raise get_exc(err), gpucontext_error(NULL, err) + return res def init(dev, sched='default', disable_alloc_cache=False, single_stream=False): """ @@ -629,18 +639,26 @@ def init(dev, sched='default', disable_alloc_cache=False, single_stream=False): enable single stream mode """ - cdef int flags = 0 - if sched == 'single': - flags |= GA_CTX_SINGLE_THREAD - elif sched == 'multi': - flags |= GA_CTX_MULTI_THREAD - elif sched != 'default': - raise TypeError('unexpected value for parameter sched: %s' % (sched,)) - if disable_alloc_cache: - flags |= GA_CTX_DISABLE_ALLOCATION_CACHE - if single_stream: - flags |= GA_CTX_SINGLE_STREAM - return pygpu_init(dev, flags) + cdef gpucontext_props *p = NULL + cdef int err + err = gpucontext_props_new(&p) + if err != GA_NO_ERROR: + raise MemoryError + try: + if sched == 'single': + gpucontext_props_sched(p, GA_CTX_SCHED_SINGLE) + elif sched == 'multi': + gpucontext_props_sched(p, GA_CTX_SCHED_MULTI) + elif sched != 'default': + raise TypeError('unexpected value for parameter sched: %s' % (sched,)) + if disable_alloc_cache: + gpucontext_props_alloc_cache(p, 0, 0); + if single_stream: + gpucontext_props_set_single_stream(p); + except: + gpucontext_props_del(p) + raise + return pygpu_init(dev, p) def zeros(shape, dtype=GA_DOUBLE, order='C', GpuContext context=None, cls=None): @@ -1026,8 +1044,6 @@ cuda_exit = gpuarray_get_extension("cuda_exit") cdef class GpuContext: """ - GpuContext(kind, devno, flags) - Class that holds all the information pertaining to a context.
The currently implemented modules (for the `kind` parameter) are @@ -1057,13 +1073,9 @@ cdef class GpuContext: def __reduce__(self): raise RuntimeError, "Cannot pickle GpuContext object" - def __cinit__(self, bytes kind, devno, int flags): - cdef int err = GA_NO_ERROR - cdef gpucontext *ctx - self.kind = kind - self.ctx = gpucontext_init(self.kind, devno, flags, &err) - if (err != GA_NO_ERROR): - raise get_exc(err), gpucontext_error(NULL, err) + def __init__(self): + if type(self) is GpuContext: + raise RuntimeError, "Called raw GpuContext.__init__" def __enter__(self): if cuda_enter == NULL: From 748ad285764ce914a4fef3389ed1dd07ccf40e8c Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 3 Aug 2017 16:24:35 -0400 Subject: [PATCH 434/597] Bump all the versions. --- setup.py | 6 +++--- src/CMakeLists.txt | 2 +- src/gpuarray/config.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index c8f4b5e0b0..dc8c1cbec4 100755 --- a/setup.py +++ b/setup.py @@ -4,9 +4,9 @@ have_cython = False MAJOR = 0 -MINOR = 6 -PATCH = 9 -SUFFIX = '' # include the '.' +MINOR = 7 +PATCH = 0 +SUFFIX = '.dev0' # include the '.' FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX) try: diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a45db024ff..d9ddc97ee8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -89,7 +89,7 @@ set_target_properties(gpuarray PROPERTIES INSTALL_NAME_DIR ${CMAKE_INSTALL_PREFIX}/lib MACOSX_RPATH OFF # This is the shared library version - VERSION 2.1 + VERSION 3.0 ) add_library(gpuarray-static STATIC ${GPUARRAY_SRC}) diff --git a/src/gpuarray/config.h b/src/gpuarray/config.h index fb452cc2be..a30639155a 100644 --- a/src/gpuarray/config.h +++ b/src/gpuarray/config.h @@ -3,7 +3,7 @@ /* The following included file should have been generated by CMake. 
*/ #include -#define GPUARRAY_API_VERSION 1 +#define GPUARRAY_API_VERSION 2 #ifdef GPUARRAY_SHARED #ifdef _WIN32 From e27384d083c6207e617f7ac9e80045dbc577ce7b Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 3 Aug 2017 16:56:59 -0400 Subject: [PATCH 435/597] Expose extra options through pygpu.init() --- pygpu/gpuarray.pyx | 13 +++++++++---- src/gpuarray/buffer.h | 2 -- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index a7d54545a7..0339954dc7 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -602,9 +602,11 @@ cdef GpuContext pygpu_init(dev, gpucontext_props *p): raise get_exc(err), gpucontext_error(NULL, err) return res -def init(dev, sched='default', disable_alloc_cache=False, single_stream=False): +def init(dev, sched='default', single_stream=False, kernel_cache_path=None, + max_cache_size=0, initial_cache_size=0): """ - init(dev, sched='default', disable_alloc_cache=False, single_stream=False) + init(dev, sched='default', single_stream=False, kernel_cache_path=None, + max_cache_size=0, initial_cache_size=0) Creates a context from a device specifier. 
@@ -641,6 +643,7 @@ def init(dev, sched='default', disable_alloc_cache=False, single_stream=False): """ cdef gpucontext_props *p = NULL cdef int err + cdef bytes kernel_cache_path_b err = gpucontext_props_new(&p) if err != GA_NO_ERROR: raise MemoryError @@ -651,8 +654,10 @@ def init(dev, sched='default', disable_alloc_cache=False, single_stream=False): gpucontext_props_sched(p, GA_CTX_SCHED_MULTI) elif sched != 'default': raise TypeError('unexpected value for parameter sched: %s' % (sched,)) - if disable_alloc_cache: - gpucontext_props_alloc_cache(p, 0, 0); + if kernel_cache_path: + kernel_cache_path_b = _s(kernel_cache_path) + gpucontext_props_kernel_cache(p, kernel_cache_path_b) + gpucontext_props_alloc_cache(p, initial_cache_size, max_cache_size) if single_stream: gpucontext_props_set_single_stream(p); except: diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index 82523dab28..323be30f65 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -95,8 +95,6 @@ GPUARRAY_PUBLIC int gpucontext_props_alloc_cache(gpucontext_props *p, GPUARRAY_PUBLIC void gpucontext_props_del(gpucontext_props *p); -/* TODO: add new props */ - /** * Create a context on the specified device. * From 62ba404d59f68768adc400ac1d842ca0e4d48e04 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 3 Aug 2017 17:09:45 -0400 Subject: [PATCH 436/597] Fix tyop.
--- src/gpuarray_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index 84ea4b59a9..d237ae97c4 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -77,7 +77,7 @@ int gpucontext_props_sched(gpucontext_props *p, int sched) { } } -int gpucontext_props_set_single_treams(gpucontext_props *p) { +int gpucontext_props_set_single_stream(gpucontext_props *p) { p->flags |= GA_CTX_SINGLE_STREAM; return GA_NO_ERROR; } From c79aae19632f2078bb7245978ef0160c1c29929c Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 7 Aug 2017 18:20:39 -0400 Subject: [PATCH 437/597] Remove support for binary mode when creating kernels. --- pygpu/gpuarray.pxd | 4 +- pygpu/gpuarray.pyx | 26 ++-------- src/gpuarray/buffer.h | 26 ---------- src/gpuarray/kernel.h | 3 -- src/gpuarray_buffer.c | 4 -- src/gpuarray_buffer_cuda.c | 14 ------ src/gpuarray_buffer_opencl.c | 97 ++++++++++-------------------------- src/gpuarray_kernel.c | 4 -- src/private.h | 1 - 9 files changed, 30 insertions(+), 149 deletions(-) diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd index 76f2e6f933..0e855643cd 100644 --- a/pygpu/gpuarray.pxd +++ b/pygpu/gpuarray.pxd @@ -123,7 +123,7 @@ cdef extern from "gpuarray/buffer.h": cdef enum ga_usefl: GA_USE_CLUDA, GA_USE_SMALL, GA_USE_DOUBLE, GA_USE_COMPLEX, GA_USE_HALF, - GA_USE_BINARY, GA_USE_CUDA, GA_USE_OPENCL + GA_USE_CUDA, GA_USE_OPENCL cdef extern from "gpuarray/kernel.h": ctypedef struct _GpuKernel "GpuKernel": @@ -139,7 +139,6 @@ cdef extern from "gpuarray/kernel.h": int GpuKernel_call(_GpuKernel *k, unsigned int n, const size_t *gs, const size_t *ls, size_t shared, void **args) - int GpuKernel_binary(_GpuKernel *, size_t *, void **) cdef extern from "gpuarray/array.h": ctypedef struct _GpuArray "GpuArray": @@ -279,7 +278,6 @@ cdef int kernel_sched(GpuKernel k, size_t n, size_t *gs, size_t *ls) except -1 cdef int kernel_call(GpuKernel k, unsigned int n, const size_t *gs, const size_t 
*ls, size_t shared, void **args) except -1 -cdef int kernel_binary(GpuKernel k, size_t *, void **) except -1 cdef int kernel_property(GpuKernel k, int prop_id, void *res) except -1 cdef int ctx_property(GpuContext c, int prop_id, void *res) except -1 diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 0339954dc7..2d5e14bf5f 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -478,12 +478,6 @@ cdef int kernel_call(GpuKernel k, unsigned int n, const size_t *gs, if err != GA_NO_ERROR: raise get_exc(err), kernel_error(k, err) -cdef int kernel_binary(GpuKernel k, size_t *sz, void **bin) except -1: - cdef int err - err = GpuKernel_binary(&k.k, sz, bin) - if err != GA_NO_ERROR: - raise get_exc(err), kernel_error(k, err) - cdef int kernel_property(GpuKernel k, int prop_id, void *res) except -1: cdef int err err = gpukernel_property(k.k.k, prop_id, res) @@ -2279,7 +2273,7 @@ cdef class GpuArray: cdef class GpuKernel: """ - GpuKernel(source, name, types, context=None, cluda=True, have_double=False, have_small=False, have_complex=False, have_half=False, binary=False, cuda=False, opencl=False) + GpuKernel(source, name, types, context=None, cluda=True, have_double=False, have_small=False, have_complex=False, have_half=False, cuda=False, opencl=False) Compile a kernel on the device @@ -2332,8 +2326,6 @@ cdef class GpuKernel: ensure complex types will work? have_half: bool ensure half-floats will work? - binary: bool - kernel is pre-compiled binary blob? cuda: bool kernel is cuda code? 
opencl: bool @@ -2379,8 +2371,8 @@ cdef class GpuKernel: def __cinit__(self, source, name, types, GpuContext context=None, cluda=True, have_double=False, have_small=False, - have_complex=False, have_half=False, binary=False, - cuda=False, opencl=False, *a, **kwa): + have_complex=False, have_half=False, cuda=False, + opencl=False, *a, **kwa): cdef const char *s[1] cdef size_t l cdef unsigned int numargs @@ -2403,8 +2395,6 @@ cdef class GpuKernel: flags |= GA_USE_COMPLEX if have_half: flags |= GA_USE_HALF - if binary: - flags |= GA_USE_BINARY if cuda: flags |= GA_USE_CUDA if opencl: @@ -2565,13 +2555,3 @@ cdef class GpuKernel: kernel_property(self, GA_KERNEL_PROP_NUMARGS, &res) return res - property _binary: - "Kernel compiled binary for the associated context." - def __get__(self): - cdef size_t sz - cdef char *bin - kernel_binary(self, &sz, &bin) - try: - return bin[:sz] - finally: - free(bin) diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index 323be30f65..cd4a565629 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -468,26 +468,6 @@ GPUARRAY_PUBLIC int gpukernel_call(gpukernel *k, unsigned int n, const size_t *gs, const size_t *ls, size_t shared, void **args); -/** - * (Deprecated) Get the kernel binary. - * - * This function is deprecated and will be removed in the next release. - * - * This can be use to cache kernel binaries after compilation of a - * specific device. The kernel can be recreated by calling - * gpukernel_alloc with the binary and size and passing `GA_USE_BINARY` - * as the use flags. - * - * The returned pointer is allocated and must be freed by the caller. - * - * \param k kernel - * \param sz size of the returned binary - * \param obj pointer to the binary for the kernel. - * - * \returns GA_NO_ERROR or an error code if an error occurred. - */ -GPUARRAY_PUBLIC int gpukernel_binary(gpukernel *k, size_t *sz, void **obj); - /** * Fetch a property. 
* @@ -750,12 +730,6 @@ typedef enum _ga_usefl { * The kernel makes use of half-floats (also known as float16) */ GA_USE_HALF = 0x10, - /** - * The source code passed is actually a kernel binary. - * - * For the cuda backend this can also be a PTX module. - */ - GA_USE_BINARY = 0x20, /* If you add a new flag, don't forget to update both gpuarray_buffer_{cuda,opencl}.c with the implementation of your flag */ /** diff --git a/src/gpuarray/kernel.h b/src/gpuarray/kernel.h index 2cfc7d12ca..6ed8a476bd 100644 --- a/src/gpuarray/kernel.h +++ b/src/gpuarray/kernel.h @@ -109,9 +109,6 @@ GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, unsigned int n, const size_t *gs, const size_t *ls, size_t shared, void **args); -GPUARRAY_PUBLIC int GpuKernel_binary(const GpuKernel *k, size_t *sz, - void **obj); - GPUARRAY_PUBLIC const char *GpuKernel_error(const GpuKernel *k, int err); #ifdef __cplusplus diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index d237ae97c4..537cd53e6e 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -253,10 +253,6 @@ int gpukernel_call(gpukernel *k, unsigned int n, const size_t *gs, shared, args); } -int gpukernel_binary(gpukernel *k, size_t *sz, void **obj) { - return ((partial_gpukernel *)k)->ctx->ops->kernel_binary(k, sz, obj); -} - int gpukernel_property(gpukernel *k, int prop_id, void *res) { return ((partial_gpukernel *)k)->ctx->ops->property(NULL, NULL, k, prop_id, res); diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index f894e72c3e..a0bc3de1b9 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1348,9 +1348,6 @@ static int cuda_newkernel(gpukernel **k, gpucontext *c, unsigned int count, if (flags & GA_USE_OPENCL) return error_set(ctx->err, GA_DEVSUP_ERROR, "OpenCL kernel not supported on cuda devices"); - if (flags & GA_USE_BINARY) - return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Binary mode not supported any more"); - cuda_enter(ctx); err = cuCtxGetDevice(&dev); @@ 
-1578,16 +1575,6 @@ static int cuda_callkernel(gpukernel *k, unsigned int n, return GA_NO_ERROR; } -static int cuda_kernelbin(gpukernel *k, size_t *sz, void **obj) { - void *res = malloc(k->bin_sz); - if (res == NULL) - return error_sys(k->ctx->err, "malloc"); - memcpy(res, k->bin, k->bin_sz); - *sz = k->bin_sz; - *obj = res; - return GA_NO_ERROR; -} - static int cuda_sync(gpudata *b) { cuda_context *ctx = (cuda_context *)b->ctx; int err = GA_NO_ERROR; @@ -1828,7 +1815,6 @@ const gpuarray_buffer_ops cuda_ops = {cuda_get_platform_count, cuda_freekernel, cuda_kernelsetarg, cuda_callkernel, - cuda_kernelbin, cuda_sync, cuda_transfer, cuda_property, diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index f3fd3cd527..c1d1b52192 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -850,52 +850,37 @@ static int cl_newkernel(gpukernel **k, gpucontext *c, unsigned int count, dev = get_dev(ctx->ctx, ctx->err); if (dev == NULL) return ctx->err->code; - if (flags & GA_USE_BINARY) { - // GA_USE_BINARY is exclusive - if (flags & ~GA_USE_BINARY) - return error_set(ctx->err, GA_INVALID_ERROR, "Cannot combine GA_USE_BINARY with any other flag"); - - // We need the length for binary data and there is only one blob. 
- if (count != 1 || lengths == NULL || lengths[0] == 0) - return error_set(ctx->err, GA_VALUE_ERROR, "GA_USE_BINARY requires the length to be specified"); - - p = clCreateProgramWithBinary(ctx->ctx, 1, &dev, lengths, (const unsigned char **)strings, NULL, &err); - if (err != CL_SUCCESS) - return error_cl(ctx->err, "clCreateProgramWithBinary", err); - } else { - - if (cl_check_extensions(preamble, &n, flags, ctx)) - return ctx->err->code; + if (cl_check_extensions(preamble, &n, flags, ctx)) + return ctx->err->code; - if (n != 0) { - news = calloc(count+n, sizeof(const char *)); - if (news == NULL) + if (n != 0) { + news = calloc(count+n, sizeof(const char *)); + if (news == NULL) + return error_sys(ctx->err, "calloc"); + memcpy(news, preamble, n*sizeof(const char *)); + memcpy(news+n, strings, count*sizeof(const char *)); + if (lengths == NULL) { + newl = NULL; + } else { + newl = calloc(count+n, sizeof(size_t)); + if (newl == NULL) { + free(news); return error_sys(ctx->err, "calloc"); - memcpy(news, preamble, n*sizeof(const char *)); - memcpy(news+n, strings, count*sizeof(const char *)); - if (lengths == NULL) { - newl = NULL; - } else { - newl = calloc(count+n, sizeof(size_t)); - if (newl == NULL) { - free(news); - return error_sys(ctx->err, "calloc"); - } - memcpy(newl+n, lengths, count*sizeof(size_t)); } - } else { - news = strings; - newl = (size_t *)lengths; + memcpy(newl+n, lengths, count*sizeof(size_t)); } + } else { + news = strings; + newl = (size_t *)lengths; + } - p = clCreateProgramWithSource(ctx->ctx, count+n, news, newl, &err); - if (err != CL_SUCCESS) { - if (n != 0) { - free(news); - free(newl); - } - return error_cl(ctx->err, "clCreateProgramWithSource", err); + p = clCreateProgramWithSource(ctx->ctx, count+n, news, newl, &err); + if (err != CL_SUCCESS) { + if (n != 0) { + free(news); + free(newl); } + return error_cl(ctx->err, "clCreateProgramWithSource", err); } err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL); @@ -915,11 +900,7 @@ static int 
cl_newkernel(gpukernel **k, gpucontext *c, unsigned int count, debug_msg.l += (log_size-1); // Back off to before final '\0' } - if (flags & GA_USE_BINARY) { - // Not clear what to do with binary 'source' - the log will have to suffice - } else { - gpukernel_source_with_line_numbers(count+n, news, newl, &debug_msg); - } + gpukernel_source_with_line_numbers(count+n, news, newl, &debug_msg); strb_append0(&debug_msg); // Make sure a final '\0' is present @@ -1105,31 +1086,6 @@ static int cl_callkernel(gpukernel *k, unsigned int n, return GA_NO_ERROR; } -static int cl_kernelbin(gpukernel *k, size_t *sz, void **obj) { - cl_ctx *ctx = k->ctx; - cl_program p; - size_t rsz; - void *res; - cl_int err; - - ASSERT_KER(k); - ASSERT_CTX(ctx); - - CL_CHECK(ctx->err, clGetKernelInfo(k->k, CL_KERNEL_PROGRAM, sizeof(p), &p, NULL)); - CL_CHECK(ctx->err, clGetProgramInfo(p, CL_PROGRAM_BINARY_SIZES, sizeof(rsz), &rsz, NULL)); - res = malloc(rsz); - if (res == NULL) - return error_sys(ctx->err, "malloc"); - err = clGetProgramInfo(p, CL_PROGRAM_BINARIES, sizeof(res), &res, NULL); - if (err != CL_SUCCESS) { - free(res); - return error_cl(ctx->err, "clProgramGetInfo", err); - } - *sz = rsz; - *obj = res; - return GA_NO_ERROR; -} - static int cl_sync(gpudata *b) { cl_ctx *ctx = (cl_ctx *)b->ctx; @@ -1359,7 +1315,6 @@ const gpuarray_buffer_ops opencl_ops = {cl_get_platform_count, cl_releasekernel, cl_setkernelarg, cl_callkernel, - cl_kernelbin, cl_sync, cl_transfer, cl_property, diff --git a/src/gpuarray_kernel.c b/src/gpuarray_kernel.c index 0779afea69..818187cca0 100644 --- a/src/gpuarray_kernel.c +++ b/src/gpuarray_kernel.c @@ -98,10 +98,6 @@ int GpuKernel_call(GpuKernel *k, unsigned int n, return gpukernel_call(k->k, n, gs, ls, shared, args); } -int GpuKernel_binary(const GpuKernel *k, size_t *sz, void **bin) { - return gpukernel_binary(k->k, sz, bin); -} - const char *GpuKernel_error(const GpuKernel *k, int err) { return gpucontext_error(gpukernel_context(k->k), err); } diff --git 
a/src/private.h b/src/private.h index 062af008f9..caf87d8c65 100644 --- a/src/private.h +++ b/src/private.h @@ -112,7 +112,6 @@ struct _gpuarray_buffer_ops { const size_t *gs, const size_t *ls, size_t shared, void **args); - int (*kernel_binary)(gpukernel *k, size_t *sz, void **obj); int (*buffer_sync)(gpudata *b); int (*buffer_transfer)(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, size_t sz); From 1b61fead95ab9c8f277cb0da7d8d0affd8b1a26e Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 8 Aug 2017 12:28:47 -0400 Subject: [PATCH 438/597] Move the cluda layers into separate files that are embedded to facilitate development. Also remove the GA_USE_CLUDA flag in favor of header inclusion. --- pygpu/gpuarray.pxd | 2 +- pygpu/gpuarray.pyx | 13 ++-- pygpu/reduction.py | 4 +- src/cluda_cuda.h | 45 +++++++++++++ src/cluda_cuda.h.c | 119 +++++++++++++++++++++++++++++++++++ src/cluda_opencl.h | 40 ++++++++++++ src/cluda_opencl.h.c | 107 +++++++++++++++++++++++++++++++ src/gpuarray/buffer.h | 5 +- src/gpuarray_array.c | 5 +- src/gpuarray_buffer_cuda.c | 63 ++----------------- src/gpuarray_buffer_opencl.c | 91 +++++++++------------------ src/gpuarray_elemwise.c | 6 +- src/gpuarray_reduction.c | 3 +- src/head.py | 35 +++++++++++ src/loaders/libopencl.fn | 1 + src/private_opencl.h | 2 +- 16 files changed, 402 insertions(+), 139 deletions(-) create mode 100644 src/cluda_cuda.h create mode 100644 src/cluda_cuda.h.c create mode 100644 src/cluda_opencl.h create mode 100644 src/cluda_opencl.h.c create mode 100644 src/head.py diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd index 0e855643cd..5180f56d94 100644 --- a/pygpu/gpuarray.pxd +++ b/pygpu/gpuarray.pxd @@ -122,7 +122,7 @@ cdef extern from "gpuarray/buffer.h": int GA_KERNEL_PROP_TYPES cdef enum ga_usefl: - GA_USE_CLUDA, GA_USE_SMALL, GA_USE_DOUBLE, GA_USE_COMPLEX, GA_USE_HALF, + GA_USE_SMALL, GA_USE_DOUBLE, GA_USE_COMPLEX, GA_USE_HALF, GA_USE_CUDA, GA_USE_OPENCL cdef extern from "gpuarray/kernel.h": diff 
--git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 2d5e14bf5f..d505e70d5a 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -2273,7 +2273,7 @@ cdef class GpuArray: cdef class GpuKernel: """ - GpuKernel(source, name, types, context=None, cluda=True, have_double=False, have_small=False, have_complex=False, have_half=False, cuda=False, opencl=False) + GpuKernel(source, name, types, context=None, have_double=False, have_small=False, have_complex=False, have_half=False, cuda=False, opencl=False) Compile a kernel on the device @@ -2316,8 +2316,6 @@ cdef class GpuKernel: list of argument types context: GpuContext device on which the kernel is compiled - cluda: bool - use cluda layer? have_double: bool ensure working doubles? have_small: bool @@ -2333,7 +2331,7 @@ cdef class GpuKernel: Notes ----- - With the cuda backend, unless you use `cluda=True`, you must + With the cuda backend, unless you use the cluda include, you must either pass the mangled name of your kernel or declare the function 'extern "C"', because cuda uses a C++ compiler unconditionally. 
@@ -2370,9 +2368,8 @@ cdef class GpuKernel: raise RuntimeError, "Cannot pickle GpuKernel object" def __cinit__(self, source, name, types, GpuContext context=None, - cluda=True, have_double=False, have_small=False, - have_complex=False, have_half=False, cuda=False, - opencl=False, *a, **kwa): + have_double=False, have_small=False, have_complex=False, + have_half=False, cuda=False, opencl=False, *a, **kwa): cdef const char *s[1] cdef size_t l cdef unsigned int numargs @@ -2385,8 +2382,6 @@ cdef class GpuKernel: self.context = ensure_context(context) - if cluda: - flags |= GA_USE_CLUDA if have_double: flags |= GA_USE_DOUBLE if have_small: diff --git a/pygpu/reduction.py b/pygpu/reduction.py index 2c16508ab6..df9482a072 100644 --- a/pygpu/reduction.py +++ b/pygpu/reduction.py @@ -31,6 +31,8 @@ def _ceil_log2(x): basic_kernel = Template(""" +#include + ${preamble} #define REDUCE(a, b) (${reduce_expr}) @@ -230,7 +232,7 @@ def _gen_basic(self, ls, nd): spec.append('uint32') spec.extend('int32' for _ in range(nd)) k = gpuarray.GpuKernel(src, "reduk", spec, context=self.context, - cluda=True, **self.flags) + **self.flags) return k, src, spec @lru_cache() diff --git a/src/cluda_cuda.h b/src/cluda_cuda.h new file mode 100644 index 0000000000..28bcdff13d --- /dev/null +++ b/src/cluda_cuda.h @@ -0,0 +1,45 @@ +#define local_barrier() __syncthreads() +#define WITHIN_KERNEL extern \"C\" __device__ +#define KERNEL extern \"C\" __global__ +#define GLOBAL_MEM /* empty */ +#define LOCAL_MEM __shared__ +#define LOCAL_MEM_ARG /* empty */ +#ifdef NAN +#undef NAN +#endif +#define NAN __int_as_float(0x7fffffff) +#ifdef INFINITY +#undef INFINITY +#endif +#define INFINITY __int_as_float(0x7f800000) +#define LID_0 threadIdx.x +#define LID_1 threadIdx.y +#define LID_2 threadIdx.z +#define LDIM_0 blockDim.x +#define LDIM_1 blockDim.y +#define LDIM_2 blockDim.z +#define GID_0 blockIdx.x +#define GID_1 blockIdx.y +#define GID_2 blockIdx.z +#define GDIM_0 gridDim.x +#define GDIM_1 gridDim.y 
+#define GDIM_2 gridDim.z +#define ga_bool unsigned char +#define ga_byte signed char +#define ga_ubyte unsigned char +#define ga_short short +#define ga_ushort unsigned short +#define ga_int int +#define ga_uint unsigned int +#define ga_long long long +#define ga_ulong unsigned long long +#define ga_float float +#define ga_double double +#define ga_half ga_ushort +#define ga_size size_t +#define ga_ssize ptrdiff_t +#define load_half(p) __half2float(*(p)) +#define store_half(p, v) (*(p) = __float2half_rn(v)) +#define GA_DECL_SHARED_PARAM(type, name) +#define GA_DECL_SHARED_BODY(type, name) extern __shared__ type name[]; +#define GA_WARP_SIZE warpSize diff --git a/src/cluda_cuda.h.c b/src/cluda_cuda.h.c new file mode 100644 index 0000000000..6eb55734aa --- /dev/null +++ b/src/cluda_cuda.h.c @@ -0,0 +1,119 @@ +static const char cluda_cuda_h[] = { +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x63, +0x61, 0x6c, 0x5f, 0x62, 0x61, 0x72, 0x72, 0x69, 0x65, 0x72, 0x28, +0x29, 0x20, 0x5f, 0x5f, 0x73, 0x79, 0x6e, 0x63, 0x74, 0x68, 0x72, +0x65, 0x61, 0x64, 0x73, 0x28, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x57, 0x49, 0x54, 0x48, 0x49, 0x4e, 0x5f, +0x4b, 0x45, 0x52, 0x4e, 0x45, 0x4c, 0x20, 0x65, 0x78, 0x74, 0x65, +0x72, 0x6e, 0x20, 0x5c, 0x22, 0x43, 0x5c, 0x22, 0x20, 0x5f, 0x5f, +0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4b, 0x45, 0x52, 0x4e, 0x45, +0x4c, 0x20, 0x65, 0x78, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x5c, 0x22, +0x43, 0x5c, 0x22, 0x20, 0x5f, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, +0x6c, 0x5f, 0x5f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x47, 0x4c, 0x4f, 0x42, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, +0x20, 0x2f, 0x2a, 0x20, 0x65, 0x6d, 0x70, 0x74, 0x79, 0x20, 0x2a, +0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, +0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x20, 0x5f, 0x5f, +0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x5f, 0x5f, 0x0a, 0x23, 0x64, 
+0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x4f, 0x43, 0x41, 0x4c, +0x5f, 0x4d, 0x45, 0x4d, 0x5f, 0x41, 0x52, 0x47, 0x20, 0x2f, 0x2a, +0x20, 0x65, 0x6d, 0x70, 0x74, 0x79, 0x20, 0x2a, 0x2f, 0x0a, 0x23, +0x69, 0x66, 0x64, 0x65, 0x66, 0x20, 0x4e, 0x41, 0x4e, 0x0a, 0x23, +0x75, 0x6e, 0x64, 0x65, 0x66, 0x20, 0x4e, 0x41, 0x4e, 0x0a, 0x23, +0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x4e, 0x41, 0x4e, 0x20, 0x5f, 0x5f, 0x69, 0x6e, +0x74, 0x5f, 0x61, 0x73, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, +0x30, 0x78, 0x37, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x29, +0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, 0x66, 0x20, 0x49, 0x4e, 0x46, +0x49, 0x4e, 0x49, 0x54, 0x59, 0x0a, 0x23, 0x75, 0x6e, 0x64, 0x65, +0x66, 0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x59, 0x0a, +0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, +0x59, 0x20, 0x5f, 0x5f, 0x69, 0x6e, 0x74, 0x5f, 0x61, 0x73, 0x5f, +0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x30, 0x78, 0x37, 0x66, 0x38, +0x30, 0x30, 0x30, 0x30, 0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, 0x30, 0x20, 0x74, +0x68, 0x72, 0x65, 0x61, 0x64, 0x49, 0x64, 0x78, 0x2e, 0x78, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, +0x5f, 0x31, 0x20, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x49, 0x64, +0x78, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x4c, 0x49, 0x44, 0x5f, 0x32, 0x20, 0x74, 0x68, 0x72, 0x65, +0x61, 0x64, 0x49, 0x64, 0x78, 0x2e, 0x7a, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x30, +0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x44, 0x69, 0x6d, 0x2e, 0x78, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, +0x49, 0x4d, 0x5f, 0x31, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x44, +0x69, 0x6d, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 
0x62, 0x6c, +0x6f, 0x63, 0x6b, 0x44, 0x69, 0x6d, 0x2e, 0x7a, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x30, +0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x49, 0x64, 0x78, 0x2e, 0x78, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, +0x44, 0x5f, 0x31, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x49, 0x64, +0x78, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x47, 0x49, 0x44, 0x5f, 0x32, 0x20, 0x62, 0x6c, 0x6f, 0x63, +0x6b, 0x49, 0x64, 0x78, 0x2e, 0x7a, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x30, 0x20, +0x67, 0x72, 0x69, 0x64, 0x44, 0x69, 0x6d, 0x2e, 0x78, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, +0x5f, 0x31, 0x20, 0x67, 0x72, 0x69, 0x64, 0x44, 0x69, 0x6d, 0x2e, +0x79, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, +0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 0x67, 0x72, 0x69, 0x64, 0x44, +0x69, 0x6d, 0x2e, 0x7a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x6f, 0x6f, 0x6c, 0x20, 0x75, +0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x63, 0x68, 0x61, +0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, +0x61, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x20, 0x73, 0x69, 0x67, 0x6e, +0x65, 0x64, 0x20, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x62, 0x79, +0x74, 0x65, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, +0x20, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x68, 0x6f, 0x72, 0x74, +0x20, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, +0x72, 0x74, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, +0x20, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, +0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 
0x66, 0x69, 0x6e, 0x65, +0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x75, 0x6e, +0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, +0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x75, +0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, +0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, +0x62, 0x6c, 0x65, 0x20, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, +0x6f, 0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x73, 0x69, +0x7a, 0x65, 0x5f, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x73, 0x69, 0x7a, 0x65, 0x20, +0x70, 0x74, 0x72, 0x64, 0x69, 0x66, 0x66, 0x5f, 0x74, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x61, 0x64, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x70, 0x29, 0x20, 0x5f, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, +0x2a, 0x28, 0x70, 0x29, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x28, 0x70, 0x2c, 0x20, 0x76, 0x29, 0x20, 0x28, 0x2a, +0x28, 0x70, 0x29, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x66, 0x6c, 0x6f, +0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, 0x6e, 0x28, +0x76, 0x29, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, +0x41, 0x52, 0x45, 0x44, 0x5f, 
0x50, 0x41, 0x52, 0x41, 0x4d, 0x28, +0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, +0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, +0x44, 0x5f, 0x42, 0x4f, 0x44, 0x59, 0x28, 0x74, 0x79, 0x70, 0x65, +0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x65, 0x78, 0x74, +0x65, 0x72, 0x6e, 0x20, 0x5f, 0x5f, 0x73, 0x68, 0x61, 0x72, 0x65, +0x64, 0x5f, 0x5f, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, +0x6d, 0x65, 0x5b, 0x5d, 0x3b, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, +0x53, 0x49, 0x5a, 0x45, 0x20, 0x77, 0x61, 0x72, 0x70, 0x53, 0x69, +0x7a, 0x65, 0x0a, 0x00}; diff --git a/src/cluda_opencl.h b/src/cluda_opencl.h new file mode 100644 index 0000000000..684872cce8 --- /dev/null +++ b/src/cluda_opencl.h @@ -0,0 +1,40 @@ +#define local_barrier() barrier(CLK_LOCAL_MEM_FENCE) +#define WITHIN_KERNEL /* empty */ +#define KERNEL __kernel +#define GLOBAL_MEM __global +#define LOCAL_MEM __local +#define LOCAL_MEM_ARG __local +#ifndef NULL + #define NULL ((void*)0) +#endif +#define LID_0 get_local_id(0) +#define LID_1 get_local_id(1) +#define LID_2 get_local_id(2) +#define LDIM_0 get_local_size(0) +#define LDIM_1 get_local_size(1) +#define LDIM_2 get_local_size(2) +#define GID_0 get_group_id(0) +#define GID_1 get_group_id(1) +#define GID_2 get_group_id(2) +#define GDIM_0 get_num_groups(0) +#define GDIM_1 get_num_groups(1) +#define GDIM_2 get_num_groups(2) +#define ga_bool uchar +#define ga_byte char +#define ga_ubyte uchar +#define ga_short short +#define ga_ushort ushort +#define ga_int int +#define ga_uint uint +#define ga_long long +#define ga_ulong ulong +#define ga_float float +#define ga_double double +#define ga_half half +#define ga_size ulong +#define ga_ssize long +#define load_half(p) vload_half(0, p) +#define store_half(p, v) vstore_half_rtn(v, 0, p) +#define GA_DECL_SHARED_PARAM(type, name) , __local 
type *name +#define GA_DECL_SHARED_BODY(type, name) +#define GA_WARP_SIZE __GA_WARP_SIZE diff --git a/src/cluda_opencl.h.c b/src/cluda_opencl.h.c new file mode 100644 index 0000000000..a7aa914300 --- /dev/null +++ b/src/cluda_opencl.h.c @@ -0,0 +1,107 @@ +static const char cluda_opencl_h[] = { +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x63, +0x61, 0x6c, 0x5f, 0x62, 0x61, 0x72, 0x72, 0x69, 0x65, 0x72, 0x28, +0x29, 0x20, 0x62, 0x61, 0x72, 0x72, 0x69, 0x65, 0x72, 0x28, 0x43, +0x4c, 0x4b, 0x5f, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, 0x45, +0x4d, 0x5f, 0x46, 0x45, 0x4e, 0x43, 0x45, 0x29, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x57, 0x49, 0x54, 0x48, 0x49, +0x4e, 0x5f, 0x4b, 0x45, 0x52, 0x4e, 0x45, 0x4c, 0x20, 0x2f, 0x2a, +0x20, 0x65, 0x6d, 0x70, 0x74, 0x79, 0x20, 0x2a, 0x2f, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4b, 0x45, 0x52, 0x4e, +0x45, 0x4c, 0x20, 0x5f, 0x5f, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x4c, +0x4f, 0x42, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x20, 0x5f, 0x5f, +0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, +0x45, 0x4d, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x4f, 0x43, +0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x5f, 0x41, 0x52, 0x47, 0x20, +0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x0a, 0x23, 0x69, 0x66, +0x6e, 0x64, 0x65, 0x66, 0x20, 0x4e, 0x55, 0x4c, 0x4c, 0x0a, 0x20, +0x20, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4e, 0x55, +0x4c, 0x4c, 0x20, 0x28, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x2a, 0x29, +0x30, 0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, +0x30, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, +0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 
0x4c, 0x49, 0x44, 0x5f, 0x31, 0x20, 0x67, +0x65, 0x74, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x69, 0x64, +0x28, 0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x4c, 0x49, 0x44, 0x5f, 0x32, 0x20, 0x67, 0x65, 0x74, 0x5f, +0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x32, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, +0x49, 0x4d, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, +0x63, 0x61, 0x6c, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x28, 0x30, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, +0x49, 0x4d, 0x5f, 0x31, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, +0x63, 0x61, 0x6c, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x28, 0x31, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, +0x49, 0x4d, 0x5f, 0x32, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, +0x63, 0x61, 0x6c, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x28, 0x32, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, +0x44, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x72, 0x6f, +0x75, 0x70, 0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x31, +0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x5f, +0x69, 0x64, 0x28, 0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x32, 0x20, 0x67, 0x65, +0x74, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x5f, 0x69, 0x64, 0x28, +0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x47, 0x44, 0x49, 0x4d, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, 0x5f, +0x6e, 0x75, 0x6d, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x28, +0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x47, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x67, 0x65, 0x74, 0x5f, +0x6e, 0x75, 0x6d, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x28, +0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x47, 0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 0x67, 0x65, 0x74, 0x5f, +0x6e, 0x75, 
0x6d, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x28, +0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x67, 0x61, 0x5f, 0x62, 0x6f, 0x6f, 0x6c, 0x20, 0x75, 0x63, 0x68, +0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x67, 0x61, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x20, 0x63, 0x68, 0x61, +0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, +0x61, 0x5f, 0x75, 0x62, 0x79, 0x74, 0x65, 0x20, 0x75, 0x63, 0x68, +0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x67, 0x61, 0x5f, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x73, 0x68, +0x6f, 0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, +0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, +0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x75, 0x69, +0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, +0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, +0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x75, 0x6c, 0x6f, +0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6c, +0x6f, 0x61, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, +0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x68, 0x61, 0x6c, 0x66, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x20, +0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x73, 0x69, 0x7a, 0x65, +0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 
+0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x61, 0x64, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x28, 0x70, 0x29, 0x20, 0x76, 0x6c, 0x6f, 0x61, 0x64, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x28, 0x30, 0x2c, 0x20, 0x70, 0x29, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x73, 0x74, 0x6f, +0x72, 0x65, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x70, 0x2c, 0x20, +0x76, 0x29, 0x20, 0x76, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x5f, 0x72, 0x74, 0x6e, 0x28, 0x76, 0x2c, 0x20, +0x30, 0x2c, 0x20, 0x70, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, +0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x50, 0x41, 0x52, 0x41, +0x4d, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, +0x65, 0x29, 0x20, 0x2c, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, +0x6c, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x6e, 0x61, 0x6d, +0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, +0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, +0x45, 0x44, 0x5f, 0x42, 0x4f, 0x44, 0x59, 0x28, 0x74, 0x79, 0x70, +0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x57, 0x41, +0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x20, 0x5f, 0x5f, 0x47, +0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, +0x0a, 0x00}; diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index cd4a565629..5a242d83b3 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -710,10 +710,7 @@ GPUARRAY_PUBLIC gpucontext *gpukernel_context(gpukernel *k); * cases result in silent data corruption (especially on ATI cards). */ typedef enum _ga_usefl { - /** - * The kernel source uses CLUDA unified language. - */ - GA_USE_CLUDA = 0x01, + /* UNUSED: 0x01 */ /** * The kernel makes use of small (size is smaller than 4 bytes) types. 
*/ diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index 6cf7daee58..7ecea86bf6 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -366,7 +366,7 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str, char *sz, *ssz; unsigned int i, i2; unsigned int nargs, apos; - int flags = GA_USE_CLUDA; + int flags = 0; int res; nargs = 9 + 2 * v->nd; @@ -384,7 +384,8 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str, } apos = 0; - strb_appendf(&sb, "KERNEL void take1(GLOBAL_MEM %s *r, ga_size r_off, " + strb_appendf(&sb, "#include \n" + "KERNEL void take1(GLOBAL_MEM %s *r, ga_size r_off, " "GLOBAL_MEM const %s *v, ga_size v_off,", gpuarray_get_type(a->typecode)->cluda_name, gpuarray_get_type(v->typecode)->cluda_name); diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index a0bc3de1b9..6d3dd24c67 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -22,6 +22,8 @@ #include "gpuarray/extension.h" +#include "cluda_cuda.h.c" + STATIC_ASSERT(DONTFREE == GPUARRAY_CUDA_CTX_NOFREE, cuda_nofree_eq); STATIC_ASSERT(CUDA_WAIT_READ == GPUARRAY_CUDA_WAIT_READ, cuda_wait_read_eq); STATIC_ASSERT(CUDA_WAIT_WRITE == GPUARRAY_CUDA_WAIT_WRITE, cuda_wait_write_eq); @@ -470,57 +472,6 @@ size_t cuda_get_sz(gpudata *g) { ASSERT_BUF(g); return g->sz; } return v; \ } -static const char CUDA_PREAMBLE[] = - "#define local_barrier() __syncthreads()\n" - "#define WITHIN_KERNEL extern \"C\" __device__\n" - "#define KERNEL extern \"C\" __global__\n" - "#define GLOBAL_MEM /* empty */\n" - "#define LOCAL_MEM __shared__\n" - "#define LOCAL_MEM_ARG /* empty */\n" - "#ifdef NAN\n" - "#undef NAN\n" - "#endif\n" - "#define NAN __int_as_float(0x7fffffff)\n" - "#ifdef INFINITY\n" - "#undef INFINITY\n" - "#endif\n" - "#define INFINITY __int_as_float(0x7f800000)\n" - "#define LID_0 threadIdx.x\n" - "#define LID_1 threadIdx.y\n" - "#define LID_2 threadIdx.z\n" - "#define LDIM_0 blockDim.x\n" - "#define LDIM_1 
blockDim.y\n" - "#define LDIM_2 blockDim.z\n" - "#define GID_0 blockIdx.x\n" - "#define GID_1 blockIdx.y\n" - "#define GID_2 blockIdx.z\n" - "#define GDIM_0 gridDim.x\n" - "#define GDIM_1 gridDim.y\n" - "#define GDIM_2 gridDim.z\n" - "#define ga_bool unsigned char\n" - "#define ga_byte signed char\n" - "#define ga_ubyte unsigned char\n" - "#define ga_short short\n" - "#define ga_ushort unsigned short\n" - "#define ga_int int\n" - "#define ga_uint unsigned int\n" - "#define ga_long long long\n" - "#define ga_ulong unsigned long long\n" - "#define ga_float float\n" - "#define ga_double double\n" - "#define ga_half ga_ushort\n" - "#define ga_size size_t\n" - "#define ga_ssize ptrdiff_t\n" - "#define load_half(p) __half2float(*(p))\n" - "#define store_half(p, v) (*(p) = __float2half_rn(v))\n" - "#define GA_DECL_SHARED_PARAM(type, name)\n" - "#define GA_DECL_SHARED_BODY(type, name) extern __shared__ type name[];\n" - "#define GA_WARP_SIZE warpSize\n" - "#line 1\n"; - -/* XXX: add complex, quads, longlong */ -/* XXX: add vector types */ - static cuda_context *do_init(CUdevice dev, gpucontext_props *p, error *e) { cuda_context *res; CUcontext ctx; @@ -1128,6 +1079,8 @@ static inline int error_nvrtc(error *e, const char *msg, nvrtcResult err) { static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { nvrtcProgram prog; size_t buflen; + const char *heads[1] = {"cluda.h"}; + const char *hsrc[1]; const char *opts[4] = { "-arch", "" , "-G", "-lineinfo" @@ -1136,7 +1089,8 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { opts[1] = ctx->bin_id; - err = nvrtcCreateProgram(&prog, src->s, NULL, 0, NULL, NULL); + hsrc[0] = cluda_cuda_h; + err = nvrtcCreateProgram(&prog, src->s, NULL, 1, hsrc, heads); if (err != NVRTC_SUCCESS) return error_nvrtc(ctx->err, "nvrtcCreateProgram", err); @@ -1359,7 +1313,6 @@ static int cuda_newkernel(gpukernel **k, gpucontext *c, unsigned int count, if (get_cc(dev, &major, &minor, ctx->err) != 
GA_NO_ERROR) return ctx->err->code; - // GA_USE_CLUDA is done later // GA_USE_SMALL will always work // GA_USE_HALF should always work if (flags & GA_USE_DOUBLE) { @@ -1374,10 +1327,6 @@ static int cuda_newkernel(gpukernel **k, gpucontext *c, unsigned int count, return error_set(ctx->err, GA_UNSUPPORTED_ERROR, "Complex support is not there yet."); } - if (flags & GA_USE_CLUDA) { - strb_appends(&src, CUDA_PREAMBLE); - } - if (lengths == NULL) { for (i = 0; i < count; i++) strb_appends(&src, strings[i]); diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index c1d1b52192..3dc2d3c8ee 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -15,6 +15,8 @@ #include "loaders/libclblas.h" #include "loaders/libclblast.h" +#include "cluda_opencl.h.c" + #define _unused(x) ((void)x) #define SSIZE_MIN (-(SSIZE_MAX-1)) @@ -32,7 +34,7 @@ static int cl_newkernel(gpukernel **k, gpucontext *ctx, unsigned int count, const char *fname, unsigned int argcount, const int *types, int flags, char **err_str); static const char CL_CONTEXT_PREAMBLE[] = -"#define GA_WARP_SIZE %lu\n"; // to be filled by cl_make_ctx() +"-D __GA_WARP_SIZE=%lu"; // to be filled by cl_make_ctx() static int setup_done = 0; static int setup_lib(error *e) { @@ -156,7 +158,7 @@ cl_ctx *cl_make_ctx(cl_context ctx, gpucontext_props *p) { res->refcnt = 1; res->exts = NULL; res->blas_handle = NULL; - res->preamble = NULL; + res->options = NULL; res->q = clCreateCommandQueue( ctx, id, ISSET(p->flags, GA_CTX_SINGLE_STREAM) ? 
0 : qprop&CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, @@ -194,8 +196,8 @@ cl_ctx *cl_make_ctx(cl_context ctx, gpucontext_props *p) { // Write the preferred workgroup multiple as GA_WARP_SIZE in preamble strb_appendf(&context_preamble, CL_CONTEXT_PREAMBLE, (unsigned long)warp_size); - res->preamble = strb_cstr(&context_preamble); - if (res->preamble == NULL) + res->options = strb_cstr(&context_preamble); + if (res->options == NULL) goto fail; res->blas_handle = NULL; @@ -234,8 +236,8 @@ static void cl_free_ctx(cl_ctx *ctx) { } clReleaseCommandQueue(ctx->q); clReleaseContext(ctx->ctx); - if (ctx->preamble != NULL) - free(ctx->preamble); + if (ctx->options != NULL) + free(ctx->options); error_free(ctx->err); CLEAR(ctx); free(ctx); @@ -291,52 +293,7 @@ static int cl_callkernel(gpukernel *k, unsigned int n, const size_t *gs, const size_t *ls, size_t shared, void **args); -static const char CL_PREAMBLE[] = - "#define local_barrier() barrier(CLK_LOCAL_MEM_FENCE)\n" - "#define WITHIN_KERNEL /* empty */\n" - "#define KERNEL __kernel\n" - "#define GLOBAL_MEM __global\n" - "#define LOCAL_MEM __local\n" - "#define LOCAL_MEM_ARG __local\n" - "#ifndef NULL\n" - " #define NULL ((void*)0)\n" - "#endif\n" - "#define LID_0 get_local_id(0)\n" - "#define LID_1 get_local_id(1)\n" - "#define LID_2 get_local_id(2)\n" - "#define LDIM_0 get_local_size(0)\n" - "#define LDIM_1 get_local_size(1)\n" - "#define LDIM_2 get_local_size(2)\n" - "#define GID_0 get_group_id(0)\n" - "#define GID_1 get_group_id(1)\n" - "#define GID_2 get_group_id(2)\n" - "#define GDIM_0 get_num_groups(0)\n" - "#define GDIM_1 get_num_groups(1)\n" - "#define GDIM_2 get_num_groups(2)\n" - "#define ga_bool uchar\n" - "#define ga_byte char\n" - "#define ga_ubyte uchar\n" - "#define ga_short short\n" - "#define ga_ushort ushort\n" - "#define ga_int int\n" - "#define ga_uint uint\n" - "#define ga_long long\n" - "#define ga_ulong ulong\n" - "#define ga_float float\n" - "#define ga_double double\n" - "#define ga_half half\n" - 
"#define ga_size ulong\n" - "#define ga_ssize long\n" - "#define load_half(p) vload_half(0, p)\n" - "#define store_half(p, v) vstore_half_rtn(v, 0, p)\n" - "#define GA_DECL_SHARED_PARAM(type, name) , __local type *name\n" - "#define GA_DECL_SHARED_BODY(type, name)\n"; - -/* XXX: add complex types, quad types, and longlong */ -/* XXX: add vector types */ - const char *cl_error_string(cl_int err) { - /* OpenCL 1.0 error codes */ switch (err) { case CL_SUCCESS: return "Success!"; case CL_DEVICE_NOT_FOUND: return "Device not found."; @@ -797,14 +754,6 @@ static int cl_memset(gpudata *dst, size_t offset, int data) { static int cl_check_extensions(const char **preamble, unsigned int *count, int flags, cl_ctx *ctx) { - if (flags & GA_USE_CLUDA) { - // add the common preamble - preamble[*count] = CL_PREAMBLE; - (*count)++; - // add the per-context preamble - preamble[*count] = ctx->preamble; - (*count)++; - } if (flags & GA_USE_SMALL) { GA_CHECK(check_ext(ctx, CL_SMALL)); preamble[*count] = PRAGMA CL_SMALL ENABLE; @@ -832,9 +781,12 @@ static int cl_newkernel(gpukernel **k, gpucontext *c, unsigned int count, gpukernel *res; cl_device_id dev; cl_program p; + cl_program cluda; // Sync this table size with the number of flags that can add stuff // at the beginning const char *preamble[5]; + const char *cluda_src[1]; + const char *headers[1] = {"cluda.h"}; size_t *newl = NULL; const char **news = NULL; cl_int err; @@ -874,18 +826,35 @@ static int cl_newkernel(gpukernel **k, gpucontext *c, unsigned int count, newl = (size_t *)lengths; } + cluda_src[0] = cluda_opencl_h; + cluda = clCreateProgramWithSource(ctx->ctx, 1, cluda_src, NULL, &err); + if (err != CL_SUCCESS) { + if (n != 0) { + free(news); + free(newl); + } + return error_cl(ctx->err, "clCreateProgramWithSource (header)", err); + } + p = clCreateProgramWithSource(ctx->ctx, count+n, news, newl, &err); if (err != CL_SUCCESS) { if (n != 0) { free(news); free(newl); + clReleaseProgram(cluda); } - return error_cl(ctx->err, 
"clCreateProgramWithSource", err); + return error_cl(ctx->err, "clCreateProgramWithSource (kernel)", err); } + err = clCompileProgram(p, 0, NULL, ctx->options, 1, &cluda, headers, NULL, NULL); + if (err != CL_SUCCESS) + goto compile_error; + err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL); + compile_error: if (err != CL_SUCCESS) { - if (err == CL_BUILD_PROGRAM_FAILURE && err_str != NULL) { + if ((err == CL_COMPILE_PROGRAM_FAILURE || err == CL_BUILD_PROGRAM_FAILURE) + && err_str != NULL) { *err_str = NULL; // Fallback, in case there's an error // We're substituting debug_msg for a string with this first line: diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index 5411949b20..c083a6420f 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -133,7 +133,7 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx, int *ktypes; char *size = "ga_size", *ssize = "ga_ssize"; unsigned int p; - int flags = GA_USE_CLUDA; + int flags = 0; int res; if (ISSET(gen_flags, GEN_ADDR32)) { @@ -154,6 +154,7 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx, p = 0; + strb_appends(&sb, "#include \n"); if (preamble) strb_appends(&sb, preamble); strb_appends(&sb, "\nKERNEL void elem(const ga_size n, "); @@ -451,7 +452,7 @@ static int gen_elemwise_contig_kernel(GpuKernel *k, int *ktypes = NULL; unsigned int p; unsigned int j; - int flags = GA_USE_CLUDA; + int flags = 0; int res; flags |= gpuarray_type_flagsa(n, args); @@ -468,6 +469,7 @@ static int gen_elemwise_contig_kernel(GpuKernel *k, p = 0; + strb_appends(&sb, "#include \n"); if (preamble) strb_appends(&sb, preamble); strb_appends(&sb, "\nKERNEL void elem(const ga_size n, "); diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index af8c78ff0e..6cccaaba97 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -322,6 +322,7 @@ static int maxandargmaxGenSource (maxandargmax_ctx* ctx){ return ctx->ret=GA_NO_ERROR; } static void 
maxandargmaxAppendKernel (maxandargmax_ctx* ctx){ + strb_appends (&ctx->s, "#include \n"); maxandargmaxAppendTypedefs (ctx); maxandargmaxAppendPrototype (ctx); strb_appends (&ctx->s, "{\n"); @@ -657,7 +658,7 @@ static int maxandargmaxCompile (maxandargmax_ctx* ctx){ "maxandargmax", ARG_TYPECODES_LEN, ARG_TYPECODES, - GA_USE_CLUDA, + 0, (char**)0); free(ctx->sourceCode); ctx->sourceCode = NULL; diff --git a/src/head.py b/src/head.py new file mode 100644 index 0000000000..2202ed542b --- /dev/null +++ b/src/head.py @@ -0,0 +1,35 @@ +def wrt(f, n, b): + f.write(b',') + n += 1 + if n > 10: + f.write(b'\n') + n = 0 + else: + f.write(b' ') + f.write(b"0x%02x" % (b,)) + return n + + +def convert(src, dst): + src_name = src.replace('.', '_') + with open(src, 'rb') as f: + src_data = f.read() + with open(dst, 'wb') as f: + f.write(b'static const char %s[] = {\n' % (src_name.encode('utf-8'),)) + first = True + n = 0 + for b in src_data: + if b == 0: + raise ValueError('NUL in file') + if first: + f.write(b"0x%02x" % (b,)) + first = False + else: + n = wrt(f, n, b) + wrt(f, n, 0) + f.write(b'};\n') + + +if __name__ == '__main__': + import sys + convert(sys.argv[1], sys.argv[1] + '.c') diff --git a/src/loaders/libopencl.fn b/src/loaders/libopencl.fn index 03293ac502..26040501df 100644 --- a/src/loaders/libopencl.fn +++ b/src/loaders/libopencl.fn @@ -1,5 +1,6 @@ DEF_PROC(cl_context, clCreateContext, (const cl_context_properties *, cl_uint, const cl_device_id *, void (CL_CALLBACK *)(const char *, const void *, size_t, void *), void *, cl_int *)); DEF_PROC(cl_int, clBuildProgram, (cl_program, cl_uint, const cl_device_id *, const char *, void (CL_CALLBACK *)(cl_program, void *), void *)); +DEF_PROC(cl_int, clCompileProgram, (cl_program, cl_uint, const cl_device_id *, const char *, cl_uint, cl_program *, const char **, void (CL_CALLBACK *)(cl_program, void *), void *)); DEF_PROC(cl_mem, clCreateBuffer, (cl_context, cl_mem_flags, size_t, void *, cl_int *)); DEF_PROC(cl_command_queue, 
clCreateCommandQueue, (cl_context, cl_device_id, cl_command_queue_properties, cl_int *)); DEF_PROC(cl_kernel, clCreateKernel, (cl_program, const char *, cl_int *)); diff --git a/src/private_opencl.h b/src/private_opencl.h index cc3fac7566..53888dc001 100644 --- a/src/private_opencl.h +++ b/src/private_opencl.h @@ -71,7 +71,7 @@ typedef struct _cl_ctx { cl_context ctx; cl_command_queue q; char *exts; - char *preamble; + char *options; } cl_ctx; /** @cond NEVER */ From 9a3eb8ad388e54db1e85d42d00f979701bca1224 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 8 Aug 2017 15:14:09 -0400 Subject: [PATCH 439/597] Change half to a struct. --- src/cluda_cuda.h | 9 ++++--- src/cluda_cuda.h.c | 56 +++++++++++++++++++++++--------------------- src/cluda_opencl.h | 9 ++++--- src/cluda_opencl.h.c | 56 +++++++++++++++++++++++--------------------- 4 files changed, 70 insertions(+), 60 deletions(-) diff --git a/src/cluda_cuda.h b/src/cluda_cuda.h index 28bcdff13d..f2180e1970 100644 --- a/src/cluda_cuda.h +++ b/src/cluda_cuda.h @@ -35,11 +35,14 @@ #define ga_ulong unsigned long long #define ga_float float #define ga_double double -#define ga_half ga_ushort #define ga_size size_t #define ga_ssize ptrdiff_t -#define load_half(p) __half2float(*(p)) -#define store_half(p, v) (*(p) = __float2half_rn(v)) +#define load_half(p) __half2float((p)->data) +#define store_half(p, v) ((p)->data = __float2half_rn(v)) #define GA_DECL_SHARED_PARAM(type, name) #define GA_DECL_SHARED_BODY(type, name) extern __shared__ type name[]; #define GA_WARP_SIZE warpSize + +struct ga_half { + ga_ushort data; +}; diff --git a/src/cluda_cuda.h.c b/src/cluda_cuda.h.c index 6eb55734aa..da03a43742 100644 --- a/src/cluda_cuda.h.c +++ b/src/cluda_cuda.h.c @@ -89,31 +89,33 @@ static const char cluda_cuda_h[] = { 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, -0x68, 
0x61, 0x6c, 0x66, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, -0x6f, 0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x73, 0x69, -0x7a, 0x65, 0x5f, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x73, 0x69, 0x7a, 0x65, 0x20, -0x70, 0x74, 0x72, 0x64, 0x69, 0x66, 0x66, 0x5f, 0x74, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x61, 0x64, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x70, 0x29, 0x20, 0x5f, 0x5f, -0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, -0x2a, 0x28, 0x70, 0x29, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x28, 0x70, 0x2c, 0x20, 0x76, 0x29, 0x20, 0x28, 0x2a, -0x28, 0x70, 0x29, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x66, 0x6c, 0x6f, -0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, 0x6e, 0x28, -0x76, 0x29, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, -0x41, 0x52, 0x45, 0x44, 0x5f, 0x50, 0x41, 0x52, 0x41, 0x4d, 0x28, -0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, +0x73, 0x69, 0x7a, 0x65, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x5f, 0x74, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, +0x5f, 0x73, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x70, 0x74, 0x72, 0x64, +0x69, 0x66, 0x66, 0x5f, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x61, 0x64, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x28, 0x70, 0x29, 0x20, 0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x28, 0x70, 0x29, 0x2d, +0x3e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x28, 0x70, 0x2c, 0x20, 0x76, 0x29, 0x20, 0x28, +0x28, 0x70, 0x29, 0x2d, 0x3e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, +0x20, 0x5f, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 
0x61, +0x6c, 0x66, 0x5f, 0x72, 0x6e, 0x28, 0x76, 0x29, 0x29, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, +0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, +0x50, 0x41, 0x52, 0x41, 0x4d, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, +0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, +0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x42, 0x4f, 0x44, +0x59, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, +0x65, 0x29, 0x20, 0x65, 0x78, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x5f, +0x5f, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x5f, 0x5f, 0x20, 0x74, +0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x5b, 0x5d, 0x3b, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, -0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, -0x44, 0x5f, 0x42, 0x4f, 0x44, 0x59, 0x28, 0x74, 0x79, 0x70, 0x65, -0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x65, 0x78, 0x74, -0x65, 0x72, 0x6e, 0x20, 0x5f, 0x5f, 0x73, 0x68, 0x61, 0x72, 0x65, -0x64, 0x5f, 0x5f, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, -0x6d, 0x65, 0x5b, 0x5d, 0x3b, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, -0x53, 0x49, 0x5a, 0x45, 0x20, 0x77, 0x61, 0x72, 0x70, 0x53, 0x69, -0x7a, 0x65, 0x0a, 0x00}; +0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x20, +0x77, 0x61, 0x72, 0x70, 0x53, 0x69, 0x7a, 0x65, 0x0a, 0x0a, 0x73, +0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, +0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, +0x0a, 0x7d, 0x3b, 0x0a, 0x00}; diff --git a/src/cluda_opencl.h b/src/cluda_opencl.h index 684872cce8..133fb275eb 100644 --- a/src/cluda_opencl.h +++ b/src/cluda_opencl.h @@ -30,11 +30,14 @@ #define ga_ulong ulong #define ga_float float #define ga_double double -#define ga_half half #define 
ga_size ulong #define ga_ssize long -#define load_half(p) vload_half(0, p) -#define store_half(p, v) vstore_half_rtn(v, 0, p) +#define load_half(p) vload_half(0, &(p)->data) +#define store_half(p, v) vstore_half_rtn(v, 0, &(p)->data) #define GA_DECL_SHARED_PARAM(type, name) , __local type *name #define GA_DECL_SHARED_BODY(type, name) #define GA_WARP_SIZE __GA_WARP_SIZE + +struct ga_half { + half data; +}; diff --git a/src/cluda_opencl.h.c b/src/cluda_opencl.h.c index a7aa914300..35bfe2f936 100644 --- a/src/cluda_opencl.h.c +++ b/src/cluda_opencl.h.c @@ -78,30 +78,32 @@ static const char cluda_opencl_h[] = { 0x6f, 0x61, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, -0x20, 0x68, 0x61, 0x6c, 0x66, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x20, -0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x73, 0x69, 0x7a, 0x65, -0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x61, 0x64, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x28, 0x70, 0x29, 0x20, 0x76, 0x6c, 0x6f, 0x61, 0x64, 0x5f, -0x68, 0x61, 0x6c, 0x66, 0x28, 0x30, 0x2c, 0x20, 0x70, 0x29, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x73, 0x74, 0x6f, -0x72, 0x65, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x70, 0x2c, 0x20, -0x76, 0x29, 0x20, 0x76, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x5f, 0x72, 0x74, 0x6e, 0x28, 0x76, 0x2c, 0x20, -0x30, 0x2c, 0x20, 0x70, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, -0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x50, 0x41, 0x52, 0x41, -0x4d, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, -0x65, 0x29, 0x20, 0x2c, 0x20, 0x5f, 0x5f, 
0x6c, 0x6f, 0x63, 0x61, -0x6c, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x6e, 0x61, 0x6d, -0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, -0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, -0x45, 0x44, 0x5f, 0x42, 0x4f, 0x44, 0x59, 0x28, 0x74, 0x79, 0x70, -0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x57, 0x41, -0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x20, 0x5f, 0x5f, 0x47, -0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, -0x0a, 0x00}; +0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, +0x20, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x73, 0x69, 0x7a, +0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x61, 0x64, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x28, 0x70, 0x29, 0x20, 0x76, 0x6c, 0x6f, 0x61, 0x64, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x30, 0x2c, 0x20, 0x26, 0x28, +0x70, 0x29, 0x2d, 0x3e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x73, 0x74, 0x6f, 0x72, +0x65, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x70, 0x2c, 0x20, 0x76, +0x29, 0x20, 0x76, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x5f, 0x72, 0x74, 0x6e, 0x28, 0x76, 0x2c, 0x20, 0x30, +0x2c, 0x20, 0x26, 0x28, 0x70, 0x29, 0x2d, 0x3e, 0x64, 0x61, 0x74, +0x61, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, +0x52, 0x45, 0x44, 0x5f, 0x50, 0x41, 0x52, 0x41, 0x4d, 0x28, 0x74, +0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x20, +0x2c, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x20, 0x74, +0x79, 0x70, 0x65, 0x20, 0x2a, 0x6e, 0x61, 0x6d, 0x65, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, +0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, +0x42, 0x4f, 0x44, 
0x59, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, +0x6e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, +0x53, 0x49, 0x5a, 0x45, 0x20, 0x5f, 0x5f, 0x47, 0x41, 0x5f, 0x57, +0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x0a, 0x0a, 0x73, +0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x00}; From aaa28b81ccf8676860fcaa05a64df79b57bd9318 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 10 Aug 2017 11:38:29 -0400 Subject: [PATCH 440/597] Unifiy the API for atomics and provide for all types 32 bits and up along with half. --- src/cluda_cuda.h | 101 ++++ src/cluda_cuda.h.c | 375 +++++++++++++- src/cluda_opencl.h | 136 +++++ src/cluda_opencl.h.c | 851 ++++++++++++++++++++++++++++---- src/gpuarray_blas_cuda_cublas.c | 135 +++-- 5 files changed, 1435 insertions(+), 163 deletions(-) diff --git a/src/cluda_cuda.h b/src/cluda_cuda.h index f2180e1970..afe87809c1 100644 --- a/src/cluda_cuda.h +++ b/src/cluda_cuda.h @@ -8,6 +8,7 @@ #undef NAN #endif #define NAN __int_as_float(0x7fffffff) +/* NULL */ #ifdef INFINITY #undef INFINITY #endif @@ -46,3 +47,103 @@ struct ga_half { ga_ushort data; }; + +#define gen_atom_add(name, argtype, wtype) \ + __device__ argtype name(argtype *addr, argtype val) { \ + union { \ + argtype a; \ + wtype w; \ + } p, n; \ + p.a = *addr; \ + do { \ + n.a = p.a + val; \ + p.w = atomicCAS((wtype *)addr, p.w, n.w); \ + } while (p.w != n.w); \ + return n.a; \ + } + +#define gen_atom32_add(name, argtype) gen_atom_add(name, argtype, unsigned int) +#define gen_atom64_add(name, argtype) gen_atom_add(name, argtype, unsigned long long) + +#define gen_atom_xchg(name, argtype, wtype) \ + __device__ argtype name(argtype *addr, argtype val) { \ + union { \ + argtype a; \ + wtype w; \ + } n, p; \ + n.a = val; \ + p.w = atomicExch((wtype 
*)addr, n.w); \ + return p.a; \ + } + +#define gen_atom32_xchg(name, argtype) gen_atom_xhg(name, argtype, unsigned int) +#define gen_atom64_xchg(name, argtype) gen_atom_xhg(name, argtype, unsigned long long) + +/* ga_int */ +#define atom_add_ig(a, b, c) atomicAdd(a, b, c) +#define atom_add_il(a, b, c) atomicAdd(a, b, c) +#define atom_xchg_ig(a, b) atomicExch(a, b) +#define atom_xchg_il(a, b) atomicExch(a, b) +/* ga_uint */ +#define atom_add_Ig(a, b, c) atomicAdd(a, b, c) +#define atom_add_Il(a, b, c) atomicAdd(a, b, c) +#define atom_xchg_Ig(a, b) atomicExch(a, b) +#define atom_xchg_Il(a, b) atomicExch(a, b) +/* ga_long */ +gen_atom64_add(atom_add_lg, ga_long) +#define atom_add_ll(a, b) atom_add_lg(a, b) +gen_atom64_xhg(atom_xchg_lg, ga_long) +#define atom_xchg_ll(a, b) atom_xchg_lg(a, b) +/* ga_ulong */ +#define atom_add_Lg(a, b, c) atomicAdd(a, b, c) +#define atom_add_Ll(a, b, c) atomicAdd(a, b, c) +#define atom_xchg_Lg(a, b) atomicExch(a, b) +#define atom_xchg_Ll(a, b) atomicExch(a, b) +/* ga_float */ +#define atom_add_fg(a, b, c) atomicAdd(a, b, c) +#define atom_add_fl(a, b, c) atomicAdd(a, b, c) +#define atom_xchg_fg(a, b) atomicExch(a, b) +#define atom_xchg_fl(a, b) atomicExch(a, b) +/* ga_double */ +#if __CUDA_ARCH__ < 600 +gen_atom64_add(atom_add_dg, ga_double) +#define atom_add_dl(a, b) atom_add_dg(a, b) +#else +#define atom_add_dg(a, b, c) atomicAdd(a, b, c) +#define atom_add_dl(a, b, c) atomicAdd(a, b, c) +#endif +gen_atom64_xchg(atom_xchg_dg, ga_double) +#define atom_xchg_dl(a, b) atom_xchg_dg(a, b) +#endif +/* ga_half */ +__device__ ga_half atom_add_hg(ga_half *addr, ga_half val) { + ga_uint *base = (ga_uint *)((ga_size)addr & ~2); + ga_uint old, assumed, sum, new_; + old = *base; + do { + assumed = old; + sum = __float2half_rn( + __half2float(val) + + __half2float((ga_half)__byte_perm(old, 0, + ((ga_size)addr & 2) ? 0x4432 : 0x4410))); + new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 
0x5410 : 0x3254); + old = atomicCAS(base, assumed, new_); + } while (assumed != old); + return (ga_half)__byte_perm(old, 0, + ((ga_size)addr & 2) ? 0x4432 : 0x4410); +} +#define atom_add_hl(a, b) atom_add_hg(a, b) + +__device__ ga_half atom_xchg_hg(ga_half *addr, ga_half val) { + ga_uint *base = (ga_uint *)((ga_size)addr & ~2); + ga_uint old, assumed, new_; + old = *base; + do { + assumed = old; + new_ = __byte_perm(old, val, ((ga_size)addr & 2) ? 0x5410 : 0x3254); + old = atomicCAS(base, assumed, new_); + } while (assumed != old); + return (ga_half)__byte_perm(old, 0, + ((ga_size)addr & 2) ? 0x4432 : 0x4410); +} +#define atom_xchg_hl(a, b) atom_xchg_hg(a, b) diff --git a/src/cluda_cuda.h.c b/src/cluda_cuda.h.c index da03a43742..5dfca8236f 100644 --- a/src/cluda_cuda.h.c +++ b/src/cluda_cuda.h.c @@ -25,6 +25,7 @@ static const char cluda_cuda_h[] = { 0x6e, 0x65, 0x20, 0x4e, 0x41, 0x4e, 0x20, 0x5f, 0x5f, 0x69, 0x6e, 0x74, 0x5f, 0x61, 0x73, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x30, 0x78, 0x37, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x29, +0x0a, 0x2f, 0x2a, 0x20, 0x4e, 0x55, 0x4c, 0x4c, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, 0x66, 0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x59, 0x0a, 0x23, 0x75, 0x6e, 0x64, 0x65, 0x66, 0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x59, 0x0a, @@ -118,4 +119,376 @@ static const char cluda_cuda_h[] = { 0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, -0x0a, 0x7d, 0x3b, 0x0a, 0x00}; +0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, +0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x77, 0x74, 0x79, +0x70, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 
0x20, 0x20, 0x5f, +0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x61, +0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, +0x28, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, +0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, +0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x77, 0x74, 0x79, 0x70, 0x65, 0x20, 0x77, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, +0x2a, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, +0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x61, +0x20, 0x2b, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x28, +0x77, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, +0x72, 0x2c, 0x20, 0x70, 0x2e, 0x77, 0x2c, 0x20, 0x6e, 0x2e, 0x77, +0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, +0x6c, 0x65, 0x20, 0x28, 0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, +0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, +0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, +0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, +0x65, 0x6e, 0x5f, 0x61, 
0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, +0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, +0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x67, 0x65, 0x6e, 0x5f, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, +0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, +0x2c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, +0x69, 0x6e, 0x74, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, +0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x67, +0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, +0x79, 0x70, 0x65, 0x2c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, +0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, +0x67, 0x29, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, +0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x77, 0x74, 0x79, +0x70, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x5f, +0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x61, +0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, +0x28, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, +0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, +0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x77, 0x74, 0x79, 0x70, 0x65, 0x20, 0x77, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x7d, 0x20, 0x6e, 0x2c, 0x20, 0x70, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, +0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, +0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, +0x63, 0x68, 0x28, 0x28, 0x77, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, +0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x70, 0x2e, 0x61, 0x3b, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x67, +0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x68, 0x67, +0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, +0x79, 0x70, 0x65, 0x2c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, +0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, +0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, +0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x29, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x78, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, +0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x75, 0x6e, +0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, +0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, +0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6c, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 
0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, +0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, +0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x6c, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, +0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x67, +0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, +0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x5f, 0x6c, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, +0x67, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 
0x64, 0x5f, 0x6c, 0x6c, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, +0x36, 0x34, 0x5f, 0x78, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x2c, 0x20, 0x67, +0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, +0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, +0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, +0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, +0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, +0x20, 0x2a, 0x2f, 0x0a, 0x23, 
0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, +0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, +0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, +0x6c, 0x65, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x20, 0x5f, +0x5f, 0x43, 0x55, 0x44, 0x41, 0x5f, 0x41, 0x52, 0x43, 0x48, 0x5f, +0x5f, 0x20, 0x3c, 0x20, 0x36, 0x30, 0x30, 0x0a, 0x67, 0x65, 0x6e, +0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, +0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, +0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, +0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x23, 0x65, 0x6c, 0x73, 0x65, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 
0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, +0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, +0x66, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, +0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, +0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x29, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x2f, 0x2a, 0x20, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, +0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, 0x28, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, +0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, +0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, +0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, +0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, +0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, +0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, +0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, +0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, +0x65, 
0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x6e, 0x65, +0x77, 0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, +0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, +0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, +0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x6d, 0x20, 0x3d, 0x20, +0x5f, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, +0x66, 0x5f, 0x72, 0x6e, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5f, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, +0x28, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x2b, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, +0x61, 0x74, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x29, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, +0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, +0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, +0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, +0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x29, 0x29, +0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, +0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, +0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x73, 0x75, 
0x6d, +0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, +0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, +0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, +0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, +0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, +0x6e, 0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, +0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, +0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, +0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, +0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x29, 0x5f, 0x5f, +0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, +0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, +0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, +0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, +0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, +0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x6c, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, +0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, +0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, +0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 
0x7b, 0x0a, 0x20, +0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, +0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, +0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, +0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, +0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, +0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, +0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, +0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, +0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, +0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, +0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, +0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, +0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x2c, 0x20, +0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, +0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, +0x30, 0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, +0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, +0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, +0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, +0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, +0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, +0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x28, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x29, 0x5f, 0x5f, 0x62, 0x79, +0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, +0x2c, 0x20, 0x30, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, +0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, +0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, +0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x7d, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, 0x6c, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x00}; diff --git a/src/cluda_opencl.h b/src/cluda_opencl.h index 133fb275eb..60d3b464f8 100644 --- a/src/cluda_opencl.h +++ b/src/cluda_opencl.h @@ -4,9 +4,11 @@ #define GLOBAL_MEM __global #define LOCAL_MEM __local #define LOCAL_MEM_ARG __local +/* NAN */ #ifndef NULL #define NULL ((void*)0) #endif +/* INFINITY */ #define LID_0 get_local_id(0) #define LID_1 get_local_id(1) #define LID_2 get_local_id(2) @@ -41,3 +43,137 @@ struct ga_half { half data; }; + +#pragma OPENCL_EXTENSION cl_khr_int64_base_atomics: enable + +#define gen_atom32_add(name, argtype, aspace) \ + argtype name(volatile aspace argtype *addr, argtype val) { \ + union { \ + argtype a; \ + int w; \ + } p, n; \ + int a; \ + p.a = *addr; \ + do { \ + a = p.w; \ + n.a = p.a + val; \ + p.w = atomic_cmpxhg((volatile aspace int *)addr, a, n.w); \ + } while (p.w != a); \ + return n.a; \ + } + +#define gen_atom64_add(name, argtype, aspace) \ + argtype name(volatile aspace argtype *addr, argtype val) { \ + union { \ + argtype a; \ + long w; \ + } p, n; \ + long a; \ + p.a = *addr; \ + do { \ + a = p.w; \ + n.a = p.a + val; \ + p.w = atom_cmpxhg((volatile aspace long *)addr, a, n.w); \ + } while (p.w != a); \ + return n.a; \ + } + +#define gen_atom64_xchg(name, argtype, aspace) \ + argtype name(volatile aspace argtype *addr, argtype val) { \ + union { \ + argtype a; \ + long w; \ + } p, n; \ + n.a = val; \ + p.w = atom_xchg((volatile aspace 
wtype *)addr, n.w); \ + return p.a; \ + } + +/* ga_int */ +#define atom_add_ig(a, b) atomic_add(a, b) +#define atom_add_il(a, b) atomic_add(a, b) +#define atom_xchg_ig(a, b) atomic_xchg(a, b) +#define atom_xchg_il(a, b) atomic_xchg(a, b) +/* ga_uint */ +#define atom_add_Ig(a, b) atomic_add(a, b) +#define atom_add_Il(a, b) atomic_add(a, b) +#define atom_xchg_Ig(a, b) atomic_xchg(a, b) +#define atom_xchg_Il(a, b) atomic_xchg(a, b) +/* ga_long */ +#define atom_add_lg(a, b) atom_add(a, b) +#define atom_add_ll(a, b) atom_add(a, b) +#define atom_xchg_lg(a, b) atom_xchg(a, b) +#define atom_xchg_ll(a, b) atom_xchg(a, b) +/* ga_ulong */ +#define atom_add_Lg(a, b) atom_add(a, b) +#define atom_add_Ll(a, b) atom_add(a, b) +#define atom_xchg_Lg(a, b) atom_xchg(a, b) +#define atom_xchg_Ll(a, b) atom_xchg(a, b) +/* ga_float */ +gen_atom32_add(atom_add_fg, ga_float, global) +gen_atom32_add(atom_add_fl, ga_float, local) +#define atom_xchg_fg(a, b) atomic_xchg(a, b) +#define atom_xchg_fl(a, b) atomic_xchg(a, b) +/* ga_double */ +gen_atom64_add(atom_add_dg, ga_double, global) +gen_atom64_add(atom_add_dl, ga_double, local) +gen_atom64_xchg(atom_xchg_dg, ga_double, global) +gen_atom64_xchg(atom_xchg_dl, ga_double, local) +/* ga_half */ +#define gen_atomh_add(name, aspace) \ + ga_half name(volatile aspace ga_half *addr, ga_half val) { \ + ga_size off = (ga_size)addr & 2; \ + volatile aspace int *base = (volatile aspace int *)((ga_size)addr - off); \ + int o, a, n; \ + float fo; \ + float fval; \ + ga_half hn; \ + fval = vload_half(0, &val->data); \ + o = *base; \ + do { \ + a = o; \ + /* This loads the half of `o` that we want to update */ \ + fo = vload_half(off, (__private half *)&o); \ + /* We compute the half addition in float 32 */ \ + store_half(fval + fo, &hn); \ + /* Now we reassemble the the parts to form a 32-bits n */ \ + if (off == 2) \ + n = (int)hn->data << 16 & (o & 0xffff); \ + else \ + n = (int)hn->data & (o & 0xffff0000); \ + o = atomic_cmpxchg(base, a, n); \ + } while 
(o != a); \ + if (off == 2) \ + hn->data = (ushort)(o >> 16); \ + else \ + hn->data = (ushort)(o & 0xffff); \ + return hn; \ + } + +#define gen_atomh_xchg(name, aspace) \ + ga_half name(volatile aspace ga_half *addr, ga_half *val) { \ + ga_size off = (ga_size)addr & 2; \ + volatile aspace int *base = (volatile aspace int *)((ga_size)addr - off); \ + int o, a, n; \ + ga_half hr; \ + o = *base; \ + do { \ + a = o; \ + /* we have to combine our half value with the right part of `o` */ \ + if (off == 2) \ + n = (int)val->data << 16 & (o & 0xffff); \ + else \ + n = (int)val->data & (o & 0xffff0000); \ + o = atomic_cmpxchg(base, a, n); \ + } while (o != a); \ + if (off == 2) \ + hr->data = (ushort)o << 16; \ + else \ + hr->data = (ushort)o & 0xffff; \ + return hr; \ + } + +gen_atomh_add(atom_add_hg, global) +gen_atomh_add(atom_add_hl, local) +gen_atomh_xchg(atom_xchg_hg, global) +gen_atomh_xchg(atom_xchg_hl, local) diff --git a/src/cluda_opencl.h.c b/src/cluda_opencl.h.c index 35bfe2f936..1328e619a8 100644 --- a/src/cluda_opencl.h.c +++ b/src/cluda_opencl.h.c @@ -16,94 +16,773 @@ static const char cluda_opencl_h[] = { 0x45, 0x4d, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x5f, 0x41, 0x52, 0x47, 0x20, -0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x0a, 0x23, 0x69, 0x66, -0x6e, 0x64, 0x65, 0x66, 0x20, 0x4e, 0x55, 0x4c, 0x4c, 0x0a, 0x20, -0x20, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4e, 0x55, -0x4c, 0x4c, 0x20, 0x28, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x2a, 0x29, -0x30, 0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x23, +0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x0a, 0x2f, 0x2a, 0x20, +0x4e, 0x41, 0x4e, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x6e, +0x64, 0x65, 0x66, 0x20, 0x4e, 0x55, 0x4c, 0x4c, 0x0a, 0x20, 0x20, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4e, 0x55, 0x4c, +0x4c, 0x20, 0x28, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x2a, 0x29, 0x30, 
+0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x2f, 0x2a, +0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x59, 0x20, 0x2a, +0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, +0x49, 0x44, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, +0x63, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, -0x30, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, -0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, 0x31, 0x20, 0x67, +0x31, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, +0x5f, 0x69, 0x64, 0x28, 0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, 0x32, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x69, 0x64, +0x28, 0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, +0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x69, 0x7a, 0x65, +0x28, 0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x67, 0x65, 0x74, +0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x28, 0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x4c, 0x49, 0x44, 0x5f, 0x32, 0x20, 0x67, 0x65, 0x74, 0x5f, -0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x32, 0x29, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, -0x49, 0x4d, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, -0x63, 0x61, 0x6c, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x28, 0x30, 0x29, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, -0x49, 0x4d, 0x5f, 0x31, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, -0x63, 0x61, 0x6c, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x28, 0x31, 0x29, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, -0x49, 0x4d, 0x5f, 0x32, 0x20, 0x67, 0x65, 0x74, 0x5f, 
0x6c, 0x6f, -0x63, 0x61, 0x6c, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x28, 0x32, 0x29, +0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 0x67, 0x65, 0x74, +0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x69, 0x7a, 0x65, +0x28, 0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x47, 0x49, 0x44, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, 0x5f, +0x67, 0x72, 0x6f, 0x75, 0x70, 0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, -0x44, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x72, 0x6f, -0x75, 0x70, 0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x31, +0x44, 0x5f, 0x31, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x72, 0x6f, +0x75, 0x70, 0x5f, 0x69, 0x64, 0x28, 0x31, 0x29, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x32, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x5f, -0x69, 0x64, 0x28, 0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x32, 0x20, 0x67, 0x65, -0x74, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x5f, 0x69, 0x64, 0x28, -0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x47, 0x44, 0x49, 0x4d, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, 0x5f, -0x6e, 0x75, 0x6d, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x28, -0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x47, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x67, 0x65, 0x74, 0x5f, -0x6e, 0x75, 0x6d, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x28, -0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x47, 0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 0x67, 0x65, 0x74, 0x5f, -0x6e, 0x75, 0x6d, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x28, -0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x67, 0x61, 0x5f, 0x62, 0x6f, 0x6f, 0x6c, 0x20, 0x75, 0x63, 0x68, -0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x67, 0x61, 0x5f, 0x62, 0x79, 0x74, 0x65, 
0x20, 0x63, 0x68, 0x61, -0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, -0x61, 0x5f, 0x75, 0x62, 0x79, 0x74, 0x65, 0x20, 0x75, 0x63, 0x68, -0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x67, 0x61, 0x5f, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x73, 0x68, -0x6f, 0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, -0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, -0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x75, 0x69, -0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, -0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, -0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x75, 0x6c, 0x6f, -0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6c, -0x6f, 0x61, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, -0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, -0x20, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x73, 0x69, 0x7a, -0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x61, 0x64, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x28, 0x70, 0x29, 0x20, 0x76, 0x6c, 0x6f, 0x61, 0x64, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x30, 0x2c, 0x20, 0x26, 0x28, -0x70, 0x29, 0x2d, 0x3e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x73, 0x74, 0x6f, 0x72, -0x65, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x70, 0x2c, 0x20, 0x76, -0x29, 0x20, 0x76, 0x73, 0x74, 
0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x5f, 0x72, 0x74, 0x6e, 0x28, 0x76, 0x2c, 0x20, 0x30, -0x2c, 0x20, 0x26, 0x28, 0x70, 0x29, 0x2d, 0x3e, 0x64, 0x61, 0x74, -0x61, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, -0x52, 0x45, 0x44, 0x5f, 0x50, 0x41, 0x52, 0x41, 0x4d, 0x28, 0x74, -0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x20, -0x2c, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x20, 0x74, -0x79, 0x70, 0x65, 0x20, 0x2a, 0x6e, 0x61, 0x6d, 0x65, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, -0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, -0x42, 0x4f, 0x44, 0x59, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, -0x6e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, -0x53, 0x49, 0x5a, 0x45, 0x20, 0x5f, 0x5f, 0x47, 0x41, 0x5f, 0x57, -0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x0a, 0x0a, 0x73, -0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x68, 0x61, 0x6c, 0x66, -0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x00}; +0x69, 0x64, 0x28, 0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x30, 0x20, 0x67, +0x65, 0x74, 0x5f, 0x6e, 0x75, 0x6d, 0x5f, 0x67, 0x72, 0x6f, 0x75, +0x70, 0x73, 0x28, 0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x67, +0x65, 0x74, 0x5f, 0x6e, 0x75, 0x6d, 0x5f, 0x67, 0x72, 0x6f, 0x75, +0x70, 0x73, 0x28, 0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 0x67, +0x65, 0x74, 0x5f, 0x6e, 0x75, 0x6d, 0x5f, 0x67, 0x72, 0x6f, 0x75, +0x70, 0x73, 0x28, 0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x6f, 0x6f, 0x6c, 0x20, +0x75, 0x63, 0x68, 
0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x20, +0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x62, 0x79, 0x74, 0x65, 0x20, +0x75, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x68, 0x6f, 0x72, 0x74, +0x20, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, +0x72, 0x74, 0x20, 0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x69, +0x6e, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, +0x20, 0x75, 0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, +0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, +0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, +0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, +0x6c, 0x65, 0x20, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, +0x69, 0x7a, 0x65, 0x20, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, +0x73, 0x69, 0x7a, 0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x61, 0x64, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x70, 0x29, 0x20, 0x76, 0x6c, +0x6f, 0x61, 0x64, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x30, 0x2c, +0x20, 0x26, 0x28, 0x70, 0x29, 0x2d, 0x3e, 0x64, 0x61, 0x74, 0x61, +0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x73, +0x74, 
0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x70, +0x2c, 0x20, 0x76, 0x29, 0x20, 0x76, 0x73, 0x74, 0x6f, 0x72, 0x65, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, 0x74, 0x6e, 0x28, 0x76, +0x2c, 0x20, 0x30, 0x2c, 0x20, 0x26, 0x28, 0x70, 0x29, 0x2d, 0x3e, +0x64, 0x61, 0x74, 0x61, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, +0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x50, 0x41, 0x52, 0x41, +0x4d, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, +0x65, 0x29, 0x20, 0x2c, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, +0x6c, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x6e, 0x61, 0x6d, +0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, +0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, +0x45, 0x44, 0x5f, 0x42, 0x4f, 0x44, 0x59, 0x28, 0x74, 0x79, 0x70, +0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x57, 0x41, +0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x20, 0x5f, 0x5f, 0x47, +0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, +0x0a, 0x0a, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x7d, +0x3b, 0x0a, 0x0a, 0x23, 0x70, 0x72, 0x61, 0x67, 0x6d, 0x61, 0x20, +0x4f, 0x50, 0x45, 0x4e, 0x43, 0x4c, 0x5f, 0x45, 0x58, 0x54, 0x45, +0x4e, 0x53, 0x49, 0x4f, 0x4e, 0x20, 0x63, 0x6c, 0x5f, 0x6b, 0x68, +0x72, 0x5f, 0x69, 0x6e, 0x74, 0x36, 0x34, 0x5f, 0x62, 0x61, 0x73, +0x65, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x73, 0x3a, 0x20, +0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x0a, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, +0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, +0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, +0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 
0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, +0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, +0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, +0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, +0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, +0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, +0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x69, 0x6e, 0x74, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x2a, +0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, +0x20, 0x3d, 0x20, 0x70, 0x2e, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x61, 0x20, +0x2b, 0x20, 0x76, 0x61, 0x6c, 0x3b, 
0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x68, 0x67, +0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, +0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, +0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x2c, 0x20, +0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, +0x28, 0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x29, 0x3b, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, +0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, +0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, +0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, +0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x61, 0x72, 0x67, 
0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, +0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, +0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, +0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, +0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, +0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, +0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x2a, +0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, +0x20, 0x3d, 0x20, 0x70, 0x2e, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x61, 0x20, +0x2b, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x68, 0x67, 0x28, 0x28, +0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, +0x70, 0x61, 0x63, 0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, +0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, +0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, +0x28, 0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x29, 0x3b, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, +0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, +0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, +0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, +0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, +0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, +0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 
0x74, 0x79, +0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, +0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, +0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, +0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x77, 0x3b, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, +0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, +0x20, 0x77, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x29, 0x61, 0x64, +0x64, 0x72, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x70, 0x2e, 0x61, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, +0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, +0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6c, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, 0x67, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 
0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, +0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, +0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, +0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 
0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, +0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, +0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, +0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, +0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x2a, +0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, +0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, +0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, +0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, +0x6d, 
0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x2c, 0x20, 0x67, +0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2c, 0x20, 0x6c, 0x6f, +0x63, 0x61, 0x6c, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, +0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x2f, +0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, +0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, +0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, +0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, +0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x2c, 0x20, 0x67, +0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x6c, +0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, +0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, +0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, +0x65, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, +0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, +0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x6c, 0x6f, 
0x63, +0x61, 0x6c, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, +0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, +0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, +0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, +0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, +0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, +0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x6f, +0x66, 0x66, 0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, +0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x76, +0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, +0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, +0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, +0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, +0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, +0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x2d, +0x20, 0x6f, 0x66, 0x66, 0x29, 0x3b, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x2c, 0x20, 0x61, 0x2c, +0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, +0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6f, 0x3b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, +0x74, 0x20, 0x66, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x68, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x3d, 0x20, 0x76, 0x6c, 0x6f, +0x61, 0x64, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x30, 0x2c, 0x20, +0x26, 0x76, 0x61, 0x6c, 0x2d, 0x3e, 0x64, 0x61, 0x74, 0x61, 0x29, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 
0x20, 0x20, 0x20, 0x6f, 0x20, +0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x20, 0x3d, 0x20, 0x6f, 0x3b, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x2f, 0x2a, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x6c, +0x6f, 0x61, 0x64, 0x73, 0x20, 0x74, 0x68, 0x65, 0x20, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x6f, 0x66, 0x20, 0x60, 0x6f, 0x60, 0x20, 0x74, +0x68, 0x61, 0x74, 0x20, 0x77, 0x65, 0x20, 0x77, 0x61, 0x6e, 0x74, +0x20, 0x74, 0x6f, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x20, +0x2a, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, +0x6f, 0x20, 0x3d, 0x20, 0x76, 0x6c, 0x6f, 0x61, 0x64, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x28, 0x6f, 0x66, 0x66, 0x2c, 0x20, 0x28, 0x5f, +0x5f, 0x70, 0x72, 0x69, 
0x76, 0x61, 0x74, 0x65, 0x20, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x2a, 0x29, 0x26, 0x6f, 0x29, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2a, 0x20, 0x57, +0x65, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x75, 0x74, 0x65, 0x20, 0x74, +0x68, 0x65, 0x20, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x64, 0x64, +0x69, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x66, 0x6c, +0x6f, 0x61, 0x74, 0x20, 0x33, 0x32, 0x20, 0x2a, 0x2f, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x28, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, +0x66, 0x6f, 0x2c, 0x20, 0x26, 0x68, 0x6e, 0x29, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x2f, 0x2a, 0x20, 0x4e, 0x6f, 0x77, 0x20, 0x77, 0x65, 0x20, +0x72, 0x65, 0x61, 0x73, 0x73, 0x65, 0x6d, 0x62, 0x6c, 0x65, 0x20, +0x74, 0x68, 0x65, 0x20, 0x74, 0x68, 0x65, 0x20, 0x70, 0x61, 0x72, +0x74, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x66, 0x6f, 0x72, 0x6d, 0x20, +0x61, 0x20, 0x33, 0x32, 0x2d, 0x62, 0x69, 0x74, 0x73, 0x20, 0x6e, +0x20, 0x2a, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, +0x20, 0x28, 0x6f, 0x66, 0x66, 0x20, 0x3d, 0x3d, 0x20, 0x32, 0x29, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x20, 0x3d, +0x20, 0x28, 0x69, 0x6e, 0x74, 0x29, 0x68, 0x6e, 0x2d, 0x3e, 0x64, +0x61, 0x74, 0x61, 0x20, 0x3c, 0x3c, 0x20, 0x31, 0x36, 0x20, 0x26, +0x20, 0x28, 0x6f, 0x20, 0x26, 0x20, 0x30, 0x78, 0x66, 0x66, 0x66, +0x66, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x6e, 0x20, 0x3d, 0x20, 0x28, 0x69, 0x6e, 0x74, 0x29, +0x68, 0x6e, 0x2d, 0x3e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x26, 0x20, +0x28, 0x6f, 0x20, 0x26, 0x20, 0x30, 0x78, 0x66, 0x66, 0x66, 0x66, +0x30, 0x30, 0x30, 0x30, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x20, 0x3d, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, 0x70, +0x78, 0x63, 0x68, 0x67, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, +0x61, 0x2c, 0x20, 0x6e, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, +0x28, 0x6f, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x29, 0x3b, 0x20, 0x20, 
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x69, 0x66, 0x20, 0x28, 0x6f, 0x66, 0x66, 0x20, 0x3d, 0x3d, 0x20, +0x32, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, +0x6e, 0x2d, 0x3e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x28, +0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x29, 0x28, 0x6f, 0x20, 0x3e, +0x3e, 0x20, 0x31, 0x36, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x68, 0x6e, 0x2d, 0x3e, 0x64, 0x61, 0x74, +0x61, 0x20, 0x3d, 0x20, 0x28, 0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, +0x29, 0x28, 0x6f, 0x20, 0x26, 0x20, 0x30, 0x78, 0x66, 0x66, 0x66, +0x66, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, +0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x6e, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, +0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, +0x6d, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, +0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, +0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x20, 0x2a, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, +0x65, 0x20, 0x6f, 0x66, 0x66, 0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, +0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, +0x26, 0x20, 0x32, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, +0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, +0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, +0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, +0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 
0x2a, 0x29, 0x28, 0x28, +0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, +0x72, 0x20, 0x2d, 0x20, 0x6f, 0x66, 0x66, 0x29, 0x3b, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x2c, +0x20, 0x61, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, +0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, +0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x61, 0x20, 0x3d, 0x20, 0x6f, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x2f, 0x2a, 0x20, 0x77, 0x65, 0x20, 0x68, 0x61, +0x76, 0x65, 0x20, 0x74, 0x6f, 0x20, 0x63, 0x6f, 0x6d, 0x62, 0x69, +0x6e, 0x65, 0x20, 0x6f, 0x75, 0x72, 0x20, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x77, 0x69, 0x74, 0x68, +0x20, 0x74, 0x68, 0x65, 0x20, 0x72, 0x69, 0x67, 0x68, 0x74, 0x20, +0x70, 0x61, 0x72, 0x74, 0x20, 0x6f, 0x66, 0x20, 0x60, 0x6f, 0x60, +0x20, 0x2a, 0x2f, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x69, 0x66, 0x20, 0x28, 0x6f, 0x66, 0x66, 0x20, 0x3d, 0x3d, +0x20, 0x32, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x6e, 0x20, 0x3d, 0x20, 0x28, 0x69, 0x6e, 0x74, 0x29, 0x76, 0x61, +0x6c, 0x2d, 0x3e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3c, 0x3c, 0x20, +0x31, 0x36, 0x20, 0x26, 0x20, 0x28, 0x6f, 0x20, 0x26, 0x20, 0x30, +0x78, 0x66, 0x66, 0x66, 0x66, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x20, 0x3d, 0x20, 0x28, 0x69, +0x6e, 0x74, 0x29, 0x76, 0x61, 0x6c, 0x2d, 0x3e, 0x64, 0x61, 0x74, +0x61, 0x20, 0x26, 0x20, 0x28, 0x6f, 0x20, 0x26, 0x20, 0x30, 0x78, +0x66, 0x66, 0x66, 0x66, 0x30, 0x30, 0x30, 0x30, 0x29, 0x3b, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x6f, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, +0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x62, 0x61, 0x73, +0x65, 0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, 0x29, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, +0x6c, 0x65, 0x20, 0x28, 0x6f, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x29, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6f, 0x66, 0x66, 0x20, +0x3d, 0x3d, 0x20, 0x32, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 
0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x68, 0x72, 0x2d, 0x3e, 0x64, 0x61, 0x74, 0x61, 0x20, +0x3d, 0x20, 0x28, 0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x29, 0x6f, +0x20, 0x3c, 0x3c, 0x20, 0x31, 0x36, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x65, 0x6c, 0x73, +0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x72, 0x2d, 0x3e, +0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x28, 0x75, 0x73, 0x68, +0x6f, 0x72, 0x74, 0x29, 0x6f, 0x20, 0x26, 0x20, 0x30, 0x78, 0x66, +0x66, 0x66, 0x66, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x72, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, +0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x61, +0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 
0x64, +0x5f, 0x68, 0x67, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, +0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, +0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x5f, 0x68, 0x6c, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, +0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, +0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, 0x67, 0x2c, 0x20, 0x67, +0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, +0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, +0x6c, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x00}; diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 4bf0f4762a..7a7c593ebf 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -77,10 +77,11 @@ typedef struct _blas_handle { #define LARGE_VAL(v) (v >= INT_MAX) static const char *code_sgemvBH_N_a1_b1_small = \ - "extern \"C\"__global__ void sgemv(const float *A[], size_t lda, " \ - " const float *x[], size_t incx, " \ - " float *y[], size_t incy, " \ - " size_t b, size_t m, size_t n) {" \ + "#include \n" \ + "KERNEL void sgemv(const float *A[], size_t lda, " \ + " const float *x[], size_t incx, " \ + " float *y[], size_t incy, " \ + " size_t b, size_t m, size_t n) {" \ " for (size_t p = blockIdx.y * blockDim.y + threadIdx.y; p < b;" \ " p += gridDim.y * blockDim.y) {" \ " for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < m;" \ @@ -94,51 +95,37 @@ static const char *code_sgemvBH_N_a1_b1_small = \ " Ap += lda;" \ " xp += incx;" \ " }" \ - " atomicAdd(&y[p][i*incy], yi);" \ + " atom_add_fg(&y[p][i*incy], yi);" \ " }" \ " }" \ "}\n"; -static const char *code_sgemvBH_T_a1_b1_small = \ - "extern \"C\" __global__ void sgemv(const float *A[], size_t lda, " \ - " const float *x[], size_t incx, " \ - " float *y[], size_t 
incy, " \ - " size_t b, size_t m, size_t n) {" \ - " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ - " size_t p = blockIdx.y * blockDim.y + threadIdx.y;" \ - " if (i >= m || p >= b) return;" \ - " float yi = 0.0f;" \ - " const float *Ap = A[p] + i * lda;" \ - " const float *xp = x[p];\n" \ - " # pragma unroll 32\n" \ - " for (size_t j = 0; j < n; j++) {" \ - " yi += Ap[j] * xp[0];" \ - " xp += incx;" \ - " }" \ - " atomicAdd(&y[p][i*incy], yi);" \ +static const char *code_sgemvBH_T_a1_b1_small = \ + "#include \n" \ + "KERNEL void sgemv(const float *A[], size_t lda, " \ + " const float *x[], size_t incx, " \ + " float *y[], size_t incy, " \ + " size_t b, size_t m, size_t n) {" \ + " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ + " size_t p = blockIdx.y * blockDim.y + threadIdx.y;" \ + " if (i >= m || p >= b) return;" \ + " float yi = 0.0f;" \ + " const float *Ap = A[p] + i * lda;" \ + " const float *xp = x[p];\n" \ + " # pragma unroll 32\n" \ + " for (size_t j = 0; j < n; j++) {" \ + " yi += Ap[j] * xp[0];" \ + " xp += incx;" \ + " }" \ + " atom_add_fg(&y[p][i*incy], yi);" \ "}\n"; -static const char *atomicadd_double = \ - "#if __CUDA_ARCH__ < 600\n" \ - "__device__ double atomicAdd(double* address, double val) {" \ - " unsigned long long int* address_as_ull =" \ - " (unsigned long long int*)address;" \ - " unsigned long long int old = *address_as_ull, assumed;" \ - " do {" \ - " assumed = old;" \ - " old = atomicCAS(address_as_ull, assumed," \ - " __double_as_longlong(val +" \ - " __longlong_as_double(assumed)));" \ - " } while (assumed != old);" \ - " return __longlong_as_double(old);" \ - "}\n" \ - "#endif\n"; - static const char *code_dgemvBH_N_a1_b1_small = \ - "extern \"C\" __global__ void dgemv(const double *A[], size_t lda, " \ - " const double *x[], size_t incx, " \ - " double *y[], size_t incy, " \ - " size_t b, size_t m, size_t n) {" \ + "#include \n" \ + "KERNEL void dgemv(const double *A[], size_t lda, " \ + " const double *x[], size_t 
incx, " \ + " double *y[], size_t incy, " \ + " size_t b, size_t m, size_t n) {" \ " for (size_t p = blockIdx.y * blockDim.y + threadIdx.y; p < b;" \ " p += gridDim.y * blockDim.y) {" \ " for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < m;" \ @@ -152,32 +139,34 @@ static const char *code_dgemvBH_N_a1_b1_small = \ " Ap += lda;" \ " xp += incx;" \ " }" \ - " atomicAdd(&y[p][i*incy], yi);" \ + " atom_add_dg(&y[p][i*incy], yi);" \ " }" \ " }" \ "}\n"; -static const char *code_dgemvBH_T_a1_b1_small = \ - "extern \"C\" __global__ void dgemv(const double *A[], size_t lda, " \ - " const double *x[], size_t incx, " \ - " double *y[], size_t incy, " \ - " size_t b, size_t m, size_t n) {" \ - " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ - " size_t p = blockIdx.y * blockDim.y + threadIdx.y;" \ - " if (i >= m || p >= b) return;" \ - " double yi = 0.0;" \ - " const double *Ap = A[p] + i * lda;" \ - " const double *xp = x[p];\n" \ - " # pragma unroll 32\n" \ - " for (size_t j = 0; j < n; j++) {" \ - " yi += Ap[j] * xp[0];" \ - " xp += incx;" \ - " }" \ - " atomicAdd(&y[p][i*incy], yi);" \ +static const char *code_dgemvBH_T_a1_b1_small = \ + "#include \n" \ + "KERNEL void dgemv(const double *A[], size_t lda, " \ + " const double *x[], size_t incx, " \ + " double *y[], size_t incy, " \ + " size_t b, size_t m, size_t n) {" \ + " size_t i = blockIdx.x * blockDim.x + threadIdx.x;" \ + " size_t p = blockIdx.y * blockDim.y + threadIdx.y;" \ + " if (i >= m || p >= b) return;" \ + " double yi = 0.0;" \ + " const double *Ap = A[p] + i * lda;" \ + " const double *xp = x[p];\n" \ + " # pragma unroll 32\n" \ + " for (size_t j = 0; j < n; j++) {" \ + " yi += Ap[j] * xp[0];" \ + " xp += incx;" \ + " }" \ + " atom_add_dg(&y[p][i*incy], yi);" \ "}\n"; static const char *code_sgerBH_gen_small = \ - "extern \"C\" __global__ void _sgerBH_gen_small(" \ + "#include \n" \ + "KERNEL void _sgerBH_gen_small(" \ " const float *x[], size_t incx," \ " const float *y[], size_t incy," \ " 
float alpha, float *A[], size_t lda," \ @@ -186,13 +175,14 @@ static const char *code_sgerBH_gen_small = \ " size_t j = blockIdx.y * blockDim.y + threadIdx.y;" \ " if (i >= m || j >= n) return;" \ " for (size_t p = blockIdx.z; p < b; p += gridDim.z) {" \ - " atomicAdd(&A[p][j * lda + i]," \ - " alpha * x[p][i * incx] * y[p][j * incy]);" \ + " atom_add_fg(&A[p][j * lda + i]," \ + " alpha * x[p][i * incx] * y[p][j * incy]);" \ " }" \ "}\n"; static const char *code_dgerBH_gen_small = \ - "extern \"C\" __global__ void _dgerBH_gen_small(" \ + "#include \n" \ + "KERNEL void _dgerBH_gen_small(" \ " const double *x[], size_t incx, " \ " const double *y[], size_t incy," \ " double alpha, double *A[], size_t lda," \ @@ -201,15 +191,14 @@ static const char *code_dgerBH_gen_small = \ " size_t j = blockIdx.y * blockDim.y + threadIdx.y;" \ " if (i >= m || j >= n) return;" \ " for (size_t p = blockIdx.z; p < b; p += gridDim.z) {" \ - " atomicAdd(&A[p][j * lda + i]," \ - " alpha * x[p][i * incx] * y[p][j * incy]);" \ + " atom_add_dg(&A[p][j * lda + i]," \ + " alpha * x[p][i * incx] * y[p][j * incy]);" \ " }" \ "}\n"; static int setup(gpucontext *c) { cuda_context *ctx = (cuda_context *)c; blas_handle *handle; - const char *tmp[2]; cublasStatus_t err; int types[10]; int e; @@ -254,13 +243,9 @@ static int setup(gpucontext *c) { if (e != GA_NO_ERROR) goto e1; e = GpuKernel_init(&handle->sgemvBH_T_a1_b1_small, c, 1, &code_sgemvBH_T_a1_b1_small, NULL, "sgemv", 9, types, 0, NULL); if (e != GA_NO_ERROR) goto e2; - tmp[0] = atomicadd_double; - tmp[1] = code_dgemvBH_N_a1_b1_small; - e = GpuKernel_init(&handle->dgemvBH_N_a1_b1_small, c, 2, tmp, NULL, "dgemv", 9, types, GA_USE_DOUBLE, NULL); + e = GpuKernel_init(&handle->dgemvBH_N_a1_b1_small, c, 1, &code_dgemvBH_N_a1_b1_small, NULL, "dgemv", 9, types, GA_USE_DOUBLE, NULL); if (e != GA_NO_ERROR) goto e3; - tmp[0] = atomicadd_double; - tmp[1] = code_dgemvBH_T_a1_b1_small; - e = GpuKernel_init(&handle->dgemvBH_T_a1_b1_small, c, 2, tmp, NULL, 
"dgemv", 9, types, GA_USE_DOUBLE, NULL); + e = GpuKernel_init(&handle->dgemvBH_T_a1_b1_small, c, 1, &code_dgemvBH_T_a1_b1_small, NULL, "dgemv", 9, types, GA_USE_DOUBLE, NULL); if (e != GA_NO_ERROR) goto e4; types[0] = GA_BUFFER; @@ -276,9 +261,7 @@ static int setup(gpucontext *c) { e = GpuKernel_init(&handle->sgerBH_gen_small, c, 1, &code_sgerBH_gen_small, NULL, "_sgerBH_gen_small", 10, types, 0, NULL); if (e != GA_NO_ERROR) goto e5; types[4] = GA_DOUBLE; - tmp[0] = atomicadd_double; - tmp[1] = code_dgerBH_gen_small; - e = GpuKernel_init(&handle->dgerBH_gen_small, c, 2, tmp, NULL, "_dgerBH_gen_small", 10, types, GA_USE_DOUBLE, NULL); + e = GpuKernel_init(&handle->dgerBH_gen_small, c, 1, &code_dgerBH_gen_small, NULL, "_dgerBH_gen_small", 10, types, GA_USE_DOUBLE, NULL); if (e != GA_NO_ERROR) goto e6; ctx->blas_handle = handle; From dfa58f665b376797833dac812adfacf6912b3cca Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 8 Aug 2017 17:47:55 -0400 Subject: [PATCH 441/597] Fix some problems with cuda. 
--- src/cluda_cuda.h | 10 +- src/cluda_cuda.h.c | 826 ++++++++++++++++++------------------- src/gpuarray_buffer_cuda.c | 3 +- 3 files changed, 420 insertions(+), 419 deletions(-) diff --git a/src/cluda_cuda.h b/src/cluda_cuda.h index afe87809c1..6faaec563e 100644 --- a/src/cluda_cuda.h +++ b/src/cluda_cuda.h @@ -1,6 +1,6 @@ #define local_barrier() __syncthreads() -#define WITHIN_KERNEL extern \"C\" __device__ -#define KERNEL extern \"C\" __global__ +#define WITHIN_KERNEL extern "C" __device__ +#define KERNEL extern "C" __global__ #define GLOBAL_MEM /* empty */ #define LOCAL_MEM __shared__ #define LOCAL_MEM_ARG /* empty */ @@ -76,8 +76,8 @@ struct ga_half { return p.a; \ } -#define gen_atom32_xchg(name, argtype) gen_atom_xhg(name, argtype, unsigned int) -#define gen_atom64_xchg(name, argtype) gen_atom_xhg(name, argtype, unsigned long long) +#define gen_atom32_xchg(name, argtype) gen_atom_xchg(name, argtype, unsigned int) +#define gen_atom64_xchg(name, argtype) gen_atom_xchg(name, argtype, unsigned long long) /* ga_int */ #define atom_add_ig(a, b, c) atomicAdd(a, b, c) @@ -92,7 +92,7 @@ struct ga_half { /* ga_long */ gen_atom64_add(atom_add_lg, ga_long) #define atom_add_ll(a, b) atom_add_lg(a, b) -gen_atom64_xhg(atom_xchg_lg, ga_long) +gen_atom64_xchg(atom_xchg_lg, ga_long) #define atom_xchg_ll(a, b) atom_xchg_lg(a, b) /* ga_ulong */ #define atom_add_Lg(a, b, c) atomicAdd(a, b, c) diff --git a/src/cluda_cuda.h.c b/src/cluda_cuda.h.c index 5dfca8236f..2fc6c2b99e 100644 --- a/src/cluda_cuda.h.c +++ b/src/cluda_cuda.h.c @@ -5,490 +5,490 @@ static const char cluda_cuda_h[] = { 0x65, 0x61, 0x64, 0x73, 0x28, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x57, 0x49, 0x54, 0x48, 0x49, 0x4e, 0x5f, 0x4b, 0x45, 0x52, 0x4e, 0x45, 0x4c, 0x20, 0x65, 0x78, 0x74, 0x65, -0x72, 0x6e, 0x20, 0x5c, 0x22, 0x43, 0x5c, 0x22, 0x20, 0x5f, 0x5f, -0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4b, 0x45, 0x52, 0x4e, 0x45, 
-0x4c, 0x20, 0x65, 0x78, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x5c, 0x22, -0x43, 0x5c, 0x22, 0x20, 0x5f, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, -0x6c, 0x5f, 0x5f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x47, 0x4c, 0x4f, 0x42, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, -0x20, 0x2f, 0x2a, 0x20, 0x65, 0x6d, 0x70, 0x74, 0x79, 0x20, 0x2a, -0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, -0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x20, 0x5f, 0x5f, -0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x5f, 0x5f, 0x0a, 0x23, 0x64, +0x72, 0x6e, 0x20, 0x22, 0x43, 0x22, 0x20, 0x5f, 0x5f, 0x64, 0x65, +0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x4b, 0x45, 0x52, 0x4e, 0x45, 0x4c, 0x20, +0x65, 0x78, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x22, 0x43, 0x22, 0x20, +0x5f, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x5f, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x4c, 0x4f, +0x42, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x20, 0x2f, 0x2a, 0x20, +0x65, 0x6d, 0x70, 0x74, 0x79, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x4f, 0x43, 0x41, 0x4c, -0x5f, 0x4d, 0x45, 0x4d, 0x5f, 0x41, 0x52, 0x47, 0x20, 0x2f, 0x2a, -0x20, 0x65, 0x6d, 0x70, 0x74, 0x79, 0x20, 0x2a, 0x2f, 0x0a, 0x23, -0x69, 0x66, 0x64, 0x65, 0x66, 0x20, 0x4e, 0x41, 0x4e, 0x0a, 0x23, -0x75, 0x6e, 0x64, 0x65, 0x66, 0x20, 0x4e, 0x41, 0x4e, 0x0a, 0x23, -0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x4e, 0x41, 0x4e, 0x20, 0x5f, 0x5f, 0x69, 0x6e, -0x74, 0x5f, 0x61, 0x73, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, -0x30, 0x78, 0x37, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x29, -0x0a, 0x2f, 0x2a, 0x20, 0x4e, 0x55, 0x4c, 0x4c, 0x20, 0x2a, 0x2f, -0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, 0x66, 0x20, 0x49, 0x4e, 0x46, -0x49, 0x4e, 0x49, 0x54, 0x59, 0x0a, 0x23, 0x75, 0x6e, 0x64, 0x65, -0x66, 0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x59, 0x0a, -0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x23, 0x64, 
0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, -0x59, 0x20, 0x5f, 0x5f, 0x69, 0x6e, 0x74, 0x5f, 0x61, 0x73, 0x5f, -0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x30, 0x78, 0x37, 0x66, 0x38, -0x30, 0x30, 0x30, 0x30, 0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, 0x30, 0x20, 0x74, -0x68, 0x72, 0x65, 0x61, 0x64, 0x49, 0x64, 0x78, 0x2e, 0x78, 0x0a, +0x5f, 0x4d, 0x45, 0x4d, 0x20, 0x5f, 0x5f, 0x73, 0x68, 0x61, 0x72, +0x65, 0x64, 0x5f, 0x5f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, +0x5f, 0x41, 0x52, 0x47, 0x20, 0x2f, 0x2a, 0x20, 0x65, 0x6d, 0x70, +0x74, 0x79, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, +0x66, 0x20, 0x4e, 0x41, 0x4e, 0x0a, 0x23, 0x75, 0x6e, 0x64, 0x65, +0x66, 0x20, 0x4e, 0x41, 0x4e, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, +0x66, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4e, +0x41, 0x4e, 0x20, 0x5f, 0x5f, 0x69, 0x6e, 0x74, 0x5f, 0x61, 0x73, +0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x30, 0x78, 0x37, 0x66, +0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x29, 0x0a, 0x2f, 0x2a, 0x20, +0x4e, 0x55, 0x4c, 0x4c, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, +0x64, 0x65, 0x66, 0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, +0x59, 0x0a, 0x23, 0x75, 0x6e, 0x64, 0x65, 0x66, 0x20, 0x49, 0x4e, +0x46, 0x49, 0x4e, 0x49, 0x54, 0x59, 0x0a, 0x23, 0x65, 0x6e, 0x64, +0x69, 0x66, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x59, 0x20, 0x5f, 0x5f, +0x69, 0x6e, 0x74, 0x5f, 0x61, 0x73, 0x5f, 0x66, 0x6c, 0x6f, 0x61, +0x74, 0x28, 0x30, 0x78, 0x37, 0x66, 0x38, 0x30, 0x30, 0x30, 0x30, +0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x4c, 0x49, 0x44, 0x5f, 0x30, 0x20, 0x74, 0x68, 0x72, 0x65, 0x61, +0x64, 0x49, 0x64, 0x78, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, 0x31, 0x20, 0x74, +0x68, 0x72, 0x65, 0x61, 0x64, 0x49, 0x64, 
0x78, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, -0x5f, 0x31, 0x20, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x49, 0x64, -0x78, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x4c, 0x49, 0x44, 0x5f, 0x32, 0x20, 0x74, 0x68, 0x72, 0x65, -0x61, 0x64, 0x49, 0x64, 0x78, 0x2e, 0x7a, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x30, -0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x44, 0x69, 0x6d, 0x2e, 0x78, +0x5f, 0x32, 0x20, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x49, 0x64, +0x78, 0x2e, 0x7a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x30, 0x20, 0x62, 0x6c, 0x6f, +0x63, 0x6b, 0x44, 0x69, 0x6d, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x31, +0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x44, 0x69, 0x6d, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, -0x49, 0x4d, 0x5f, 0x31, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x44, -0x69, 0x6d, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 0x62, 0x6c, -0x6f, 0x63, 0x6b, 0x44, 0x69, 0x6d, 0x2e, 0x7a, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x30, -0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x49, 0x64, 0x78, 0x2e, 0x78, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, -0x44, 0x5f, 0x31, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x49, 0x64, -0x78, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x47, 0x49, 0x44, 0x5f, 0x32, 0x20, 0x62, 0x6c, 0x6f, 0x63, -0x6b, 0x49, 0x64, 0x78, 0x2e, 0x7a, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x30, 0x20, -0x67, 0x72, 0x69, 0x64, 0x44, 0x69, 0x6d, 0x2e, 0x78, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, -0x5f, 0x31, 0x20, 0x67, 0x72, 0x69, 0x64, 0x44, 0x69, 0x6d, 0x2e, -0x79, 0x0a, 0x23, 0x64, 0x65, 
0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, -0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 0x67, 0x72, 0x69, 0x64, 0x44, +0x49, 0x4d, 0x5f, 0x32, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x44, 0x69, 0x6d, 0x2e, 0x7a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x6f, 0x6f, 0x6c, 0x20, 0x75, +0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x30, 0x20, 0x62, 0x6c, 0x6f, +0x63, 0x6b, 0x49, 0x64, 0x78, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x31, 0x20, +0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x49, 0x64, 0x78, 0x2e, 0x79, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, +0x5f, 0x32, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x49, 0x64, 0x78, +0x2e, 0x7a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x47, 0x44, 0x49, 0x4d, 0x5f, 0x30, 0x20, 0x67, 0x72, 0x69, 0x64, +0x44, 0x69, 0x6d, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x67, +0x72, 0x69, 0x64, 0x44, 0x69, 0x6d, 0x2e, 0x79, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, +0x32, 0x20, 0x67, 0x72, 0x69, 0x64, 0x44, 0x69, 0x6d, 0x2e, 0x7a, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, +0x5f, 0x62, 0x6f, 0x6f, 0x6c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, +0x6e, 0x65, 0x64, 0x20, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x79, +0x74, 0x65, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x63, +0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x67, 0x61, 0x5f, 0x75, 0x62, 0x79, 0x74, 0x65, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, -0x61, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x20, 0x73, 0x69, 0x67, 0x6e, -0x65, 0x64, 0x20, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x62, 0x79, -0x74, 0x65, 0x20, 
0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, -0x20, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x68, 0x6f, 0x72, 0x74, -0x20, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, -0x72, 0x74, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, -0x20, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, -0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x75, 0x6e, -0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x0a, +0x61, 0x5f, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x73, 0x68, 0x6f, +0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x75, +0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x73, 0x68, 0x6f, +0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, -0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x75, -0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, -0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, -0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, -0x62, 0x6c, 0x65, 0x20, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x0a, +0x75, 0x69, 0x6e, 0x74, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, +0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 
0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, -0x73, 0x69, 0x7a, 0x65, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x5f, 0x74, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, -0x5f, 0x73, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x70, 0x74, 0x72, 0x64, -0x69, 0x66, 0x66, 0x5f, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x61, 0x64, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x28, 0x70, 0x29, 0x20, 0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, -0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x28, 0x70, 0x29, 0x2d, -0x3e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x28, 0x70, 0x2c, 0x20, 0x76, 0x29, 0x20, 0x28, -0x28, 0x70, 0x29, 0x2d, 0x3e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, -0x20, 0x5f, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, -0x6c, 0x66, 0x5f, 0x72, 0x6e, 0x28, 0x76, 0x29, 0x29, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, -0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, -0x50, 0x41, 0x52, 0x41, 0x4d, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, -0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, -0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x42, 0x4f, 0x44, -0x59, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, -0x65, 0x29, 0x20, 0x65, 0x78, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x5f, -0x5f, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x5f, 0x5f, 0x20, 0x74, -0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x5b, 0x5d, 0x3b, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, -0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x20, -0x77, 0x61, 0x72, 0x70, 0x53, 0x69, 0x7a, 0x65, 0x0a, 0x0a, 0x73, -0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 
0x75, -0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, -0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, -0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x77, 0x74, 0x79, -0x70, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x5f, -0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x61, -0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, -0x28, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, -0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, -0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, -0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, +0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, +0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, +0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6c, +0x6f, 0x61, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, +0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, +0x20, 0x73, 0x69, 0x7a, 0x65, 0x5f, 0x74, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x73, 0x69, +0x7a, 0x65, 0x20, 0x70, 0x74, 0x72, 0x64, 0x69, 
0x66, 0x66, 0x5f, +0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x6c, +0x6f, 0x61, 0x64, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x70, 0x29, +0x20, 0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, +0x61, 0x74, 0x28, 0x28, 0x70, 0x29, 0x2d, 0x3e, 0x64, 0x61, 0x74, +0x61, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, +0x70, 0x2c, 0x20, 0x76, 0x29, 0x20, 0x28, 0x28, 0x70, 0x29, 0x2d, +0x3e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x66, +0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, +0x6e, 0x28, 0x76, 0x29, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, +0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x50, 0x41, 0x52, 0x41, +0x4d, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, +0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, +0x52, 0x45, 0x44, 0x5f, 0x42, 0x4f, 0x44, 0x59, 0x28, 0x74, 0x79, +0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x65, +0x78, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x5f, 0x5f, 0x73, 0x68, 0x61, +0x72, 0x65, 0x64, 0x5f, 0x5f, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, +0x6e, 0x61, 0x6d, 0x65, 0x5b, 0x5d, 0x3b, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, +0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x20, 0x77, 0x61, 0x72, 0x70, +0x53, 0x69, 0x7a, 0x65, 0x0a, 0x0a, 0x73, 0x74, 0x72, 0x75, 0x63, +0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x7b, +0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, 0x72, +0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, +0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, +0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, +0x70, 0x65, 0x2c, 0x20, 0x77, 0x74, 
0x79, 0x70, 0x65, 0x29, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x5f, 0x5f, 0x64, 0x65, 0x76, +0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, +0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x61, 0x72, 0x67, +0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, +0x6c, 0x29, 0x20, 0x7b, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x77, 0x74, 0x79, 0x70, 0x65, 0x20, 0x77, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x77, 0x74, 0x79, +0x70, 0x65, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, -0x2a, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 
0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, +0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, -0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x2a, 0x61, 0x64, 0x64, +0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x61, -0x20, 0x2b, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x28, -0x77, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, -0x72, 0x2c, 0x20, 0x70, 0x2e, 0x77, 0x2c, 0x20, 0x6e, 0x2e, 0x77, -0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, -0x6c, 0x65, 0x20, 0x28, 0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, -0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, +0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x2b, 0x20, 0x76, +0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, -0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x28, 0x77, 0x74, 0x79, 0x70, +0x65, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x70, +0x2e, 0x77, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, +0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, 0x6e, 0x2e, 0x77, 0x29, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, +0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, -0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, -0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, -0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, -0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x67, 0x65, 0x6e, 0x5f, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, -0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, -0x2c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, -0x69, 0x6e, 0x74, 
0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, -0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, -0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x67, -0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, -0x79, 0x70, 0x65, 0x2c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, -0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, -0x67, 0x29, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, -0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x77, 0x74, 0x79, -0x70, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x5f, -0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x61, -0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, -0x28, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, -0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, -0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, +0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, +0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x29, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, +0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x75, 0x6e, +0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, +0x6e, 
0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, +0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, +0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, +0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, +0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, +0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, +0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, +0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, +0x70, 0x65, 0x2c, 0x20, 0x77, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x5f, 0x5f, 0x64, 0x65, 0x76, +0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, +0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x61, 0x72, 0x67, +0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, +0x6c, 0x29, 0x20, 0x7b, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, -0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
+0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x77, 0x74, 0x79, +0x70, 0x65, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x77, 0x74, 0x79, 0x70, 0x65, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, +0x6e, 0x2c, 0x20, 0x70, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x7d, 0x20, 0x6e, 0x2c, 0x20, 0x70, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, -0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x28, +0x77, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, +0x72, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, -0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 
-0x63, 0x68, 0x28, 0x28, 0x77, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, -0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x70, 0x2e, 0x61, 0x3b, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, +0x72, 0x6e, 0x20, 0x70, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, -0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x67, -0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x68, 0x67, -0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, -0x79, 0x70, 0x65, 0x2c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, -0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x29, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, -0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, -0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, -0x65, 0x29, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, -0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x75, 0x6e, -0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, -0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, -0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, 
0x2c, 0x20, -0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, -0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, -0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6c, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, +0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, +0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, +0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, +0x2c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, +0x69, 0x6e, 0x74, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, +0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, +0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, +0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, +0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x75, 0x6e, 0x73, 0x69, +0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, +0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, +0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 
0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, +0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, +0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6c, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, +0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, +0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, +0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x6c, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 
0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x67, 0x65, 0x6e, +0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, +0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, +0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x2c, 0x20, 0x67, 0x61, +0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, -0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, +0x68, 0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, +0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, +0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, +0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x2c, 0x20, 
0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, -0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, -0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x6c, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, -0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x67, -0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, -0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x6c, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, -0x67, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x6c, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, -0x62, 
0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, -0x36, 0x34, 0x5f, 0x78, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x2c, 0x20, 0x67, -0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, -0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, +0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, +0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, +0x2a, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, -0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, +0x64, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 
+0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, -0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, +0x65, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x20, 0x5f, 0x5f, +0x43, 0x55, 0x44, 0x41, 0x5f, 0x41, 0x52, 0x43, 0x48, 0x5f, 0x5f, +0x20, 0x3c, 0x20, 0x36, 0x30, 0x30, 0x0a, 0x67, 0x65, 0x6e, 0x5f, +0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, +0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, +0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x23, 0x65, 0x6c, 0x73, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, +0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 
0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, -0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, -0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, -0x6c, 0x65, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x20, 0x5f, -0x5f, 0x43, 0x55, 0x44, 0x41, 0x5f, 0x41, 0x52, 0x43, 0x48, 0x5f, -0x5f, 0x20, 0x3c, 0x20, 0x36, 0x30, 0x30, 0x0a, 0x67, 0x65, 0x6e, -0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, -0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, -0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, -0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x23, 0x65, 0x6c, 0x73, 0x65, 0x0a, 0x23, 0x64, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, +0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, +0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 
0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, -0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, -0x66, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, -0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, -0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x29, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x2f, 0x2a, 0x20, -0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, -0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, -0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, 0x28, 0x67, 0x61, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, -0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, -0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, -0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, -0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, -0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, -0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, -0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, -0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 
0x61, 0x73, 0x73, 0x75, 0x6d, -0x65, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x6e, 0x65, -0x77, 0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, -0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, -0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, -0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x6d, 0x20, 0x3d, 0x20, -0x5f, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, -0x66, 0x5f, 0x72, 0x6e, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5f, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, -0x28, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x2b, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, -0x61, 0x74, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, -0x29, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, -0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, -0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, -0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, -0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x29, 0x29, -0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, -0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, -0x72, 0x6d, 0x28, 0x6f, 
0x6c, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, -0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, -0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, -0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, -0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, -0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, -0x6e, 0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, -0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, -0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, -0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, -0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x29, 0x5f, 0x5f, -0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, -0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, -0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, -0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, -0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, -0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x6c, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, -0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, -0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, -0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 
0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, -0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, -0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, -0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, -0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, -0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, -0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, -0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, +0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x2f, 0x2a, 0x20, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, +0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, 0x28, 0x67, 0x61, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, +0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, +0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, +0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, +0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, +0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, +0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, +0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, -0x20, 
0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, +0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x6d, 0x20, 0x3d, 0x20, 0x5f, +0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, +0x5f, 0x72, 0x6e, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5f, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, +0x76, 0x61, 0x6c, 0x29, 0x20, 0x2b, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, +0x74, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x29, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, -0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x2c, 0x20, -0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, -0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, -0x30, 0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, -0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, -0x63, 0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, -0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, -0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, -0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, -0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, -0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x28, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x29, 0x5f, 0x5f, 0x62, 0x79, -0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, -0x2c, 0x20, 0x30, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, -0x20, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, -0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, -0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, -0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x7d, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, 0x6c, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x5f, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x00}; +0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, +0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, +0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, +0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x29, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, +0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, +0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, +0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, +0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, +0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, +0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, +0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, +0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, +0x68, 0x69, 0x6c, 0x65, 0x20, 
0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, +0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x28, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x29, 0x5f, 0x5f, 0x62, +0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, +0x64, 0x2c, 0x20, 0x30, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, +0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, +0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, +0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x7d, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x6c, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x5f, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, +0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, 0x67, +0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, +0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, +0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, +0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, +0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, +0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, +0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, +0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, +0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, +0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, +0x62, 0x61, 0x73, 
0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, +0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, +0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, +0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, +0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x2c, 0x20, 0x28, +0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, +0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, +0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, +0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, +0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, +0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, +0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, +0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, +0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, +0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x28, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x29, 0x5f, 0x5f, 0x62, 0x79, 0x74, +0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, +0x20, 0x30, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, +0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, +0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, +0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, 0x6c, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 
0x00}; diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 6d3dd24c67..eade62f8cd 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -322,7 +322,8 @@ cuda_context *cuda_make_ctx(CUcontext ctx, gpucontext_props *p) { } res->errbuf->flags |= CUDA_MAPPED_PTR; /* Prime the cache */ - cuda_alloc((gpucontext *)res, p->initial_cache_size, NULL, 0); + if (p->initial_cache_size) + cuda_alloc((gpucontext *)res, p->initial_cache_size, NULL, 0); return res; fail_end: cuMemFreeHost(pp); From 47315c0098f90e6e5934f72b4fec849086c3eb03 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 10 Aug 2017 16:35:30 -0400 Subject: [PATCH 442/597] Don't use size arrays. --- tests/check_array.c | 6 +++--- tests/check_reduction.c | 32 ++++++++++++++++---------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/check_array.c b/tests/check_array.c index f309f9b50b..55da35e53f 100644 --- a/tests/check_array.c +++ b/tests/check_array.c @@ -27,20 +27,20 @@ START_TEST(test_take1_ok) { 18, 19, 20, 21, 22, 23}; uint32_t buf[12 * 24]; const size_t data_dims[1] = {24}; - ssize_t indexes[12]; + long indexes[12]; size_t dims[3]; ga_assert_ok(GpuArray_empty(&base, ctx, GA_UINT, 1, data_dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&base, data, sizeof(data))); dims[0] = 12; - ga_assert_ok(GpuArray_empty(&idx, ctx, GA_SSIZE, 1, dims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&idx, ctx, GA_LONG, 1, dims, GA_C_ORDER)); dims[1] = 6; ga_assert_ok(GpuArray_empty(&res, ctx, GA_UINT, 2, dims, GA_C_ORDER)); /* test v[[1, 0]] on 1d (4) */ indexes[0] = 1; indexes[1] = 0; - ga_assert_ok(GpuArray_write(&idx, indexes, sizeof(ssize_t) * 2)); + ga_assert_ok(GpuArray_write(&idx, indexes, sizeof(long) * 2)); ga_assert_ok(GpuArray_view(&v, &base)); ga_assert_ok(GpuArray_view(&vidx, &idx)); diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 8844a585c8..ca3f231bf4 100644 --- a/tests/check_reduction.c +++ 
b/tests/check_reduction.c @@ -81,9 +81,9 @@ START_TEST(test_reduction){ size_t prodDims = dims[0]*dims[1]*dims[2]; const unsigned reduxList[] = {0,2}; - float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); - float* pMax = calloc(1, sizeof(*pMax) * dims[1] ); - size_t* pArgmax = calloc(1, sizeof(*pArgmax) * dims[1] ); + float *pSrc = calloc(sizeof(*pSrc), prodDims); + float *pMax = calloc(sizeof(*pMax), dims[1]); + unsigned long *pArgmax = calloc(sizeof(*pArgmax), dims[1]); ck_assert_ptr_ne(pSrc, NULL); ck_assert_ptr_ne(pMax, NULL); @@ -106,7 +106,7 @@ START_TEST(test_reduction){ ga_assert_ok(GpuArray_empty(&gaSrc, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&gaMax, ctx, GA_FLOAT, 1, &dims[1], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaArgmax, ctx, GA_SIZE, 1, &dims[1], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaArgmax, ctx, GA_ULONG, 1, &dims[1], GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ @@ -171,9 +171,9 @@ START_TEST(test_idxtranspose){ size_t rdxProdDims = rdxDims[0]; const unsigned reduxList[] = {2,0}; - float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); - float* pMax = calloc(1, sizeof(*pMax) * rdxProdDims); - size_t* pArgmax = calloc(1, sizeof(*pArgmax) * rdxProdDims); + float *pSrc = calloc(sizeof(*pSrc), prodDims); + float *pMax = calloc(sizeof(*pMax), rdxProdDims); + unsigned long *pArgmax = calloc(sizeof(*pArgmax), rdxProdDims); ck_assert_ptr_ne(pSrc, NULL); ck_assert_ptr_ne(pMax, NULL); @@ -196,7 +196,7 @@ START_TEST(test_idxtranspose){ ga_assert_ok(GpuArray_empty(&gaSrc, ctx, GA_FLOAT, 3, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&gaMax, ctx, GA_FLOAT, 1, rdxDims, GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaArgmax, ctx, GA_SIZE, 1, rdxDims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaArgmax, ctx, GA_ULONG, 1, rdxDims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ @@ -259,9 +259,9 @@ START_TEST(test_veryhighrank){ size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; const unsigned reduxList[] = {2,4,7,5}; - float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); - float* pMax = calloc(1, sizeof(*pMax) * rdxProdDims); - size_t* pArgmax = calloc(1, sizeof(*pArgmax) * rdxProdDims); + float *pSrc = calloc(sizeof(*pSrc), prodDims); + float *pMax = calloc(sizeof(*pMax), rdxProdDims); + unsigned long *pArgmax = calloc(sizeof(*pArgmax), rdxProdDims); ck_assert_ptr_ne(pSrc, NULL); ck_assert_ptr_ne(pMax, NULL); @@ -284,7 +284,7 @@ START_TEST(test_veryhighrank){ ga_assert_ok(GpuArray_empty(&gaSrc, ctx, GA_FLOAT, 8, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&gaMax, ctx, GA_FLOAT, 4, rdxDims, GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaArgmax, ctx, GA_SIZE, 4, rdxDims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaArgmax, ctx, GA_ULONG, 4, rdxDims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ @@ -358,9 +358,9 @@ START_TEST(test_alldimsreduced){ size_t gtArgmax; float gtMax; - float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); - float* pMax = calloc(1, sizeof(*pMax) ); - size_t* pArgmax = calloc(1, sizeof(*pArgmax) ); + float *pSrc = calloc(sizeof(*pSrc), prodDims); + float *pMax = calloc(1, sizeof(*pMax)); + unsigned long *pArgmax = calloc(1, sizeof(*pArgmax)); ck_assert_ptr_ne(pSrc, NULL); ck_assert_ptr_ne(pMax, NULL); @@ -383,7 +383,7 @@ START_TEST(test_alldimsreduced){ ga_assert_ok(GpuArray_empty(&gaSrc, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&gaMax, ctx, GA_FLOAT, 0, NULL, GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaArgmax, ctx, GA_SIZE, 0, NULL, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaArgmax, ctx, GA_ULONG, 0, NULL, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ From 28845b79e4acc8a9777dfa01a18fc1055ed30c7a Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 10 Aug 2017 17:08:18 -0400 Subject: [PATCH 443/597] Make the C versions of the cluda headers rebuild automatically. 
--- src/CMakeLists.txt | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d9ddc97ee8..b687e5da1a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -14,6 +14,22 @@ add_custom_command( WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/gen_types.py) +add_custom_command( + OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/cluda_cuda.h.c + COMMAND python head.py cluda_cuda.h + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/head.py + ${CMAKE_CURRENT_SOURCE_DIR}/cluda_cuda.h + ) + +add_custom_command( + OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/cluda_opencl.h.c + COMMAND python head.py cluda_opencl.h + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/head.py + ${CMAKE_CURRENT_SOURCE_DIR}/cluda_opencl.h + ) + macro (set_rel var) file (RELATIVE_PATH _relPath "${CMAKE_SOURCE_DIR}/src" "${CMAKE_CURRENT_SOURCE_DIR}") # clear previous list (if any) @@ -57,6 +73,9 @@ gpuarray_blas_opencl_clblas.c gpuarray_blas_opencl_clblast.c ) +set_property(SOURCE gpuarray_buffer_cuda.c APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cluda_cuda.h.c) +set_property(SOURCE gpuarray_buffer_opencl.c APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cluda_opencl.h.c) + check_function_exists(strlcat HAVE_STRL) check_function_exists(mkstemp HAVE_MKSTEMP) From 2cada04401bad1e63f93acae855748630981c5bf Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 10 Aug 2017 17:16:32 -0400 Subject: [PATCH 444/597] Fix float16 atomics. 
--- src/cluda_cuda.h | 19 +++-- src/cluda_cuda.h.c | 194 +++++++++++++++++++++------------------------ 2 files changed, 101 insertions(+), 112 deletions(-) diff --git a/src/cluda_cuda.h b/src/cluda_cuda.h index 6faaec563e..a4222c38d6 100644 --- a/src/cluda_cuda.h +++ b/src/cluda_cuda.h @@ -114,36 +114,35 @@ gen_atom64_add(atom_add_dg, ga_double) #endif gen_atom64_xchg(atom_xchg_dg, ga_double) #define atom_xchg_dl(a, b) atom_xchg_dg(a, b) -#endif /* ga_half */ __device__ ga_half atom_add_hg(ga_half *addr, ga_half val) { ga_uint *base = (ga_uint *)((ga_size)addr & ~2); ga_uint old, assumed, sum, new_; + ga_half tmp; old = *base; do { assumed = old; - sum = __float2half_rn( - __half2float(val) + - __half2float((ga_half)__byte_perm(old, 0, - ((ga_size)addr & 2) ? 0x4432 : 0x4410))); + tmp.data = __byte_perm(old, 0, ((ga_size)addr & 2) ? 0x4432 : 0x4410); + sum = __float2half_rn(__half2float(val.data) + __half2float(tmp.data)); new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254); old = atomicCAS(base, assumed, new_); } while (assumed != old); - return (ga_half)__byte_perm(old, 0, - ((ga_size)addr & 2) ? 0x4432 : 0x4410); + tmp.data = __byte_perm(old, 0, ((ga_size)addr & 2) ? 0x4432 : 0x4410); + return tmp; } #define atom_add_hl(a, b) atom_add_hg(a, b) __device__ ga_half atom_xchg_hg(ga_half *addr, ga_half val) { ga_uint *base = (ga_uint *)((ga_size)addr & ~2); ga_uint old, assumed, new_; + ga_half tmp; old = *base; do { assumed = old; - new_ = __byte_perm(old, val, ((ga_size)addr & 2) ? 0x5410 : 0x3254); + new_ = __byte_perm(old, val.data, ((ga_size)addr & 2) ? 0x5410 : 0x3254); old = atomicCAS(base, assumed, new_); } while (assumed != old); - return (ga_half)__byte_perm(old, 0, - ((ga_size)addr & 2) ? 0x4432 : 0x4410); + tmp.data = __byte_perm(old, 0, ((ga_size)addr & 2) ? 
0x4432 : 0x4410); + return tmp; } #define atom_xchg_hl(a, b) atom_xchg_hg(a, b) diff --git a/src/cluda_cuda.h.c b/src/cluda_cuda.h.c index 2fc6c2b99e..2421a5b9ce 100644 --- a/src/cluda_cuda.h.c +++ b/src/cluda_cuda.h.c @@ -379,77 +379,10 @@ static const char cluda_cuda_h[] = { 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x2f, 0x2a, 0x20, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, -0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, 0x28, 0x67, 0x61, 0x5f, -0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, -0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, -0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, -0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, -0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, -0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, -0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, -0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, -0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x6e, 0x65, 0x77, -0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, -0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, -0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, -0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x6d, 0x20, 0x3d, 0x20, 0x5f, -0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, -0x5f, 0x72, 0x6e, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5f, 0x5f, -0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, -0x76, 0x61, 0x6c, 0x29, 0x20, 0x2b, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, -0x74, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x29, -0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, -0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, -0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, -0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, -0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x29, 0x29, 0x3b, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, -0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, -0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, -0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, -0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, -0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, -0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, -0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, -0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, -0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 
0x61, 0x73, 0x73, 0x75, 0x6d, -0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, -0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x28, -0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x29, 0x5f, 0x5f, 0x62, -0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, -0x64, 0x2c, 0x20, 0x30, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, -0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, -0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, -0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x7d, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x6c, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x0a, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, -0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, 0x67, +0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, +0x2a, 0x2f, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, +0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, @@ -459,36 +392,93 @@ static const char cluda_cuda_h[] = { 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, -0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 
0x20, 0x6e, 0x65, 0x77, 0x5f, -0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, -0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, -0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, -0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, -0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, -0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x2c, 0x20, 0x28, +0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, +0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, +0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, +0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, +0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, +0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, +0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, -0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, -0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, -0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, -0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, -0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, -0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, -0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, -0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x28, 0x67, 0x61, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x29, 0x5f, 0x5f, 0x62, 0x79, 0x74, -0x65, 0x5f, 0x70, 0x65, 
0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, -0x20, 0x30, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, -0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, -0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, -0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, 0x6c, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x0a, 0x00}; +0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, +0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, +0x75, 0x6d, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x66, 0x6c, 0x6f, 0x61, +0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, 0x6e, 0x28, 0x5f, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, +0x28, 0x76, 0x61, 0x6c, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, +0x2b, 0x20, 0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, +0x6f, 0x61, 0x74, 0x28, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, +0x61, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, +0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, +0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, +0x73, 0x75, 0x6d, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, +0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, +0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, +0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, +0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 
+0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, +0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, +0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, +0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, +0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, +0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, +0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, +0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, +0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, +0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, +0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x5f, 0x68, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x0a, 0x5f, 0x5f, 0x64, 0x65, +0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x68, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, +0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, +0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, +0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, +0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, +0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, +0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, +0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, +0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74, 0x6d, 0x70, 
0x3b, 0x0a, +0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, +0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, +0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, +0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, +0x64, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x2e, 0x64, 0x61, 0x74, 0x61, +0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, +0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, +0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, +0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, +0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, +0x6e, 0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, +0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, +0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, +0x3b, 0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, +0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, +0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, +0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, +0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, +0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, +0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, +0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, +0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, +0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x00}; From 
472726efab50a825df47252911052c0fb90e98fb Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 10 Aug 2017 18:15:17 -0400 Subject: [PATCH 445/597] Add missing error message. --- src/gpuarray_array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index 7ecea86bf6..9d1aecb13c 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -571,7 +571,7 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i, if (check_error && err == GA_NO_ERROR) { err = gpudata_read(&kerr, errbuf, 0, sizeof(int)); if (err == GA_NO_ERROR && kerr != 0) { - err = GA_VALUE_ERROR; + err = error_set(ctx->err, GA_VALUE_ERROR, "Index out of bounds"); kerr = 0; /* We suppose this will not fail */ gpudata_write(errbuf, 0, &kerr, sizeof(int)); From e769d68bf4c8231f4067753f3bb73b19b9f8b9a4 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 10 Aug 2017 18:17:59 -0400 Subject: [PATCH 446/597] Make the tests safer by fixing the flags. 
--- tests/check_array.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/check_array.c b/tests/check_array.c index 55da35e53f..3be4904784 100644 --- a/tests/check_array.c +++ b/tests/check_array.c @@ -47,12 +47,15 @@ START_TEST(test_take1_ok) { ga_assert_ok(GpuArray_view(&vres, &res)); v.dimensions[0] = 4; + GpuArray_fix_flags(&v); vidx.dimensions[0] = 2; + GpuArray_fix_flags(&vidx); vres.nd = 1; vres.dimensions[0] = vidx.dimensions[0]; vres.strides[0] = v.strides[0]; + GpuArray_fix_flags(&vres); ga_assert_ok(GpuArray_take1(&vres, &v, &vidx, 0)); ga_assert_ok(GpuArray_read(buf, sizeof(uint32_t) * 2, &vres)); @@ -75,18 +78,21 @@ START_TEST(test_take1_ok) { ga_assert_ok(GpuArray_view(&vres, &res)); vidx.dimensions[0] = 3; + GpuArray_fix_flags(&vidx); dims[0] = 4; dims[1] = 6; ga_assert_ok(GpuArray_reshape_inplace(&v, 2, dims, GA_ANY_ORDER)); v.dimensions[1] = 5; v.strides[0] = v.dimensions[1] * v.strides[1]; + GpuArray_fix_flags(&v); dims[0] = 3; dims[1] = 24; ga_assert_ok(GpuArray_reshape_inplace(&vres, 2, dims, GA_C_ORDER)); vres.dimensions[1] = v.dimensions[1]; vres.strides[0] = v.strides[0]; + GpuArray_fix_flags(&vres); ga_assert_ok(GpuArray_take1(&vres, &v, &vidx, 0)); ga_assert_ok(GpuArray_read(buf, sizeof(uint32_t) * 15, &vres)); @@ -263,6 +269,7 @@ START_TEST(test_take1_offset) { /* Fake subtensor for offset */ i.offset = 8; i.dimensions[0] = 2; + GpuArray_fix_flags(&i); ga_assert_ok(GpuArray_take1(&r, &v, &i, 1)); /* The actual results are not important, this is just to check that From 60cae6b9c553102f62c8e423b565042ec8a92c77 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 10 Aug 2017 18:28:31 -0400 Subject: [PATCH 447/597] Fix test for pre-offseted arrays. 
--- tests/check_array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/check_array.c b/tests/check_array.c index 3be4904784..1eeb7ae4c4 100644 --- a/tests/check_array.c +++ b/tests/check_array.c @@ -267,7 +267,7 @@ START_TEST(test_take1_offset) { ga_assert_ok(GpuArray_empty(&r, ctx, GA_UINT, 1, out_dims, GA_C_ORDER)); /* Fake subtensor for offset */ - i.offset = 8; + i.offset += 8; i.dimensions[0] = 2; GpuArray_fix_flags(&i); From 0bcfef357e235c270dea17aa6b71f46aaa9decf7 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 10 Aug 2017 18:33:32 -0400 Subject: [PATCH 448/597] Fix macro definitons in cluda-cuda. --- src/cluda_cuda.h | 16 +- src/cluda_cuda.h.c | 410 ++++++++++++++++++++++----------------------- 2 files changed, 211 insertions(+), 215 deletions(-) diff --git a/src/cluda_cuda.h b/src/cluda_cuda.h index a4222c38d6..5de8fefd71 100644 --- a/src/cluda_cuda.h +++ b/src/cluda_cuda.h @@ -80,13 +80,13 @@ struct ga_half { #define gen_atom64_xchg(name, argtype) gen_atom_xchg(name, argtype, unsigned long long) /* ga_int */ -#define atom_add_ig(a, b, c) atomicAdd(a, b, c) -#define atom_add_il(a, b, c) atomicAdd(a, b, c) +#define atom_add_ig(a, b) atomicAdd(a, b) +#define atom_add_il(a, b) atomicAdd(a, b) #define atom_xchg_ig(a, b) atomicExch(a, b) #define atom_xchg_il(a, b) atomicExch(a, b) /* ga_uint */ -#define atom_add_Ig(a, b, c) atomicAdd(a, b, c) -#define atom_add_Il(a, b, c) atomicAdd(a, b, c) +#define atom_add_Ig(a, b) atomicAdd(a, b) +#define atom_add_Il(a, b) atomicAdd(a, b) #define atom_xchg_Ig(a, b) atomicExch(a, b) #define atom_xchg_Il(a, b) atomicExch(a, b) /* ga_long */ @@ -95,13 +95,13 @@ gen_atom64_add(atom_add_lg, ga_long) gen_atom64_xchg(atom_xchg_lg, ga_long) #define atom_xchg_ll(a, b) atom_xchg_lg(a, b) /* ga_ulong */ -#define atom_add_Lg(a, b, c) atomicAdd(a, b, c) -#define atom_add_Ll(a, b, c) atomicAdd(a, b, c) +#define atom_add_Lg(a, b) atomicAdd(a, b) +#define atom_add_Ll(a, b) atomicAdd(a, b) #define 
atom_xchg_Lg(a, b) atomicExch(a, b) #define atom_xchg_Ll(a, b) atomicExch(a, b) /* ga_float */ -#define atom_add_fg(a, b, c) atomicAdd(a, b, c) -#define atom_add_fl(a, b, c) atomicAdd(a, b, c) +#define atom_add_fg(a, b) atomicAdd(a, b) +#define atom_add_fl(a, b) atomicAdd(a, b) #define atom_xchg_fg(a, b) atomicExch(a, b) #define atom_xchg_fl(a, b) atomicExch(a, b) /* ga_double */ diff --git a/src/cluda_cuda.h.c b/src/cluda_cuda.h.c index 2421a5b9ce..ec1827d752 100644 --- a/src/cluda_cuda.h.c +++ b/src/cluda_cuda.h.c @@ -264,221 +264,217 @@ static const char cluda_cuda_h[] = { 0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, -0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, -0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6c, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x64, 0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 
0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, -0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, -0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, -0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x6c, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x67, 0x65, 0x6e, -0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, -0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, -0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 
-0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x2c, 0x20, 0x67, 0x61, -0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, +0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, -0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, -0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, -0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, +0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, +0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, +0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 
0x20, 0x63, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, -0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, +0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, +0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, +0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, +0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x2c, +0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, -0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, -0x2a, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, -0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 
0x6d, 0x5f, 0x61, 0x64, -0x64, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, -0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, -0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x67, +0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x6c, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x6c, +0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, +0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, +0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, +0x28, 0x61, 0x2c, 0x20, 0x62, 
0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, +0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, -0x65, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x20, 0x5f, 0x5f, -0x43, 0x55, 0x44, 0x41, 0x5f, 0x41, 0x52, 0x43, 0x48, 0x5f, 0x5f, -0x20, 0x3c, 0x20, 0x36, 0x30, 0x30, 0x0a, 0x67, 0x65, 0x6e, 0x5f, -0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, -0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, -0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x23, 0x65, 0x6c, 0x73, 0x65, 0x0a, 0x23, 0x64, 0x65, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, +0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 
0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, +0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x2f, +0x0a, 0x23, 0x69, 0x66, 0x20, 0x5f, 0x5f, 0x43, 0x55, 0x44, 0x41, +0x5f, 0x41, 0x52, 0x43, 0x48, 0x5f, 0x5f, 0x20, 0x3c, 0x20, 0x36, +0x30, 0x30, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, +0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, +0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x29, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, +0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, +0x6c, 0x73, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, +0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, -0x2c, 
0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, -0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, -0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x29, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, -0x2a, 0x2f, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, -0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, -0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, -0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, -0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, -0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, -0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, -0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, -0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, -0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, -0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, -0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, -0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, -0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 
0x64, -0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, -0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, -0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, -0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, -0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, -0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, -0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, -0x75, 0x6d, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x66, 0x6c, 0x6f, 0x61, -0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, 0x6e, 0x28, 0x5f, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, -0x28, 0x76, 0x61, 0x6c, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, -0x2b, 0x20, 0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, -0x6f, 0x61, 0x74, 0x28, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, -0x61, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, -0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, -0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, -0x73, 0x75, 0x6d, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, -0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, -0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, -0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, -0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, -0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, -0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, -0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, -0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, -0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, -0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 
0x6f, 0x6c, 0x64, -0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, -0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, -0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, -0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, -0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, -0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x68, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x0a, 0x5f, 0x5f, 0x64, 0x65, -0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x68, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, -0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, -0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, -0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, -0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, -0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, -0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, -0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, -0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, -0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, -0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, -0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, +0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 
0x66, 0x0a, 0x67, 0x65, 0x6e, +0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, +0x62, 0x6c, 0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, +0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, 0x28, 0x67, 0x61, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, +0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, +0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, +0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, +0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, +0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, +0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, +0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x6e, 0x65, 0x77, +0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, +0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, +0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, +0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6d, 0x70, +0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 
0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, -0x64, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x2e, 0x64, 0x61, 0x74, 0x61, -0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, -0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, -0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, -0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, -0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, -0x6e, 0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, -0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, -0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, -0x3b, 0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, -0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, -0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, -0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, -0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, -0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, -0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, -0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, -0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, +0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, +0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, +0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, +0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x6d, 0x20, 0x3d, +0x20, 0x5f, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, +0x6c, 0x66, 0x5f, 0x72, 0x6e, 0x28, 0x5f, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, +0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x2b, 0x20, 0x5f, 0x5f, +0x68, 0x61, 
0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, +0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, +0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, +0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, +0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, +0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, +0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, +0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, +0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, +0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, +0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, +0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, +0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, +0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, +0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, +0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, +0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, +0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, +0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, +0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x6c, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, +0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, 
-0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, 0x67, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x00}; +0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, +0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, +0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, +0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, +0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, +0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, +0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, +0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, +0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, +0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, +0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, +0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, +0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, +0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, +0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x76, +0x61, 0x6c, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2c, 0x20, 0x28, 0x28, +0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, +0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, +0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, +0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, +0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, +0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, +0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, +0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, +0x65, 0x20, 0x28, 
0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, +0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, +0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, +0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, +0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, +0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, +0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, +0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, +0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, +0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, 0x6c, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 0x00}; From 46c6ce5ec2102c6beeb75a2a40d8ec0ddfa69c28 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 10 Aug 2017 19:25:00 -0400 Subject: [PATCH 449/597] Make the test actually use the specified device. 
--- tests/device.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/device.c b/tests/device.c index 802bd0bcd6..5bfe17f5ad 100644 --- a/tests/device.c +++ b/tests/device.c @@ -9,11 +9,11 @@ char* dev_name = NULL; -int get_env_dev(const char **name) { +int get_env_dev(const char **name, gpucontext_props *p) { char *dev = NULL; char *end; long no; - int d; + int pl; dev = dev_name; if (dev == NULL) { if ((dev = getenv("GPUARRAY_TEST_DEVICE")) == NULL) { @@ -30,7 +30,8 @@ int get_env_dev(const char **name) { return -1; if (no < 0 || no > INT_MAX) return -1; - return (int)no; + gpucontext_props_cuda_dev(p, (int)no); + return 0; } if (strncmp(dev, "opencl", 6) == 0) { *name = "opencl"; @@ -39,16 +40,15 @@ int get_env_dev(const char **name) { return -1; if (no < 0 || no > 32768) return -1; - d = (int)no; + pl = (int)no; dev = end; no = strtol(dev + 1, &end, 10); if (end == dev || *end != '\0') return -1; if (no < 0 || no > 32768) return -1; - d <<= 16; - d |= (int)no; - return d; + gpucontext_props_opencl_dev(p, pl, (int)no); + return 0; } return -1; } @@ -57,10 +57,10 @@ gpucontext *ctx; void setup(void) { const char *name = NULL; - int dev = get_env_dev(&name); - if (dev == -1) - ck_abort_msg("Bad test device"); - ck_assert_int_eq(gpucontext_init(&ctx, name, NULL), GA_NO_ERROR); + gpucontext_props *p; + ck_assert_int_eq(gpucontext_props_new(&p), GA_NO_ERROR); + ck_assert_int_eq(get_env_dev(&name, p), 0); + ck_assert_int_eq(gpucontext_init(&ctx, name, p), GA_NO_ERROR); ck_assert_ptr_ne(ctx, NULL); } From daa2ca9dd9008a05e7d6350907b4787b57ab5fb5 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 10 Aug 2017 19:58:28 -0400 Subject: [PATCH 450/597] Fix lots of broken things in the cluda opencl header. 
--- src/cluda_opencl.h | 90 +++-- src/cluda_opencl.h.c | 763 ++++++++++++++++++++----------------------- 2 files changed, 399 insertions(+), 454 deletions(-) diff --git a/src/cluda_opencl.h b/src/cluda_opencl.h index 60d3b464f8..0afa70bd50 100644 --- a/src/cluda_opencl.h +++ b/src/cluda_opencl.h @@ -40,13 +40,14 @@ #define GA_DECL_SHARED_BODY(type, name) #define GA_WARP_SIZE __GA_WARP_SIZE -struct ga_half { +typedef struct _ga_half { half data; -}; +} ga_half; #pragma OPENCL_EXTENSION cl_khr_int64_base_atomics: enable #define gen_atom32_add(name, argtype, aspace) \ + argtype name(volatile aspace argtype *, argtype); \ argtype name(volatile aspace argtype *addr, argtype val) { \ union { \ argtype a; \ @@ -57,12 +58,13 @@ struct ga_half { do { \ a = p.w; \ n.a = p.a + val; \ - p.w = atomic_cmpxhg((volatile aspace int *)addr, a, n.w); \ + p.w = atomic_cmpxchg((volatile aspace int *)addr, a, n.w); \ } while (p.w != a); \ return n.a; \ } #define gen_atom64_add(name, argtype, aspace) \ + argtype name(volatile aspace argtype *, argtype); \ argtype name(volatile aspace argtype *addr, argtype val) { \ union { \ argtype a; \ @@ -73,19 +75,20 @@ struct ga_half { do { \ a = p.w; \ n.a = p.a + val; \ - p.w = atom_cmpxhg((volatile aspace long *)addr, a, n.w); \ + p.w = atom_cmpxchg((volatile aspace long *)addr, a, n.w); \ } while (p.w != a); \ return n.a; \ } #define gen_atom64_xchg(name, argtype, aspace) \ + argtype name(volatile aspace argtype *, argtype); \ argtype name(volatile aspace argtype *addr, argtype val) { \ union { \ argtype a; \ long w; \ } p, n; \ n.a = val; \ - p.w = atom_xchg((volatile aspace wtype *)addr, n.w); \ + p.w = atom_xchg((volatile aspace long *)addr, n.w); \ return p.a; \ } @@ -120,57 +123,46 @@ gen_atom64_add(atom_add_dl, ga_double, local) gen_atom64_xchg(atom_xchg_dg, ga_double, global) gen_atom64_xchg(atom_xchg_dl, ga_double, local) /* ga_half */ -#define gen_atomh_add(name, aspace) \ +#define gen_atomh_add(name, aspace) \ + ga_half name(volatile 
aspace ga_half *addr, ga_half val); \ ga_half name(volatile aspace ga_half *addr, ga_half val) { \ - ga_size off = (ga_size)addr & 2; \ - volatile aspace int *base = (volatile aspace int *)((ga_size)addr - off); \ - int o, a, n; \ + ga_uint idx = ((ga_size)addr & 2) >> 1; \ + volatile aspace int *base = (volatile aspace int *)((ga_size)addr & ~2); \ + union { \ + int i; \ + ga_half h[2]; \ + } o, a, n; \ float fo; \ float fval; \ - ga_half hn; \ - fval = vload_half(0, &val->data); \ - o = *base; \ + fval = load_half(&val); \ + o.i = *base; \ do { \ - a = o; \ - /* This loads the half of `o` that we want to update */ \ - fo = vload_half(off, (__private half *)&o); \ - /* We compute the half addition in float 32 */ \ - store_half(fval + fo, &hn); \ - /* Now we reassemble the the parts to form a 32-bits n */ \ - if (off == 2) \ - n = (int)hn->data << 16 & (o & 0xffff); \ - else \ - n = (int)hn->data & (o & 0xffff0000); \ - o = atomic_cmpxchg(base, a, n); \ - } while (o != a); \ - if (off == 2) \ - hn->data = (ushort)(o >> 16); \ - else \ - hn->data = (ushort)(o & 0xffff); \ - return hn; \ + a.i = o.i; \ + fo = load_half(&o.h[idx]); \ + n.i = o.i; \ + store_half(&n.h[idx], fval + fo); \ + o.i = atomic_cmpxchg(base, a.i, n.i); \ + } while (o.i != a.i); \ + return n.h[idx]; \ } -#define gen_atomh_xchg(name, aspace) \ - ga_half name(volatile aspace ga_half *addr, ga_half *val) { \ - ga_size off = (ga_size)addr & 2; \ - volatile aspace int *base = (volatile aspace int *)((ga_size)addr - off); \ - int o, a, n; \ - ga_half hr; \ - o = *base; \ +#define gen_atomh_xchg(name, aspace) \ + ga_half name(volatile aspace ga_half *addr, ga_half val); \ + ga_half name(volatile aspace ga_half *addr, ga_half val) { \ + ga_uint idx = ((ga_size)addr & 2) >> 1; \ + volatile aspace int *base = (volatile aspace int *)((ga_size)addr & ~2); \ + union { \ + int i; \ + ga_half h[2]; \ + } o, a, n; \ + o.i = *base; \ do { \ - a = o; \ - /* we have to combine our half value with the right part of 
`o` */ \ - if (off == 2) \ - n = (int)val->data << 16 & (o & 0xffff); \ - else \ - n = (int)val->data & (o & 0xffff0000); \ - o = atomic_cmpxchg(base, a, n); \ - } while (o != a); \ - if (off == 2) \ - hr->data = (ushort)o << 16; \ - else \ - hr->data = (ushort)o & 0xffff; \ - return hr; \ + a.i = o.i; \ + n.i = o.i; \ + n.h[idx] = val; \ + o.i = atomic_cmpxchg(base, a.i, n.i); \ + } while (o.i != a.i); \ + return o.h[idx]; \ } gen_atomh_add(atom_add_hg, global) diff --git a/src/cluda_opencl.h.c b/src/cluda_opencl.h.c index 1328e619a8..f394550f6e 100644 --- a/src/cluda_opencl.h.c +++ b/src/cluda_opencl.h.c @@ -106,530 +106,506 @@ static const char cluda_opencl_h[] = { 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x20, 0x5f, 0x5f, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, -0x0a, 0x0a, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x67, 0x61, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x7d, -0x3b, 0x0a, 0x0a, 0x23, 0x70, 0x72, 0x61, 0x67, 0x6d, 0x61, 0x20, -0x4f, 0x50, 0x45, 0x4e, 0x43, 0x4c, 0x5f, 0x45, 0x58, 0x54, 0x45, -0x4e, 0x53, 0x49, 0x4f, 0x4e, 0x20, 0x63, 0x6c, 0x5f, 0x6b, 0x68, -0x72, 0x5f, 0x69, 0x6e, 0x74, 0x36, 0x34, 0x5f, 0x62, 0x61, 0x73, -0x65, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x73, 0x3a, 0x20, -0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x0a, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, -0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, -0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, -0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, -0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 
0x69, 0x6c, 0x65, -0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, -0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, -0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, -0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, -0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, -0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, +0x0a, 0x0a, 0x74, 0x79, 0x70, 0x65, 0x64, 0x65, 0x66, 0x20, 0x73, +0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x5f, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x68, 0x61, 0x6c, +0x66, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x7d, 0x20, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x3b, 0x0a, 0x0a, 0x23, 0x70, +0x72, 0x61, 0x67, 0x6d, 0x61, 0x20, 0x4f, 0x50, 0x45, 0x4e, 0x43, +0x4c, 0x5f, 0x45, 0x58, 0x54, 0x45, 0x4e, 0x53, 0x49, 0x4f, 0x4e, +0x20, 0x63, 0x6c, 0x5f, 0x6b, 0x68, 0x72, 0x5f, 0x69, 0x6e, 0x74, +0x36, 0x34, 0x5f, 0x62, 0x61, 0x73, 0x65, 0x5f, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x73, 0x3a, 0x20, 0x65, 0x6e, 0x61, 0x62, 0x6c, +0x65, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, +0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, +0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, +0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, +0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 
0x6d, 0x65, 0x28, 0x76, 0x6f, +0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, +0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, +0x2a, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, +0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, +0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, +0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, +0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, +0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x69, 0x6e, 0x74, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, +0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, +0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x70, +0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x2a, -0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, +0x6e, 0x74, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x2a, 0x61, 0x64, 0x64, +0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, -0x20, 0x3d, 0x20, 0x70, 0x2e, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x20, 0x3d, 0x20, +0x70, 0x2e, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x61, 0x20, -0x2b, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, +0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x2b, 0x20, 0x76, +0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x68, 0x67, -0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, -0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, -0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x2c, 0x20, -0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, -0x28, 0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x28, +0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, +0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, +0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, 0x2e, +0x77, 0x29, 0x3b, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x70, 0x2e, +0x77, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, -0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, +0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, -0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, -0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, -0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, -0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, -0x20, 0x61, 0x73, 0x70, 
0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, -0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, -0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, -0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, -0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, +0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x61, +0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, +0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, +0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, +0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, +0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, +0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, +0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, +0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, +0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, -0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, +0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, +0x6f, 0x6e, 0x67, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, +0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x2a, -0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x2a, 0x61, +0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, -0x20, 0x3d, 0x20, 0x70, 0x2e, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x20, +0x3d, 0x20, 0x70, 0x2e, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x6e, 0x2e, 
0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x61, 0x20, -0x2b, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x2b, +0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x68, 0x67, 0x28, 0x28, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, -0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, -0x28, 0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x29, 0x3b, +0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, +0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, -0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 
0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, +0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, -0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, -0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, -0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, +0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, +0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, +0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, -0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, -0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, -0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, -0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, -0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, -0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, +0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, +0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, +0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 
0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, +0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, +0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, +0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, +0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, +0x7b, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, -0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x77, 0x3b, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, -0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, -0x20, 0x77, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x29, 0x61, 0x64, -0x64, 0x72, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x70, 0x2e, 0x61, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, +0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 
+0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x70, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, -0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, -0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6c, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, +0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, +0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, 0x67, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6c, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, 0x67, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 
0x62, +0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, -0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, -0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, -0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, +0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, +0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, +0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 
0x68, 0x67, -0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, +0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, -0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, -0x64, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, -0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, -0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 
0x5f, 0x75, 0x6c, 0x6f, -0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, +0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 
0x4c, 0x6c, 0x28, 0x61, 0x2c, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, -0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x2a, -0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, -0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, -0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, -0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, -0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x2c, 0x20, 0x67, -0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2c, 0x20, 0x6c, 0x6f, -0x63, 0x61, 0x6c, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, +0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x2a, 0x2f, +0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, +0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x5f, 0x66, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x66, +0x6c, 0x6f, 0x61, 0x74, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, +0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, +0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x2c, 0x20, 0x67, 0x61, +0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2c, 0x20, 0x6c, 0x6f, 0x63, +0x61, 0x6c, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, +0x66, 0x67, 0x28, 0x61, 
0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, -0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x2f, -0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, -0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, -0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, -0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, -0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x2c, 0x20, 0x67, -0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x6c, -0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, -0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, -0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, -0x65, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, +0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 
0x67, 0x5f, 0x64, 0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, -0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x6c, 0x6f, 0x63, -0x61, 0x6c, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, -0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, -0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, -0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, -0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, -0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, -0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x6f, -0x66, 0x66, 0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, -0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, +0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, +0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, +0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x2c, 0x20, 0x67, 0x61, +0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x6c, 0x6f, +0x63, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, +0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, +0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 
+0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, +0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x64, 0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, +0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, +0x6c, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, +0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, +0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x76, -0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, -0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, -0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, -0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, -0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, -0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x2d, -0x20, 0x6f, 0x66, 0x66, 0x29, 0x3b, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x2c, 0x20, 0x61, 0x2c, -0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, +0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, +0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x5c, 0x0a, +0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, +0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, +0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, +0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, +0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x64, +0x78, 0x20, 0x3d, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, +0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, +0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, -0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6f, 0x3b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x76, 0x6f, +0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, +0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, +0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, +0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, +0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, +0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, +0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 
0x6c, 0x6f, 0x61, -0x74, 0x20, 0x66, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, +0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, -0x20, 0x68, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, 0x32, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x3d, 0x20, 0x76, 0x6c, 0x6f, -0x61, 0x64, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x30, 0x2c, 0x20, -0x26, 0x76, 0x61, 0x6c, 0x2d, 0x3e, 0x64, 0x61, 0x74, 0x61, 0x29, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x20, -0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, +0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6f, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x20, 0x3d, 0x20, 0x6f, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, +0x74, 0x20, 0x66, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x2f, 0x2a, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x6c, -0x6f, 0x61, 0x64, 0x73, 0x20, 0x74, 0x68, 0x65, 0x20, 0x68, 0x61, -0x6c, 0x66, 0x20, 0x6f, 0x66, 0x20, 0x60, 0x6f, 0x60, 0x20, 0x74, -0x68, 0x61, 0x74, 0x20, 0x77, 0x65, 0x20, 0x77, 0x61, 0x6e, 0x74, -0x20, 0x74, 0x6f, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x20, -0x2a, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, -0x6f, 0x20, 0x3d, 0x20, 0x76, 0x6c, 0x6f, 0x61, 0x64, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x28, 0x6f, 0x66, 0x66, 
0x2c, 0x20, 0x28, 0x5f, -0x5f, 0x70, 0x72, 0x69, 0x76, 0x61, 0x74, 0x65, 0x20, 0x68, 0x61, -0x6c, 0x66, 0x20, 0x2a, 0x29, 0x26, 0x6f, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2a, 0x20, 0x57, -0x65, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x75, 0x74, 0x65, 0x20, 0x74, -0x68, 0x65, 0x20, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x64, 0x64, -0x69, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x66, 0x6c, -0x6f, 0x61, 0x74, 0x20, 0x33, 0x32, 0x20, 0x2a, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x28, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, -0x66, 0x6f, 0x2c, 0x20, 0x26, 0x68, 0x6e, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x3d, 0x20, +0x6c, 0x6f, 0x61, 0x64, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x26, +0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x2f, 0x2a, 0x20, 0x4e, 0x6f, 0x77, 0x20, 0x77, 0x65, 0x20, -0x72, 0x65, 0x61, 0x73, 0x73, 0x65, 0x6d, 0x62, 0x6c, 0x65, 0x20, -0x74, 0x68, 0x65, 0x20, 0x74, 0x68, 0x65, 0x20, 0x70, 0x61, 0x72, -0x74, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x66, 0x6f, 0x72, 0x6d, 0x20, -0x61, 0x20, 0x33, 0x32, 0x2d, 0x62, 0x69, 0x74, 0x73, 0x20, 0x6e, -0x20, 0x2a, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, -0x20, 0x28, 0x6f, 0x66, 0x66, 0x20, 
0x3d, 0x3d, 0x20, 0x32, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, +0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x20, 0x3d, -0x20, 0x28, 0x69, 0x6e, 0x74, 0x29, 0x68, 0x6e, 0x2d, 0x3e, 0x64, -0x61, 0x74, 0x61, 0x20, 0x3c, 0x3c, 0x20, 0x31, 0x36, 0x20, 0x26, -0x20, 0x28, 0x6f, 0x20, 0x26, 0x20, 0x30, 0x78, 0x66, 0x66, 0x66, -0x66, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, +0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x6e, 0x20, 0x3d, 0x20, 0x28, 0x69, 0x6e, 0x74, 0x29, -0x68, 0x6e, 0x2d, 0x3e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x26, 0x20, -0x28, 0x6f, 0x20, 0x26, 0x20, 0x30, 0x78, 0x66, 0x66, 0x66, 0x66, -0x30, 0x30, 0x30, 0x30, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x2e, 0x69, +0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 
0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x20, 0x3d, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, 0x70, -0x78, 0x63, 0x68, 0x67, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, -0x61, 0x2c, 0x20, 0x6e, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, -0x28, 0x6f, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x29, 0x3b, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x20, 0x3d, 0x20, 0x6c, +0x6f, 0x61, 0x64, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x26, 0x6f, +0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x69, 0x66, 0x20, 0x28, 0x6f, 0x66, 0x66, 0x20, 0x3d, 0x3d, 0x20, -0x32, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x6e, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, -0x6e, 0x2d, 0x3e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x28, -0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 
0x29, 0x28, 0x6f, 0x20, 0x3e, -0x3e, 0x20, 0x31, 0x36, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, +0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x26, +0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x2c, 0x20, 0x66, +0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x66, 0x6f, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, +0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, +0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, +0x20, 0x61, 0x2e, 0x69, 0x2c, 0x20, 0x6e, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x68, 0x6e, 0x2d, 0x3e, 0x64, 0x61, 0x74, -0x61, 0x20, 0x3d, 0x20, 0x28, 0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, -0x29, 0x28, 0x6f, 0x20, 0x26, 0x20, 0x30, 0x78, 0x66, 0x66, 0x66, -0x66, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, +0x6f, 0x2e, 0x69, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x2e, 0x69, 0x29, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, -0x65, 0x74, 0x75, 0x72, 0x6e, 
0x20, 0x68, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, +0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, +0x64, 0x78, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, @@ -638,136 +614,113 @@ static const char cluda_opencl_h[] = { 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, -0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, -0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x20, 0x2a, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, -0x65, 0x20, 0x6f, 0x66, 0x66, 0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, -0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, -0x26, 0x20, 0x32, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, -0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, -0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, -0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, -0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, -0x67, 0x61, 0x5f, 0x73, 0x69, 
0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, -0x72, 0x20, 0x2d, 0x20, 0x6f, 0x66, 0x66, 0x29, 0x3b, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x2c, -0x20, 0x61, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, +0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, +0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, +0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, +0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, +0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, +0x69, 0x64, 0x78, 0x20, 0x3d, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, +0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, +0x20, 0x32, 0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, -0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, +0x70, 0x61, 0x63, 0x65, 
0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, +0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, 0x6c, 0x61, +0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, +0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, +0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, +0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, -0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, 0x32, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x20, 0x3d, 0x20, 0x6f, -0x3b, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, 0x20, 0x61, 0x2c, +0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x2f, 0x2a, 0x20, 0x77, 0x65, 0x20, 0x68, 0x61, -0x76, 0x65, 0x20, 0x74, 0x6f, 0x20, 0x63, 0x6f, 0x6d, 0x62, 0x69, -0x6e, 0x65, 0x20, 0x6f, 0x75, 0x72, 0x20, 0x68, 0x61, 0x6c, 0x66, -0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x77, 0x69, 0x74, 0x68, -0x20, 0x74, 0x68, 0x65, 0x20, 0x72, 0x69, 0x67, 0x68, 0x74, 0x20, -0x70, 0x61, 0x72, 0x74, 0x20, 0x6f, 0x66, 0x20, 0x60, 0x6f, 0x60, -0x20, 0x2a, 0x2f, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x69, 0x66, 0x20, 0x28, 0x6f, 0x66, 0x66, 0x20, 0x3d, 0x3d, -0x20, 0x32, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, +0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x6e, 0x20, 0x3d, 0x20, 0x28, 0x69, 0x6e, 0x74, 0x29, 0x76, 0x61, -0x6c, 0x2d, 0x3e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3c, 0x3c, 0x20, -0x31, 0x36, 0x20, 0x26, 0x20, 0x28, 0x6f, 0x20, 0x26, 0x20, 0x30, -0x78, 0x66, 0x66, 0x66, 0x66, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, +0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x20, 0x3d, 0x20, 0x28, 0x69, -0x6e, 0x74, 0x29, 0x76, 0x61, 0x6c, 0x2d, 0x3e, 0x64, 0x61, 0x74, -0x61, 0x20, 0x26, 0x20, 0x28, 0x6f, 0x20, 0x26, 0x20, 0x30, 0x78, -0x66, 0x66, 0x66, 0x66, 0x30, 0x30, 0x30, 0x30, 0x29, 0x3b, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x6f, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, -0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x62, 0x61, 0x73, -0x65, 0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x2e, 0x69, +0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, -0x6c, 0x65, 0x20, 0x28, 0x6f, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x29, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6f, 0x66, 0x66, 0x20, -0x3d, 0x3d, 0x20, 0x32, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x69, 0x20, 0x3d, 0x20, +0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x68, 0x72, 0x2d, 0x3e, 0x64, 0x61, 0x74, 0x61, 0x20, -0x3d, 0x20, 0x28, 0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x29, 0x6f, -0x20, 0x3c, 0x3c, 0x20, 0x31, 0x36, 0x3b, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x20, +0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x65, 0x6c, 0x73, -0x65, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, +0x2e, 0x69, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x62, 0x61, +0x73, 0x65, 0x2c, 0x20, 0x61, 0x2e, 0x69, 0x2c, 0x20, 0x6e, 0x2e, +0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x68, 0x72, 0x2d, 0x3e, -0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x28, 0x75, 0x73, 0x68, -0x6f, 0x72, 0x74, 0x29, 0x6f, 0x20, 0x26, 0x20, 0x30, 0x78, 0x66, -0x66, 0x66, 0x66, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, +0x65, 0x20, 0x28, 0x6f, 0x2e, 0x69, 0x20, 0x21, 0x3d, 0x20, 0x61, +0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x72, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6f, 0x2e, +0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, From a4a88a1a29b7f3cb1ed916a9e30b12fcbeb98682 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Sat, 12 Aug 2017 12:22:24 -0400 Subject: [PATCH 451/597] Fix OpenCL crashes on OS X and make header inclusion work. 
--- src/gpuarray_array.c | 4 ++-- src/gpuarray_buffer_opencl.c | 24 ++++++++++++++++++++---- src/loaders/libopencl.fn | 2 +- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index 9d1aecb13c..d9b41e0eee 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -421,13 +421,13 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str, " %s ii0 = ind[i0];\n" " %s pos0 = v_off;\n" " if (ii0 < 0) ii0 += d0;\n" - " if ((ii0 < 0) || (ii0 >= d0)) {\n" + " if ((ii0 < 0) || (ii0 >= (%s)d0)) {\n" " *err = -1;\n" " continue;\n" " }\n" " pos0 += ii0 * (%s)s0;\n" " for (i1 = idx1; i1 < n1; i1 += numThreads1) {\n" - " %s p = pos0;\n", ssz, sz, sz, sz); + " %s p = pos0;\n", ssz, sz, ssz, sz, sz); if (v->nd > 1) { strb_appendf(&sb, " %s pos, ii = i1;\n", sz); for (i2 = v->nd; i2 > 1; i2--) { diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 3dc2d3c8ee..6278e39f40 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -113,6 +113,7 @@ cl_ctx *cl_make_ctx(cl_context ctx, gpucontext_props *p) { cl_command_queue_properties qprop; char vendor[32]; char driver_version[64]; + char device_version[32]; cl_uint vendor_id; cl_int err; size_t len; @@ -120,7 +121,7 @@ cl_ctx *cl_make_ctx(cl_context ctx, gpucontext_props *p) { int e = 0; size_t warp_size; int ret; - const char dummy_kern[] = "__kernel void kdummy(float f) {}\n"; + const char dummy_kern[] = "__kernel void kdummy(__global float *f) { f[0] = 0; }\n"; strb context_preamble = STRB_STATIC_INIT; const char *rlk[1]; gpukernel *m; @@ -130,9 +131,18 @@ cl_ctx *cl_make_ctx(cl_context ctx, gpucontext_props *p) { return NULL; id = get_dev(ctx, global_err); if (id == NULL) return NULL; + + CL_CHECKN(global_err, clGetDeviceInfo(id, CL_DEVICE_VERSION, + sizeof(device_version), + &device_version, NULL)); + if (device_version[7] == '1' && device_version[9] < '2') { + error_set(global_err, GA_UNSUPPORTED_ERROR, 
+ "We only support OpenCL 1.2 and up"); + return NULL; + } + CL_CHECKN(global_err, clGetDeviceInfo(id, CL_DEVICE_QUEUE_PROPERTIES, sizeof(qprop), &qprop, NULL)); - CL_CHECKN(global_err, clGetDeviceInfo(id, CL_DEVICE_VENDOR, sizeof(vendor), vendor, NULL)); CL_CHECKN(global_err, clGetDeviceInfo(id, CL_DEVICE_VENDOR_ID, @@ -782,6 +792,7 @@ static int cl_newkernel(gpukernel **k, gpucontext *c, unsigned int count, cl_device_id dev; cl_program p; cl_program cluda; + cl_program tmp; // Sync this table size with the number of flags that can add stuff // at the beginning const char *preamble[5]; @@ -850,10 +861,15 @@ static int cl_newkernel(gpukernel **k, gpucontext *c, unsigned int count, if (err != CL_SUCCESS) goto compile_error; - err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL); + tmp = clLinkProgram(ctx->ctx, 0, NULL, NULL, 1, &p, NULL, NULL, &err); + if (tmp != NULL) { + clReleaseProgram(p); + p = tmp; + tmp = NULL; + } compile_error: if (err != CL_SUCCESS) { - if ((err == CL_COMPILE_PROGRAM_FAILURE || err == CL_BUILD_PROGRAM_FAILURE) + if ((err == CL_COMPILE_PROGRAM_FAILURE || err == CL_LINK_PROGRAM_FAILURE) && err_str != NULL) { *err_str = NULL; // Fallback, in case there's an error diff --git a/src/loaders/libopencl.fn b/src/loaders/libopencl.fn index 26040501df..c86d3b02d6 100644 --- a/src/loaders/libopencl.fn +++ b/src/loaders/libopencl.fn @@ -1,6 +1,6 @@ DEF_PROC(cl_context, clCreateContext, (const cl_context_properties *, cl_uint, const cl_device_id *, void (CL_CALLBACK *)(const char *, const void *, size_t, void *), void *, cl_int *)); -DEF_PROC(cl_int, clBuildProgram, (cl_program, cl_uint, const cl_device_id *, const char *, void (CL_CALLBACK *)(cl_program, void *), void *)); DEF_PROC(cl_int, clCompileProgram, (cl_program, cl_uint, const cl_device_id *, const char *, cl_uint, cl_program *, const char **, void (CL_CALLBACK *)(cl_program, void *), void *)); +DEF_PROC(cl_program, clLinkProgram, (cl_context, cl_uint, const cl_device_id *, const char *, cl_uint, 
const cl_program *, void (CL_CALLBACK *)(cl_program, void *), void *, cl_int *)); DEF_PROC(cl_mem, clCreateBuffer, (cl_context, cl_mem_flags, size_t, void *, cl_int *)); DEF_PROC(cl_command_queue, clCreateCommandQueue, (cl_context, cl_device_id, cl_command_queue_properties, cl_int *)); DEF_PROC(cl_kernel, clCreateKernel, (cl_program, const char *, cl_int *)); From 5f4acd010e7d67b82b1760abcac5f0939eebbc61 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 14 Aug 2017 14:28:45 -0400 Subject: [PATCH 452/597] Ensure we don't try to allocate 0 bytes. --- src/gpuarray_buffer_cuda.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index eade62f8cd..86e7b4d2cb 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -708,6 +708,8 @@ static gpudata *cuda_alloc(gpucontext *c, size_t size, void *data, int flags) { cuda_context *ctx = (cuda_context *)c; size_t asize; + if (size == 0) size = 1; + if ((flags & GA_BUFFER_INIT) && data == NULL) { error_set(ctx->err, GA_VALUE_ERROR, "Requested buffer initialisation but no data given"); return NULL; From ba6f424b6fb6e5fb9a857f3e9b1db11c4695842a Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 14 Aug 2017 17:16:13 -0400 Subject: [PATCH 453/597] Change the definition of store_half to return a ga_half struct. 
--- src/cluda_cuda.h | 9 +- src/cluda_cuda.h.c | 651 +++++++++++++++---------------- src/cluda_opencl.h | 11 +- src/cluda_opencl.h.c | 835 ++++++++++++++++++++-------------------- src/gpuarray_elemwise.c | 4 +- 5 files changed, 765 insertions(+), 745 deletions(-) diff --git a/src/cluda_cuda.h b/src/cluda_cuda.h index 5de8fefd71..60d64f64c7 100644 --- a/src/cluda_cuda.h +++ b/src/cluda_cuda.h @@ -38,8 +38,6 @@ #define ga_double double #define ga_size size_t #define ga_ssize ptrdiff_t -#define load_half(p) __half2float((p)->data) -#define store_half(p, v) ((p)->data = __float2half_rn(v)) #define GA_DECL_SHARED_PARAM(type, name) #define GA_DECL_SHARED_BODY(type, name) extern __shared__ type name[]; #define GA_WARP_SIZE warpSize @@ -48,6 +46,13 @@ struct ga_half { ga_ushort data; }; +#define load_half(p) __half2float((p)->data) +__device__ static inline ga_half store_half(float f) { + ga_half r; + r.data = __float2half_rn(f); + return r; +} + #define gen_atom_add(name, argtype, wtype) \ __device__ argtype name(argtype *addr, argtype val) { \ union { \ diff --git a/src/cluda_cuda.h.c b/src/cluda_cuda.h.c index ec1827d752..6ebb4987d8 100644 --- a/src/cluda_cuda.h.c +++ b/src/cluda_cuda.h.c @@ -93,388 +93,393 @@ static const char cluda_cuda_h[] = { 0x20, 0x73, 0x69, 0x7a, 0x65, 0x5f, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x70, 0x74, 0x72, 0x64, 0x69, 0x66, 0x66, 0x5f, -0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x6c, -0x6f, 0x61, 0x64, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x70, 0x29, -0x20, 0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, -0x61, 0x74, 0x28, 0x28, 0x70, 0x29, 0x2d, 0x3e, 0x64, 0x61, 0x74, -0x61, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, -0x70, 0x2c, 0x20, 0x76, 0x29, 0x20, 0x28, 0x28, 0x70, 0x29, 0x2d, -0x3e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x66, 
-0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, -0x6e, 0x28, 0x76, 0x29, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, -0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x50, 0x41, 0x52, 0x41, -0x4d, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, -0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, -0x52, 0x45, 0x44, 0x5f, 0x42, 0x4f, 0x44, 0x59, 0x28, 0x74, 0x79, -0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x65, -0x78, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x5f, 0x5f, 0x73, 0x68, 0x61, -0x72, 0x65, 0x64, 0x5f, 0x5f, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, -0x6e, 0x61, 0x6d, 0x65, 0x5b, 0x5d, 0x3b, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, -0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x20, 0x77, 0x61, 0x72, 0x70, -0x53, 0x69, 0x7a, 0x65, 0x0a, 0x0a, 0x73, 0x74, 0x72, 0x75, 0x63, -0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x7b, -0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, 0x72, -0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, -0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, -0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, -0x70, 0x65, 0x2c, 0x20, 0x77, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, +0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, +0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, +0x45, 0x44, 0x5f, 0x50, 0x41, 0x52, 0x41, 0x4d, 0x28, 0x74, 0x79, +0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, +0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, +0x42, 0x4f, 0x44, 0x59, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, +0x6e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x65, 0x78, 0x74, 
0x65, 0x72, +0x6e, 0x20, 0x5f, 0x5f, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x5f, +0x5f, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, +0x5b, 0x5d, 0x3b, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, +0x5a, 0x45, 0x20, 0x77, 0x61, 0x72, 0x70, 0x53, 0x69, 0x7a, 0x65, +0x0a, 0x0a, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, +0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x64, 0x61, +0x74, 0x61, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x61, 0x64, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x28, 0x70, 0x29, 0x20, 0x5f, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x28, 0x70, +0x29, 0x2d, 0x3e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x0a, 0x5f, 0x5f, +0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x73, 0x74, +0x61, 0x74, 0x69, 0x63, 0x20, 0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x73, 0x74, +0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x66, 0x6c, +0x6f, 0x61, 0x74, 0x20, 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x72, 0x3b, 0x0a, +0x20, 0x20, 0x72, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, +0x5f, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, +0x66, 0x5f, 0x72, 0x6e, 0x28, 0x66, 0x29, 0x3b, 0x0a, 0x20, 0x20, +0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, 0x3b, 0x0a, 0x7d, +0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, +0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, +0x79, 0x70, 0x65, 0x2c, 0x20, 0x77, 0x74, 0x79, 0x70, 0x65, 0x29, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x5f, 0x5f, 0x64, 0x65, +0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 
0x61, 0x72, 0x67, 0x74, +0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x61, 0x72, +0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, +0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, +0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x5f, 0x5f, 0x64, 0x65, 0x76, -0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, -0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x61, 0x72, 0x67, -0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, -0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, -0x6c, 0x29, 0x20, 0x7b, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, +0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, -0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x77, 0x74, +0x79, 0x70, 0x65, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x77, 0x74, 0x79, -0x70, 0x65, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, +0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, -0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x2a, 0x61, 0x64, +0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x2a, 0x61, 0x64, 0x64, -0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, +0x2e, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x2b, 0x20, +0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, -0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 
0x61, 0x20, 0x2b, 0x20, 0x76, -0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x28, 0x77, 0x74, 0x79, +0x70, 0x65, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, +0x70, 0x2e, 0x77, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, +0x28, 0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, 0x6e, 0x2e, 0x77, +0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x28, 0x77, 0x74, 0x79, 0x70, -0x65, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x70, -0x2e, 0x77, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, -0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, 0x6e, 0x2e, 0x77, 0x29, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, -0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, +0x6e, 0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, -0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, -0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, -0x65, 0x29, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, -0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x75, 0x6e, -0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x29, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, -0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, -0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, -0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, -0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, -0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, -0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, -0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, +0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, -0x70, 0x65, 0x2c, 0x20, 0x77, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x5f, 0x5f, 0x64, 0x65, 0x76, -0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, -0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x61, 0x72, 0x67, -0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, -0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, -0x6c, 0x29, 0x20, 
0x7b, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, +0x70, 0x65, 0x29, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x75, +0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, +0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, +0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, +0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, +0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x67, 0x65, 0x6e, 0x5f, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, +0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, +0x2c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, +0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, +0x79, 0x70, 0x65, 0x2c, 0x20, 0x77, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x5f, 0x5f, 0x64, 0x65, +0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x61, 0x72, 0x67, 0x74, +0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x61, 0x72, +0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, +0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, +0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, -0x65, 
0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, +0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x77, 0x74, 0x79, -0x70, 0x65, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x77, 0x74, +0x79, 0x70, 0x65, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, -0x6e, 0x2c, 0x20, 0x70, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, +0x20, 0x6e, 0x2c, 0x20, 0x70, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 
0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x28, -0x77, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, -0x72, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, +0x28, 0x77, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x29, 0x61, 0x64, +0x64, 0x72, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, -0x72, 0x6e, 0x20, 0x70, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, +0x75, 0x72, 0x6e, 0x20, 0x70, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, -0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, -0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, -0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, -0x2c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, -0x69, 0x6e, 0x74, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, -0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, -0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, -0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 
+0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, +0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, -0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x75, 0x6e, 0x73, 0x69, -0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, -0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, -0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, +0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x67, 0x65, 0x6e, 0x5f, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, +0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x2c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, +0x20, 0x69, 0x6e, 0x74, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, +0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, +0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, +0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, +0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x75, 0x6e, 0x73, +0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, +0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, +0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 
0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, +0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, -0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, -0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, -0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 
0x61, 0x64, -0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, -0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, -0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, -0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x2c, -0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x23, +0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x6c, +0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, +0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, +0x2c, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 
0x6d, 0x36, 0x34, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, +0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, +0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, +0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, +0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, +0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, +0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x2a, 0x2f, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 
0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, +0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, +0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, +0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, +0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, +0x2f, 0x0a, 0x23, 0x69, 0x66, 0x20, 0x5f, 0x5f, 0x43, 0x55, 0x44, +0x41, 0x5f, 0x41, 0x52, 0x43, 0x48, 0x5f, 0x5f, 0x20, 0x3c, 0x20, +0x36, 0x30, 0x30, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, +0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, +0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x67, -0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x6c, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x6c, -0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 
0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, -0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, -0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, -0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x2a, 0x2f, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, -0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 
0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, -0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x2f, -0x0a, 0x23, 0x69, 0x66, 0x20, 0x5f, 0x5f, 0x43, 0x55, 0x44, 0x41, -0x5f, 0x41, 0x52, 0x43, 0x48, 0x5f, 0x5f, 0x20, 0x3c, 0x20, 0x36, -0x30, 0x30, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, -0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, -0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x29, 0x0a, 0x23, 0x64, +0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, +0x65, 0x6c, 0x73, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, +0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, -0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, -0x6c, 0x73, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 
0x2c, -0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, -0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, -0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x67, 0x65, 0x6e, -0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, -0x62, 0x6c, 0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, -0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, 0x28, 0x67, 0x61, 0x5f, -0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, -0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, -0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, -0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, -0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, -0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, -0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, -0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, -0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x6e, 0x65, 0x77, -0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, -0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, -0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 
0x3d, 0x20, 0x6f, -0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6d, 0x70, -0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, -0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, -0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, -0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, -0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, -0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, -0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x6d, 0x20, 0x3d, -0x20, 0x5f, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, -0x6c, 0x66, 0x5f, 0x72, 0x6e, 0x28, 0x5f, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, -0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x2b, 0x20, 0x5f, 0x5f, -0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, -0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x3b, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, -0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, -0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, -0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, -0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, -0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, -0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, -0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, -0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, -0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, -0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, -0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, -0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, -0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 
0x64, 0x2c, 0x20, 0x30, 0x2c, -0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, -0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, -0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, -0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, -0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, -0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x6c, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, -0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, -0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, -0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, -0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, -0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, -0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, -0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, -0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, -0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, +0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, +0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x67, 0x65, +0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, +0x75, 0x62, 0x6c, 0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x5f, 0x64, 0x6c, 
0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, +0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, 0x28, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, +0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, +0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, +0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, +0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, +0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, +0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, +0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, +0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, +0x65, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x6e, 0x65, +0x77, 0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, +0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, +0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, +0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6d, +0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, +0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, +0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, +0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, +0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, +0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, +0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x6d, 0x20, +0x3d, 0x20, 
0x5f, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, +0x61, 0x6c, 0x66, 0x5f, 0x72, 0x6e, 0x28, 0x5f, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, +0x6c, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x2b, 0x20, 0x5f, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, +0x28, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x29, +0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, +0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, +0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, +0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, +0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, +0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, +0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, +0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, +0x6e, 0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, +0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, +0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, +0x3b, 0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, +0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, +0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, +0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, +0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, +0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, +0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, +0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, +0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x6c, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 
+0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, +0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, +0x68, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, +0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, +0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, +0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, +0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, +0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, +0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, +0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, +0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, +0x77, 0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, +0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, +0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, +0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, +0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, +0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, +0x76, 0x61, 0x6c, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2c, 0x20, 0x28, +0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, +0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, +0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, +0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, +0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 
0x65, 0x77, -0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, -0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, -0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, -0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, -0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, -0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x76, -0x61, 0x6c, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2c, 0x20, 0x28, 0x28, -0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, -0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, -0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, -0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, -0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, -0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, -0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, -0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, -0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, -0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, -0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, -0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, -0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, -0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, -0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, -0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, -0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, -0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, 0x6c, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 
0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x0a, 0x00}; +0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, +0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, +0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, +0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, +0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, +0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, +0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, +0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, +0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, +0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, +0x75, 0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, 0x6c, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x00}; diff --git a/src/cluda_opencl.h b/src/cluda_opencl.h index 0afa70bd50..2936d7d1c9 100644 --- a/src/cluda_opencl.h +++ b/src/cluda_opencl.h @@ -34,8 +34,6 @@ #define ga_double double #define ga_size ulong #define ga_ssize long -#define load_half(p) vload_half(0, &(p)->data) -#define store_half(p, v) vstore_half_rtn(v, 0, &(p)->data) #define GA_DECL_SHARED_PARAM(type, name) , __local type *name #define GA_DECL_SHARED_BODY(type, name) #define GA_WARP_SIZE __GA_WARP_SIZE @@ -44,6 +42,13 @@ typedef struct _ga_half { half data; } ga_half; +#define load_half(p) vload_half(0, &(p)->data) +static inline ga_half store_half(ga_float f) { + ga_half r; + vstore_half_rtn(f, 0, &r.data); + return r; +} + #pragma OPENCL_EXTENSION cl_khr_int64_base_atomics: enable #define gen_atom32_add(name, argtype, aspace) \ @@ -140,7 +145,7 @@ gen_atom64_xchg(atom_xchg_dl, ga_double, 
local) a.i = o.i; \ fo = load_half(&o.h[idx]); \ n.i = o.i; \ - store_half(&n.h[idx], fval + fo); \ + n.h[idx] = store_half(fval + fo); \ o.i = atomic_cmpxchg(base, a.i, n.i); \ } while (o.i != a.i); \ return n.h[idx]; \ diff --git a/src/cluda_opencl.h.c b/src/cluda_opencl.h.c index f394550f6e..17fb900b36 100644 --- a/src/cluda_opencl.h.c +++ b/src/cluda_opencl.h.c @@ -84,658 +84,663 @@ static const char cluda_opencl_h[] = { 0x69, 0x7a, 0x65, 0x20, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x61, 0x64, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x70, 0x29, 0x20, 0x76, 0x6c, -0x6f, 0x61, 0x64, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x30, 0x2c, -0x20, 0x26, 0x28, 0x70, 0x29, 0x2d, 0x3e, 0x64, 0x61, 0x74, 0x61, -0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x73, -0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x70, -0x2c, 0x20, 0x76, 0x29, 0x20, 0x76, 0x73, 0x74, 0x6f, 0x72, 0x65, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, 0x74, 0x6e, 0x28, 0x76, -0x2c, 0x20, 0x30, 0x2c, 0x20, 0x26, 0x28, 0x70, 0x29, 0x2d, 0x3e, -0x64, 0x61, 0x74, 0x61, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, +0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, +0x50, 0x41, 0x52, 0x41, 0x4d, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, +0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x2c, 0x20, 0x5f, 0x5f, +0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, +0x2a, 0x6e, 0x61, 0x6d, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, -0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x50, 0x41, 0x52, 0x41, -0x4d, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, -0x65, 0x29, 0x20, 0x2c, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, -0x6c, 0x20, 0x74, 0x79, 0x70, 
0x65, 0x20, 0x2a, 0x6e, 0x61, 0x6d, -0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, -0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, -0x45, 0x44, 0x5f, 0x42, 0x4f, 0x44, 0x59, 0x28, 0x74, 0x79, 0x70, -0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x57, 0x41, -0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x20, 0x5f, 0x5f, 0x47, +0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x42, 0x4f, 0x44, 0x59, +0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, +0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, -0x0a, 0x0a, 0x74, 0x79, 0x70, 0x65, 0x64, 0x65, 0x66, 0x20, 0x73, -0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x5f, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x68, 0x61, 0x6c, -0x66, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x7d, 0x20, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x3b, 0x0a, 0x0a, 0x23, 0x70, -0x72, 0x61, 0x67, 0x6d, 0x61, 0x20, 0x4f, 0x50, 0x45, 0x4e, 0x43, -0x4c, 0x5f, 0x45, 0x58, 0x54, 0x45, 0x4e, 0x53, 0x49, 0x4f, 0x4e, -0x20, 0x63, 0x6c, 0x5f, 0x6b, 0x68, 0x72, 0x5f, 0x69, 0x6e, 0x74, -0x36, 0x34, 0x5f, 0x62, 0x61, 0x73, 0x65, 0x5f, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x73, 0x3a, 0x20, 0x65, 0x6e, 0x61, 0x62, 0x6c, -0x65, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, -0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, -0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, -0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5f, 0x5f, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, +0x53, 0x49, 0x5a, 0x45, 0x0a, 0x0a, 0x74, 0x79, 0x70, 0x65, 0x64, +0x65, 0x66, 0x20, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x5f, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, +0x20, 0x68, 0x61, 
0x6c, 0x66, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, +0x0a, 0x7d, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x3b, +0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x6c, +0x6f, 0x61, 0x64, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x70, 0x29, +0x20, 0x76, 0x6c, 0x6f, 0x61, 0x64, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x28, 0x30, 0x2c, 0x20, 0x26, 0x28, 0x70, 0x29, 0x2d, 0x3e, 0x64, +0x61, 0x74, 0x61, 0x29, 0x0a, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, +0x20, 0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x28, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, +0x61, 0x74, 0x20, 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x72, 0x3b, 0x0a, 0x20, +0x20, 0x76, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x5f, 0x72, 0x74, 0x6e, 0x28, 0x66, 0x2c, 0x20, 0x30, 0x2c, +0x20, 0x26, 0x72, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, +0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, 0x3b, +0x0a, 0x7d, 0x0a, 0x0a, 0x23, 0x70, 0x72, 0x61, 0x67, 0x6d, 0x61, +0x20, 0x4f, 0x50, 0x45, 0x4e, 0x43, 0x4c, 0x5f, 0x45, 0x58, 0x54, +0x45, 0x4e, 0x53, 0x49, 0x4f, 0x4e, 0x20, 0x63, 0x6c, 0x5f, 0x6b, +0x68, 0x72, 0x5f, 0x69, 0x6e, 0x74, 0x36, 0x34, 0x5f, 0x62, 0x61, +0x73, 0x65, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x73, 0x3a, +0x20, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x0a, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, +0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, +0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, -0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, -0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, -0x63, 
0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, -0x2a, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, -0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, -0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, -0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, -0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, -0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, -0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, +0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, +0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, +0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61, 0x72, +0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, +0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, +0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, +0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, +0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, +0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, +0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 
-0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, -0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, -0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x70, -0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, -0x6e, 0x74, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x2a, 0x61, 0x64, 0x64, -0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, +0x3d, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, +0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x20, 0x3d, 0x20, -0x70, 0x2e, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, -0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x2b, 0x20, 0x76, -0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, -0x63, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x28, -0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, -0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, -0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, 0x2e, -0x77, 0x29, 0x3b, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x70, 0x2e, -0x77, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x29, 0x3b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x70, +0x2e, 0x61, 0x20, 0x2b, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, -0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, 0x70, +0x78, 0x63, 0x68, 0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, +0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, +0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, +0x20, 0x61, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, +0x6c, 0x65, 0x20, 0x28, 0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 
0x20, +0x61, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, -0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, -0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x61, -0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, +0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, -0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, -0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, -0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, -0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, -0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, -0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, -0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, -0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, -0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, -0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, +0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, +0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, +0x79, 0x70, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, +0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, +0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, +0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, +0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c, 0x20, +0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, -0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, +0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, +0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, +0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, +0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, +0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, +0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, -0x6f, 0x6e, 0x67, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 
0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, +0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x77, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, -0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x2a, 0x61, -0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, +0x61, 0x20, 0x3d, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x20, -0x3d, 0x20, 0x70, 0x2e, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x77, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x2b, -0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, +0x20, 0x70, 0x2e, 0x61, 0x20, 0x2b, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x28, -0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, -0x70, 0x61, 0x63, 0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, -0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, -0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, -0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, +0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x63, 0x6d, 0x70, +0x78, 0x63, 0x68, 0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, +0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, +0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, +0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x70, 0x2e, 0x77, 0x20, 0x21, +0x3d, 0x20, 0x61, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, -0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, +0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, -0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, -0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, -0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, -0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, -0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, -0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, -0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, -0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, -0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, -0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, -0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, -0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, -0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, -0x7b, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, +0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, +0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, +0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, +0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 
0x67, 0x74, 0x79, 0x70, 0x65, +0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, +0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, +0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c, 0x20, +0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, +0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, +0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, +0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, +0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, +0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, +0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x77, 0x3b, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, +0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x7d, 0x20, 0x70, 0x2c, 0x20, 
0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, +0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, -0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, -0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x70, 0x2e, 0x61, 0x3b, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, +0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, -0x0a, 0x2f, 0x2a, 0x20, 0x67, 
0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, -0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x67, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6c, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, -0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, 0x67, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, -0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, 0x6c, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, -0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, -0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, -0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, -0x2c, 0x20, 0x62, 
0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, -0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, +0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, +0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, +0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x6e, 0x2e, 0x77, +0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, +0x6e, 0x20, 0x70, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, +0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 
0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x6c, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, -0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, -0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, +0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, +0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 
0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x6c, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, +0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x28, 
0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, -0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x2a, 0x2f, -0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, -0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x66, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x66, -0x6c, 0x6f, 0x61, 0x74, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, -0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, -0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x2c, 0x20, 0x67, 0x61, -0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2c, 0x20, 0x6c, 0x6f, 0x63, -0x61, 0x6c, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, -0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, +0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 
0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, -0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x2f, 0x0a, +0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, +0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, +0x6f, 0x61, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, +0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, +0x2c, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2c, +0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, +0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, +0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, +0x66, 0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, +0x74, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, +0x6c, 0x65, 0x20, 0x2a, 
0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, +0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x2c, +0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, +0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, +0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, +0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, +0x64, 0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, +0x6c, 0x65, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, -0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, -0x64, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, -0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, -0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, -0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x2c, 0x20, 0x67, 0x61, -0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x6c, 0x6f, -0x63, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, +0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, +0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x67, 0x6c, 0x6f, +0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, -0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, -0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x64, 0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, -0x6f, 0x75, 
0x62, 0x6c, 0x65, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, -0x6c, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, -0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, -0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, -0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, -0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x2f, 0x2a, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, +0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, +0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, +0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, +0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, +0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, +0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
+0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, +0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, +0x61, 0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, +0x69, 0x6e, 0x74, 0x20, 0x69, 0x64, 0x78, 0x20, 0x3d, 0x20, 0x28, +0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, +0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3e, 0x3e, 0x20, +0x31, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, -0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, -0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x67, 0x61, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, -0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, -0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x64, -0x78, 0x20, 0x3d, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, -0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, -0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, +0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, +0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, +0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, +0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 
0x2a, 0x29, +0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, +0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, +0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x76, 0x6f, -0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, -0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, -0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, -0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, -0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, -0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, -0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, +0x5b, 0x32, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, -0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x6f, +0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, 0x32, 0x5d, 0x3b, 0x20, 0x20, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, +0x66, 0x6f, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x76, 0x61, +0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, -0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6f, 0x3b, 0x20, 0x20, 0x20, +0x76, 0x61, 0x6c, 0x20, 0x3d, 0x20, 0x6c, 0x6f, 0x61, 
0x64, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x28, 0x26, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, -0x74, 0x20, 0x66, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, +0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x3d, 0x20, -0x6c, 0x6f, 0x61, 0x64, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x26, -0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, -0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x61, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, +0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, -0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x66, 0x6f, 0x20, 0x3d, 0x20, 0x6c, 0x6f, 0x61, 0x64, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x28, 0x26, 0x6f, 0x2e, 0x68, 0x5b, 0x69, 0x64, +0x78, 0x5d, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x2e, 0x69, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x20, 0x3d, 0x20, 0x6c, -0x6f, 0x61, 0x64, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x26, 0x6f, -0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x29, 0x3b, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, +0x78, 0x5d, 0x20, 0x3d, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x28, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x2b, +0x20, 0x66, 0x6f, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x6e, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, +0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x2e, 0x69, 0x2c, +0x20, 0x6e, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, +0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x6f, 0x2e, 0x69, 0x20, 0x21, +0x3d, 0x20, 0x61, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, -0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x26, -0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x2c, 0x20, 0x66, -0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x66, 0x6f, 0x29, 0x3b, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, -0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, -0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, -0x20, 0x61, 0x2e, 0x69, 0x2c, 0x20, 0x6e, 0x2e, 0x69, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, +0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, -0x6f, 0x2e, 0x69, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x2e, 0x69, 0x29, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, +0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, -0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, -0x64, 0x78, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, +0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, +0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, +0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, +0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, +0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, +0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, +0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, +0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x64, 0x78, 0x20, 0x3d, +0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, +0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3e, +0x3e, 0x20, 0x31, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x76, 0x6f, 0x6c, 0x61, 0x74, +0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, +0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, +0x20, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, +0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, +0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, +0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, +0x3b, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, +0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, -0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, -0x6d, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, -0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, -0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, -0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 
0x29, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, +0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, -0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, -0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, -0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, -0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, -0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, -0x69, 0x64, 0x78, 0x20, 0x3d, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, -0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, -0x20, 0x32, 0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, -0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, -0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, 0x6c, 0x61, -0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, -0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, -0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, -0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x68, 0x5b, 0x32, 0x5d, 0x3b, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, +0x20, 0x6f, 0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, 0x32, 0x5d, 0x3b, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, +0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, 0x20, 0x61, 0x2c, -0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, -0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x61, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, +0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, -0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x6e, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x2e, 0x69, -0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x68, +0x5b, 0x69, 0x64, 0x78, 0x5d, 0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x69, 0x20, 0x3d, 0x20, -0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, 0x70, 0x78, +0x63, 0x68, 0x67, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, +0x2e, 0x69, 0x2c, 0x20, 0x6e, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x20, -0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x6f, 0x2e, +0x69, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, -0x2e, 0x69, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x62, 0x61, -0x73, 0x65, 0x2c, 0x20, 0x61, 0x2e, 0x69, 0x2c, 0x20, 0x6e, 0x2e, -0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, -0x65, 0x20, 0x28, 0x6f, 0x2e, 0x69, 0x20, 0x21, 0x3d, 0x20, 0x61, -0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, +0x75, 0x72, 0x6e, 0x20, 0x6f, 0x2e, 
0x68, 0x5b, 0x69, 0x64, 0x78, +0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6f, 0x2e, -0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, -0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x61, -0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, +0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, 0x2c, 0x20, +0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, +0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x6c, +0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, +0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, 0x67, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, -0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x68, 0x6c, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, -0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, -0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 
0x5f, 0x68, 0x67, 0x2c, 0x20, 0x67, -0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, -0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, -0x6c, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x00}; +0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, 0x6c, 0x2c, 0x20, 0x6c, 0x6f, +0x63, 0x61, 0x6c, 0x29, 0x0a, 0x00}; diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index c083a6420f..e9bf84cd94 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -227,7 +227,7 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx, for (j = 0; j < n; j++) { if (is_array(args[j]) && ISSET(args[j].flags, GE_WRITE)) { if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) { - strb_appendf(&sb, "store_half((GLOBAL_MEM ga_half *)(((GLOBAL_MEM char *)%s_data) + %s_p), %s);\n", + strb_appendf(&sb, "*(GLOBAL_MEM ga_half *)(((GLOBAL_MEM char *)%s_data) + %s_p) = store_half(%s);\n", args[j].name, args[j].name, args[j].name); } else { strb_appendf(&sb, "*(GLOBAL_MEM %s *)(((GLOBAL_MEM char *)%s_data) + %s_p) = %s;\n", @@ -522,7 +522,7 @@ static int gen_elemwise_contig_kernel(GpuKernel *k, if (is_array(args[j])) { if (ISSET(args[j].flags, GE_WRITE)) { if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) { - strb_appendf(&sb, "store_half(&%s_p[i], %s);\n", args[j].name, args[j].name); + strb_appendf(&sb, "%s_p[i] = store_half(%s);\n", args[j].name, args[j].name); } else { strb_appendf(&sb, "%s_p[i] = %s;\n", args[j].name, args[j].name); } From eee574a8871a92359da3116dc24afa5e0933f251 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 14 Aug 2017 17:39:56 -0400 Subject: [PATCH 454/597] Switch to quotes for compatibility with OpenCL. 
--- pygpu/reduction.py | 2 +- src/gpuarray_array.c | 2 +- src/gpuarray_blas_cuda_cublas.c | 12 ++++++------ src/gpuarray_elemwise.c | 4 ++-- src/gpuarray_reduction.c | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pygpu/reduction.py b/pygpu/reduction.py index df9482a072..8726006d19 100644 --- a/pygpu/reduction.py +++ b/pygpu/reduction.py @@ -31,7 +31,7 @@ def _ceil_log2(x): basic_kernel = Template(""" -#include +#include "cluda.h" ${preamble} diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index d9b41e0eee..68fc52da36 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -384,7 +384,7 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str, } apos = 0; - strb_appendf(&sb, "#include \n" + strb_appendf(&sb, "#include \"cluda.h\"\n" "KERNEL void take1(GLOBAL_MEM %s *r, ga_size r_off, " "GLOBAL_MEM const %s *v, ga_size v_off,", gpuarray_get_type(a->typecode)->cluda_name, diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 7a7c593ebf..49a543d8eb 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -77,7 +77,7 @@ typedef struct _blas_handle { #define LARGE_VAL(v) (v >= INT_MAX) static const char *code_sgemvBH_N_a1_b1_small = \ - "#include \n" \ + "#include \"cluda.h\"\n" \ "KERNEL void sgemv(const float *A[], size_t lda, " \ " const float *x[], size_t incx, " \ " float *y[], size_t incy, " \ @@ -101,7 +101,7 @@ static const char *code_sgemvBH_N_a1_b1_small = \ "}\n"; static const char *code_sgemvBH_T_a1_b1_small = \ - "#include \n" \ + "#include \"cluda.h\"\n" \ "KERNEL void sgemv(const float *A[], size_t lda, " \ " const float *x[], size_t incx, " \ " float *y[], size_t incy, " \ @@ -121,7 +121,7 @@ static const char *code_sgemvBH_T_a1_b1_small = \ "}\n"; static const char *code_dgemvBH_N_a1_b1_small = \ - "#include \n" \ + "#include \"cluda.h\"\n" \ "KERNEL void dgemv(const double *A[], size_t lda, " \ " const double *x[], size_t incx, " \ " 
double *y[], size_t incy, " \ @@ -145,7 +145,7 @@ static const char *code_dgemvBH_N_a1_b1_small = \ "}\n"; static const char *code_dgemvBH_T_a1_b1_small = \ - "#include \n" \ + "#include \"cluda.h\"\n" \ "KERNEL void dgemv(const double *A[], size_t lda, " \ " const double *x[], size_t incx, " \ " double *y[], size_t incy, " \ @@ -165,7 +165,7 @@ static const char *code_dgemvBH_T_a1_b1_small = \ "}\n"; static const char *code_sgerBH_gen_small = \ - "#include \n" \ + "#include \"cluda.h\"\n" \ "KERNEL void _sgerBH_gen_small(" \ " const float *x[], size_t incx," \ " const float *y[], size_t incy," \ @@ -181,7 +181,7 @@ static const char *code_sgerBH_gen_small = \ "}\n"; static const char *code_dgerBH_gen_small = \ - "#include \n" \ + "#include \"cluda.h\"\n" \ "KERNEL void _dgerBH_gen_small(" \ " const double *x[], size_t incx, " \ " const double *y[], size_t incy," \ diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index e9bf84cd94..9addb2ec2a 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -154,7 +154,7 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx, p = 0; - strb_appends(&sb, "#include \n"); + strb_appends(&sb, "#include \"cluda.h\"\n"); if (preamble) strb_appends(&sb, preamble); strb_appends(&sb, "\nKERNEL void elem(const ga_size n, "); @@ -469,7 +469,7 @@ static int gen_elemwise_contig_kernel(GpuKernel *k, p = 0; - strb_appends(&sb, "#include \n"); + strb_appends(&sb, "#include \"cluda.h\"\n"); if (preamble) strb_appends(&sb, preamble); strb_appends(&sb, "\nKERNEL void elem(const ga_size n, "); diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 6cccaaba97..fc4fc56975 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -322,7 +322,7 @@ static int maxandargmaxGenSource (maxandargmax_ctx* ctx){ return ctx->ret=GA_NO_ERROR; } static void maxandargmaxAppendKernel (maxandargmax_ctx* ctx){ - strb_appends (&ctx->s, "#include \n"); + strb_appends (&ctx->s, "#include 
\"cluda.h\"\n"); maxandargmaxAppendTypedefs (ctx); maxandargmaxAppendPrototype (ctx); strb_appends (&ctx->s, "{\n"); From 9761a4545d701e3db289b1f449e1c3af0ca9296c Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 14 Aug 2017 18:38:35 -0400 Subject: [PATCH 455/597] Standardize and use less confusing names. --- src/cluda_cuda.h | 4 +- src/cluda_cuda.h.c | 612 ++++++++++++++++---------------- src/cluda_opencl.h | 10 +- src/cluda_opencl.h.c | 765 ++++++++++++++++++++-------------------- src/gpuarray_elemwise.c | 8 +- 5 files changed, 700 insertions(+), 699 deletions(-) diff --git a/src/cluda_cuda.h b/src/cluda_cuda.h index 60d64f64c7..985a6ac4e4 100644 --- a/src/cluda_cuda.h +++ b/src/cluda_cuda.h @@ -46,8 +46,8 @@ struct ga_half { ga_ushort data; }; -#define load_half(p) __half2float((p)->data) -__device__ static inline ga_half store_half(float f) { +#define ga_half2float(p) __half2float((p).data) +__device__ static inline ga_half ga_float2half(float f) { ga_half r; r.data = __float2half_rn(f); return r; diff --git a/src/cluda_cuda.h.c b/src/cluda_cuda.h.c index 6ebb4987d8..9cefe7ce36 100644 --- a/src/cluda_cuda.h.c +++ b/src/cluda_cuda.h.c @@ -110,280 +110,338 @@ static const char cluda_cuda_h[] = { 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x61, 0x64, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x28, 0x70, 0x29, 0x20, 0x5f, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x28, 0x70, -0x29, 0x2d, 0x3e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x0a, 0x5f, 0x5f, -0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x73, 0x74, -0x61, 0x74, 0x69, 0x63, 0x20, 0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x73, 0x74, -0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x66, 0x6c, -0x6f, 0x61, 0x74, 0x20, 0x66, 0x29, 
0x20, 0x7b, 0x0a, 0x20, 0x20, -0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x72, 0x3b, 0x0a, -0x20, 0x20, 0x72, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, -0x5f, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, -0x66, 0x5f, 0x72, 0x6e, 0x28, 0x66, 0x29, 0x3b, 0x0a, 0x20, 0x20, -0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, 0x3b, 0x0a, 0x7d, -0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, -0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, -0x79, 0x70, 0x65, 0x2c, 0x20, 0x77, 0x74, 0x79, 0x70, 0x65, 0x29, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x5f, 0x5f, 0x64, 0x65, -0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x61, 0x72, 0x67, 0x74, -0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x61, 0x72, -0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, -0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, -0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x70, 0x29, 0x20, +0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, +0x74, 0x28, 0x28, 0x70, 0x29, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, +0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, +0x20, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20, 0x69, 0x6e, 0x6c, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, +0x61, 0x6c, 0x66, 0x28, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, +0x29, 0x20, 0x7b, 0x0a, 
0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x2e, 0x64, +0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x66, 0x6c, 0x6f, +0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, 0x6e, 0x28, +0x66, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, +0x6e, 0x20, 0x72, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, +0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, +0x77, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, -0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, +0x5f, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, +0x61, 0x6d, 0x65, 0x28, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, +0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, +0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, +0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x77, 0x74, -0x79, 0x70, 0x65, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, -0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 
0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x2a, 0x61, 0x64, -0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x77, 0x74, 0x79, 0x70, 0x65, 0x20, 0x77, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, +0x20, 0x3d, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, -0x2e, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x2b, 0x20, -0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x28, 0x77, 0x74, 0x79, -0x70, 0x65, 0x20, 
0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, -0x70, 0x2e, 0x77, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, -0x28, 0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, 0x6e, 0x2e, 0x77, -0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, -0x6e, 0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, +0x70, 0x2e, 0x61, 0x20, 0x2b, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, -0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, -0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, -0x70, 0x65, 0x29, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, -0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x75, -0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, -0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, -0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, -0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, -0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x67, 0x65, 0x6e, 0x5f, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, -0x6d, 0x65, 
0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, -0x2c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, -0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, -0x79, 0x70, 0x65, 0x2c, 0x20, 0x77, 0x74, 0x79, 0x70, 0x65, 0x29, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x5f, 0x5f, 0x64, 0x65, -0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x61, 0x72, 0x67, 0x74, -0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x61, 0x72, -0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, -0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, -0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, +0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, +0x53, 0x28, 0x28, 0x77, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x29, +0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x70, 0x2e, 0x77, 0x2c, 0x20, +0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, +0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x70, 0x2e, 0x77, 0x20, +0x21, 0x3d, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, -0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
+0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x61, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x77, 0x74, -0x79, 0x70, 0x65, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, +0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x67, +0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, +0x79, 0x70, 0x65, 0x2c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, +0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, +0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, +0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, +0x29, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, +0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x75, 0x6e, 0x73, +0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, +0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, +0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, +0x77, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 
0x65, 0x5f, +0x5f, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, +0x61, 0x6d, 0x65, 0x28, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, +0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, +0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, +0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, -0x20, 0x6e, 0x2c, 0x20, 0x70, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, +0x20, 0x20, 0x20, 0x20, 0x77, 0x74, 0x79, 0x70, 0x65, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, -0x28, 0x77, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x29, 0x61, 0x64, -0x64, 0x72, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x6e, 0x2c, 0x20, 0x70, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, -0x75, 0x72, 0x6e, 0x20, 0x70, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, +0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x28, 0x77, 0x74, 0x79, 0x70, +0x65, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x6e, +0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x70, +0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, +0x6d, 0x33, 0x32, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, +0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, +0x29, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, +0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x75, 0x6e, +0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 
0x20, 0x67, 0x65, -0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x78, 0x63, +0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, -0x20, 0x69, 0x6e, 0x74, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, -0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, -0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, -0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, -0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x75, 0x6e, 0x73, -0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, -0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, -0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, -0x64, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 
0x78, 0x63, 0x68, 0x67, -0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, -0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, -0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, -0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x6c, -0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, -0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, -0x2c, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, +0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x29, +0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, +0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x41, 0x64, 
0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6c, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, -0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, -0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, -0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, -0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, -0x68, 0x28, 0x61, 
0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, -0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, -0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x2a, 0x2f, +0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, 0x6c, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, +0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x67, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x28, 0x61, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, -0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, -0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 
0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, -0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, -0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, -0x2f, 0x0a, 0x23, 0x69, 0x66, 0x20, 0x5f, 0x5f, 0x43, 0x55, 0x44, -0x41, 0x5f, 0x41, 0x52, 0x43, 0x48, 0x5f, 0x5f, 0x20, 0x3c, 0x20, -0x36, 0x30, 0x30, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, -0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, -0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x29, 0x0a, 0x23, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, +0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, -0x65, 0x6c, 0x73, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, +0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, +0x2a, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, +0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, +0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, +0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 
+0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, +0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x6c, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, +0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, +0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, +0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, +0x6f, 0x61, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 
0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, +0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, +0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, +0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, +0x20, 0x5f, 0x5f, 0x43, 0x55, 0x44, 0x41, 0x5f, 0x41, 0x52, 0x43, +0x48, 0x5f, 0x5f, 0x20, 0x3c, 0x20, 0x36, 0x30, 0x30, 0x0a, 0x67, +0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, +0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, +0x62, 0x6c, 0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, -0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, +0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, 0x6c, 0x73, 0x65, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 
0x2c, 0x20, 0x62, 0x2c, +0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, +0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x65, 0x6e, +0x64, 0x69, 0x66, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, +0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x2c, +0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, +0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x5f, 0x68, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, +0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, +0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, +0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, +0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, +0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, +0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, +0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x73, +0x75, 0x6d, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, 0x20, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74, 0x6d, +0x70, 0x3b, 0x0a, 0x20, 0x20, 
0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, +0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, +0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, +0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, +0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, +0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, +0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, +0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, +0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, +0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x73, 0x75, 0x6d, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x66, +0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, +0x6e, 0x28, 0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, +0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, 0x2e, 0x64, 0x61, 0x74, +0x61, 0x29, 0x20, 0x2b, 0x20, 0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x74, 0x6d, 0x70, 0x2e, +0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, +0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, +0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x28, 0x28, 0x67, +0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, +0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x35, +0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, 0x35, +0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, +0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, +0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x73, +0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x29, +0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, +0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, +0x3d, 0x20, 0x6f, 
0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x74, +0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, +0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, +0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, +0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, +0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, +0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, +0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, +0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, -0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x67, 0x65, -0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, -0x75, 0x62, 0x6c, 0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, -0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, -0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, -0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, 0x28, 0x67, 0x61, +0x61, 0x64, 0x64, 0x5f, 0x68, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, +0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x0a, 0x5f, +0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, +0x61, 
0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, @@ -393,93 +451,35 @@ static const char cluda_cuda_h[] = { 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, -0x65, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x6e, 0x65, -0x77, 0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, -0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, -0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, -0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6d, -0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, -0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, -0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, -0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, -0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, -0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, -0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x6d, 0x20, -0x3d, 0x20, 0x5f, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, -0x61, 0x6c, 0x66, 0x5f, 0x72, 0x6e, 0x28, 0x5f, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, -0x6c, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x2b, 0x20, 0x5f, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, -0x28, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x29, -0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, -0x3d, 
0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, -0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, -0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, -0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, -0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, -0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, -0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, -0x6e, 0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, -0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, -0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, -0x3b, 0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, -0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, -0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, -0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, -0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, -0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, -0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, -0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, -0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x6c, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, -0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, -0x68, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, -0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 
0x0a, -0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, -0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, -0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, -0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, -0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, -0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, -0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, -0x77, 0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, -0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, -0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, -0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, -0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, -0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, -0x76, 0x61, 0x6c, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2c, 0x20, 0x28, -0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, -0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, -0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, -0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, -0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, -0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, -0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, -0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, -0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, -0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, -0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, -0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, -0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 
0x29, 0x61, 0x64, -0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, -0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, -0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, -0x75, 0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, 0x6c, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x5f, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x00}; +0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, 0x20, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74, 0x6d, +0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, +0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, +0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, +0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, +0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, +0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x2e, 0x64, +0x61, 0x74, 0x61, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, +0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, +0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, +0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, +0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, +0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, +0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, +0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, +0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, +0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, +0x74, 0x65, 0x5f, 
0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, +0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, +0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, +0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, +0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, +0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x5f, 0x68, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x00}; diff --git a/src/cluda_opencl.h b/src/cluda_opencl.h index 2936d7d1c9..0d384429d4 100644 --- a/src/cluda_opencl.h +++ b/src/cluda_opencl.h @@ -42,8 +42,8 @@ typedef struct _ga_half { half data; } ga_half; -#define load_half(p) vload_half(0, &(p)->data) -static inline ga_half store_half(ga_float f) { +#define ga_half2float(p) vload_half(0, &((p).data)) +static inline ga_half ga_float2half(ga_float f) { ga_half r; vstore_half_rtn(f, 0, &r.data); return r; @@ -139,13 +139,13 @@ gen_atom64_xchg(atom_xchg_dl, ga_double, local) } o, a, n; \ float fo; \ float fval; \ - fval = load_half(&val); \ + fval = ga_half2float(val); \ o.i = *base; \ do { \ a.i = o.i; \ - fo = load_half(&o.h[idx]); \ + fo = ga_half2float(o.h[idx]); \ n.i = o.i; \ - n.h[idx] = store_half(fval + fo); \ + n.h[idx] = ga_float2half(fval + fo); \ o.i = atomic_cmpxchg(base, a.i, n.i); \ } while (o.i != a.i); \ return n.h[idx]; \ diff --git a/src/cluda_opencl.h.c b/src/cluda_opencl.h.c index 17fb900b36..b34a216bd8 100644 --- a/src/cluda_opencl.h.c +++ b/src/cluda_opencl.h.c @@ -101,646 +101,647 @@ static const char cluda_opencl_h[] = { 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x7d, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 
0x6c, 0x66, 0x3b, -0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x6c, -0x6f, 0x61, 0x64, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x70, 0x29, -0x20, 0x76, 0x6c, 0x6f, 0x61, 0x64, 0x5f, 0x68, 0x61, 0x6c, 0x66, -0x28, 0x30, 0x2c, 0x20, 0x26, 0x28, 0x70, 0x29, 0x2d, 0x3e, 0x64, -0x61, 0x74, 0x61, 0x29, 0x0a, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, -0x20, 0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, -0x68, 0x61, 0x6c, 0x66, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, -0x68, 0x61, 0x6c, 0x66, 0x28, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, -0x61, 0x74, 0x20, 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x72, 0x3b, 0x0a, 0x20, -0x20, 0x76, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x5f, 0x72, 0x74, 0x6e, 0x28, 0x66, 0x2c, 0x20, 0x30, 0x2c, -0x20, 0x26, 0x72, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, -0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, 0x3b, -0x0a, 0x7d, 0x0a, 0x0a, 0x23, 0x70, 0x72, 0x61, 0x67, 0x6d, 0x61, -0x20, 0x4f, 0x50, 0x45, 0x4e, 0x43, 0x4c, 0x5f, 0x45, 0x58, 0x54, -0x45, 0x4e, 0x53, 0x49, 0x4f, 0x4e, 0x20, 0x63, 0x6c, 0x5f, 0x6b, -0x68, 0x72, 0x5f, 0x69, 0x6e, 0x74, 0x36, 0x34, 0x5f, 0x62, 0x61, -0x73, 0x65, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x73, 0x3a, -0x20, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x0a, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, -0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, -0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, -0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, -0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, -0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, -0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 
0x2a, 0x2c, 0x20, 0x61, 0x72, -0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, -0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, -0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, -0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, -0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, -0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, -0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, +0x74, 0x28, 0x70, 0x29, 0x20, 0x76, 0x6c, 0x6f, 0x61, 0x64, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x28, 0x30, 0x2c, 0x20, 0x26, 0x28, 0x28, +0x70, 0x29, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x0a, 0x73, +0x74, 0x61, 0x74, 0x69, 0x63, 0x20, 0x69, 0x6e, 0x6c, 0x69, 0x6e, +0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x67, +0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, +0x66, 0x28, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, +0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x76, 0x73, +0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, +0x74, 0x6e, 0x28, 0x66, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x26, 0x72, +0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, +0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, 0x3b, 0x0a, 0x7d, 0x0a, +0x0a, 0x23, 0x70, 0x72, 0x61, 0x67, 0x6d, 0x61, 0x20, 0x4f, 0x50, +0x45, 0x4e, 0x43, 0x4c, 0x5f, 0x45, 0x58, 0x54, 0x45, 0x4e, 0x53, +0x49, 0x4f, 0x4e, 0x20, 
0x63, 0x6c, 0x5f, 0x6b, 0x68, 0x72, 0x5f, +0x69, 0x6e, 0x74, 0x36, 0x34, 0x5f, 0x62, 0x61, 0x73, 0x65, 0x5f, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x73, 0x3a, 0x20, 0x65, 0x6e, +0x61, 0x62, 0x6c, 0x65, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, +0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, +0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, +0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, +0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, +0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, +0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, +0x70, 0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, +0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, +0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, +0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, +0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, +0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, +0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, +0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x69, 0x6e, 0x74, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, -0x3d, 0x20, 0x2a, 
0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x2a, +0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, -0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, +0x20, 0x3d, 0x20, 0x70, 0x2e, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x70, -0x2e, 0x61, 0x20, 0x2b, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x61, 0x20, +0x2b, 0x20, 0x76, 
0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, 0x70, -0x78, 0x63, 0x68, 0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, -0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, -0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, -0x20, 0x61, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, -0x6c, 0x65, 0x20, 0x28, 0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, -0x61, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, +0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, +0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, +0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x2c, +0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, +0x28, 0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, -0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, +0x20, 0x6e, 
0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, -0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, -0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, -0x79, 0x70, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, -0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, -0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, -0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, -0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c, 0x20, -0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, -0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, -0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, -0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, -0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, -0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, -0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, +0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, +0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, +0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, +0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, +0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, +0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61, 0x72, 0x67, +0x74, 0x79, 0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, +0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, +0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, +0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, +0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, +0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, +0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, -0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x77, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, -0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, -0x61, 0x20, 0x3d, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 
+0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x77, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, +0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, -0x20, 0x70, 0x2e, 0x61, 0x20, 0x2b, 0x20, 0x76, 0x61, 0x6c, 0x3b, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, -0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x63, 0x6d, 0x70, -0x78, 0x63, 0x68, 0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, -0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, -0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, -0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x70, 0x2e, 0x77, 0x20, 
0x21, -0x3d, 0x20, 0x61, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, -0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, +0x61, 0x20, 0x2b, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, +0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, +0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, +0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, +0x65, 0x20, 0x28, 0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, 0x61, +0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, -0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, -0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, -0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, -0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, -0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, -0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, -0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c, 0x20, -0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, -0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, -0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, -0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, -0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, -0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, -0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, +0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, -0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, +0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, +0x79, 0x70, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, +0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 
0x20, 0x6e, 0x61, +0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, +0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, +0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61, 0x72, 0x67, +0x74, 0x79, 0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, +0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, +0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, +0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, +0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, -0x67, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x77, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, -0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, -0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x76, 0x61, +0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, -0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, -0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, -0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x6e, 0x2e, 0x77, -0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, -0x6e, 0x20, 0x70, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x28, 0x76, 0x6f, +0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, +0x63, 0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, +0x64, 0x64, 0x72, 0x2c, 0x20, 0x6e, 
0x2e, 0x77, 0x29, 0x3b, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x70, +0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, -0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x7d, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x69, +0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, +0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, +0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, +0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x69, 
0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x61, 0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x64, 0x64, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x68, 0x67, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x63, 0x68, 0x67, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, -0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, -0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 
0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x6c, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, -0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, 0x61, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, -0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, -0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 
0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, +0x61, 0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, +0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, -0x6f, 0x61, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, -0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, -0x2c, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2c, -0x20, 0x67, 0x6c, 
0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, -0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, -0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, -0x66, 0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, -0x74, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, -0x6c, 0x65, 0x20, 0x2a, 0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, +0x20, 0x2a, 0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, +0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x2c, 0x20, 0x67, +0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2c, 0x20, 0x67, 0x6c, +0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, +0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, +0x74, 
0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x2c, +0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2c, 0x20, +0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, +0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, +0x2a, 0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, +0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, +0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x67, 0x6c, +0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x2c, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, -0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, -0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, -0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, -0x64, 0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, -0x6c, 0x65, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, -0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, -0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x67, 0x6c, 
0x6f, -0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, -0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, -0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, -0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x2f, 0x2a, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, -0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, -0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, -0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, +0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, +0x62, 0x6c, 0x65, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, +0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, +0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, 0x2c, 0x20, 0x67, +0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x6c, +0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, +0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, +0x6d, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, -0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 
0x6c, 0x65, 0x20, -0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, -0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, -0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, -0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, -0x69, 0x6e, 0x74, 0x20, 0x69, 0x64, 0x78, 0x20, 0x3d, 0x20, 0x28, -0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, -0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3e, 0x3e, 0x20, -0x31, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, +0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, +0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, -0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, -0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, -0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 
0x65, 0x20, 0x61, 0x73, -0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, -0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, -0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, -0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, +0x20, 0x69, 0x64, 0x78, 0x20, 0x3d, 0x20, 0x28, 0x28, 0x67, 0x61, +0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, +0x26, 0x20, 0x32, 0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, +0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, +0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, 0x6c, +0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, +0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, +0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, +0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 
0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, -0x5b, 0x32, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x6f, -0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, 0x32, 0x5d, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, -0x66, 0x6f, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, 0x20, 0x61, +0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x76, 0x61, -0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6f, 0x3b, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, -0x76, 0x61, 0x6c, 0x20, 0x3d, 0x20, 0x6c, 0x6f, 0x61, 0x64, 0x5f, -0x68, 0x61, 0x6c, 0x66, 0x28, 0x26, 0x76, 0x61, 0x6c, 0x29, 0x3b, +0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, -0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x76, 0x61, 0x6c, +0x20, 0x3d, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, +0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x2a, +0x62, 0x61, 0x73, 0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x61, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, -0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x66, 0x6f, 0x20, 0x3d, 0x20, 0x6c, 0x6f, 0x61, 0x64, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x28, 0x26, 0x6f, 0x2e, 0x68, 0x5b, 0x69, 0x64, -0x78, 0x5d, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x61, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x69, -0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x20, +0x3d, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, +0x6c, 0x6f, 0x61, 0x74, 0x28, 0x6f, 0x2e, 0x68, 0x5b, 0x69, 0x64, +0x78, 0x5d, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, -0x78, 0x5d, 0x20, 0x3d, 0x20, 0x73, 0x74, 
0x6f, 0x72, 0x65, 0x5f, -0x68, 0x61, 0x6c, 0x66, 0x28, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x2b, -0x20, 0x66, 0x6f, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x69, 0x20, 0x3d, 0x20, +0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, -0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x2e, 0x69, 0x2c, -0x20, 0x6e, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, -0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x6f, 0x2e, 0x69, 0x20, 0x21, -0x3d, 0x20, 0x61, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x20, +0x3d, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, +0x68, 0x61, 0x6c, 0x66, 0x28, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x2b, +0x20, 0x66, 0x6f, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, +0x2e, 0x69, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x62, 0x61, +0x73, 0x65, 0x2c, 0x20, 0x61, 0x2e, 0x69, 0x2c, 0x20, 0x6e, 0x2e, +0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, -0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x3b, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, +0x65, 0x20, 0x28, 0x6f, 0x2e, 0x69, 0x20, 0x21, 0x3d, 0x20, 0x61, +0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, -0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, +0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, +0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, +0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, -0x61, 0x6d, 0x65, 0x28, 0x76, 
0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, -0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x67, 0x61, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, -0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, -0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, -0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, -0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x64, 0x78, 0x20, 0x3d, -0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, -0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3e, -0x3e, 0x20, 0x31, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x76, 0x6f, 0x6c, 0x61, 0x74, -0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, -0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, -0x20, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, -0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, -0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, -0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, -0x3b, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, -0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, +0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, +0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, +0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, +0x6e, 0x74, 0x20, 0x69, 0x64, 0x78, 0x20, 0x3d, 0x20, 0x28, 0x28, +0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, +0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, -0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, +0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, +0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, +0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, +0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, +0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, +0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, +0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, -0x20, 0x68, 0x5b, 0x32, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, -0x20, 0x6f, 0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, +0x32, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, -0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, +0x20, 0x61, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x2a, +0x62, 0x61, 0x73, 0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x61, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, -0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x6e, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, +0x61, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x68, -0x5b, 0x69, 0x64, 0x78, 0x5d, 0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x69, +0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, 0x70, 0x78, -0x63, 0x68, 0x67, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, -0x2e, 0x69, 0x2c, 0x20, 0x6e, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, +0x78, 0x5d, 0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x6f, 0x2e, -0x69, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, +0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x2e, 0x69, 0x2c, +0x20, 0x6e, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, -0x75, 0x72, 0x6e, 0x20, 0x6f, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, -0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, +0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x6f, 0x2e, 0x69, 0x20, 0x21, +0x3d, 0x20, 0x61, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, -0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, 0x2c, 0x20, -0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, -0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x6c, -0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, -0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x68, 0x67, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, -0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, 0x6c, 0x2c, 0x20, 0x6c, 0x6f, -0x63, 0x61, 0x6c, 0x29, 0x0a, 0x00}; +0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, +0x20, 0x6f, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x3b, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x7d, 0x0a, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, +0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, 0x2c, 0x20, 0x67, 0x6c, 0x6f, +0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, +0x6f, 0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x6c, 
0x2c, 0x20, 0x6c, +0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, +0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, 0x67, +0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, +0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x5f, 0x68, 0x6c, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, +0x29, 0x0a, 0x00}; diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index 9addb2ec2a..2c51ffa723 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -213,7 +213,7 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx, GA_FLOAT : args[j].typecode), args[j].name); if (ISSET(args[j].flags, GE_READ)) { if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) { - strb_appendf(&sb, "%s = load_half((GLOBAL_MEM ga_half *)(((GLOBAL_MEM char *)%s_data) + %s_p));\n", + strb_appendf(&sb, "%s = ga_half2float(*(GLOBAL_MEM ga_half *)(((GLOBAL_MEM char *)%s_data) + %s_p));\n", args[j].name, args[j].name, args[j].name); } else { strb_appendf(&sb, "%s = *(GLOBAL_MEM %s *)(((GLOBAL_MEM char *)%s_data) + %s_p);\n", @@ -227,7 +227,7 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx, for (j = 0; j < n; j++) { if (is_array(args[j]) && ISSET(args[j].flags, GE_WRITE)) { if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) { - strb_appendf(&sb, "*(GLOBAL_MEM ga_half *)(((GLOBAL_MEM char *)%s_data) + %s_p) = store_half(%s);\n", + strb_appendf(&sb, "*(GLOBAL_MEM ga_half *)(((GLOBAL_MEM char *)%s_data) + %s_p) = ga_float2half(%s);\n", args[j].name, args[j].name, args[j].name); } else { strb_appendf(&sb, "*(GLOBAL_MEM %s *)(((GLOBAL_MEM char *)%s_data) + %s_p) = %s;\n", @@ -508,7 +508,7 @@ static int gen_elemwise_contig_kernel(GpuKernel *k, GA_FLOAT : args[j].typecode), args[j].name); if (ISSET(args[j].flags, GE_READ)) 
{ if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) { - strb_appendf(&sb, "%s = load_half(&%s_p[i]);\n", args[j].name, args[j].name); + strb_appendf(&sb, "%s = ga_half2float(%s_p[i]);\n", args[j].name, args[j].name); } else { strb_appendf(&sb, "%s = %s_p[i];\n", args[j].name, args[j].name); } @@ -522,7 +522,7 @@ static int gen_elemwise_contig_kernel(GpuKernel *k, if (is_array(args[j])) { if (ISSET(args[j].flags, GE_WRITE)) { if (args[j].typecode == GA_HALF && ISSET(gen_flags, GEN_CONVERT_F16)) { - strb_appendf(&sb, "%s_p[i] = store_half(%s);\n", args[j].name, args[j].name); + strb_appendf(&sb, "%s_p[i] = ga_float2half(%s);\n", args[j].name, args[j].name); } else { strb_appendf(&sb, "%s_p[i] = %s;\n", args[j].name, args[j].name); } From 36d17ddf947603d860c3020ab87eb927ff3470f7 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 14 Aug 2017 19:17:54 -0400 Subject: [PATCH 456/597] Update the docs to mention the OpenCL 1.2 requirement and add a note specifying that OS X is broken. --- doc/installation.rst | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/doc/installation.rst b/doc/installation.rst index 880decfa47..a65824493b 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -1,8 +1,8 @@ Installation ============ -The library is routinely tested on OS X and linux and, less -frequently, on Windows. The OS most frequently tested are: +The library is routinely tested on linux and, less frequently, on +Windows and Mac OS X. The OS most frequently tested are: - Debian 6 - Ubuntu 16.04 @@ -55,9 +55,17 @@ functionality. * For OpenCL: - - OpenCL version 1.1 or more + - OpenCL version 1.2 or more - (optional) clBLAS (clblas_) or CLBlast (clblast_) for blas functionality + .. note:: + + The OpenCL that comes with OS X is fundamentally broken and + doesn't work with some of the kernels in the library. You can + use it at your own risk, but don't report problems with it we + can't fix them. 
+ + Download -------- From c32f19068ba1a6180eb13d836934084e7bb53d1d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 16 Aug 2017 12:22:16 -0400 Subject: [PATCH 457/597] Use the same char as numpy for simplicity. --- src/cluda_cuda.h | 8 ++++---- src/cluda_cuda.h.c | 12 ++++++------ src/cluda_opencl.h | 8 ++++---- src/cluda_opencl.h.c | 8 ++++---- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/cluda_cuda.h b/src/cluda_cuda.h index 985a6ac4e4..c32cbfd34c 100644 --- a/src/cluda_cuda.h +++ b/src/cluda_cuda.h @@ -120,7 +120,7 @@ gen_atom64_add(atom_add_dg, ga_double) gen_atom64_xchg(atom_xchg_dg, ga_double) #define atom_xchg_dl(a, b) atom_xchg_dg(a, b) /* ga_half */ -__device__ ga_half atom_add_hg(ga_half *addr, ga_half val) { +__device__ ga_half atom_add_eg(ga_half *addr, ga_half val) { ga_uint *base = (ga_uint *)((ga_size)addr & ~2); ga_uint old, assumed, sum, new_; ga_half tmp; @@ -135,9 +135,9 @@ __device__ ga_half atom_add_hg(ga_half *addr, ga_half val) { tmp.data = __byte_perm(old, 0, ((ga_size)addr & 2) ? 0x4432 : 0x4410); return tmp; } -#define atom_add_hl(a, b) atom_add_hg(a, b) +#define atom_add_el(a, b) atom_add_eg(a, b) -__device__ ga_half atom_xchg_hg(ga_half *addr, ga_half val) { +__device__ ga_half atom_xchg_eg(ga_half *addr, ga_half val) { ga_uint *base = (ga_uint *)((ga_size)addr & ~2); ga_uint old, assumed, new_; ga_half tmp; @@ -150,4 +150,4 @@ __device__ ga_half atom_xchg_hg(ga_half *addr, ga_half val) { tmp.data = __byte_perm(old, 0, ((ga_size)addr & 2) ? 
0x4432 : 0x4410); return tmp; } -#define atom_xchg_hl(a, b) atom_xchg_hg(a, b) +#define atom_xchg_el(a, b) atom_xchg_eg(a, b) diff --git a/src/cluda_cuda.h.c b/src/cluda_cuda.h.c index 9cefe7ce36..e70a5fe23f 100644 --- a/src/cluda_cuda.h.c +++ b/src/cluda_cuda.h.c @@ -384,7 +384,7 @@ static const char cluda_cuda_h[] = { 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x68, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x5f, 0x65, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, @@ -436,12 +436,12 @@ static const char cluda_cuda_h[] = { 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x68, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x61, 0x64, 0x64, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, -0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x0a, 0x5f, +0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, 0x67, 0x28, 0x67, 0x61, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, @@ -480,6 +480,6 @@ static const char cluda_cuda_h[] = { 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 
0x75, 0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x68, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x68, +0x67, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x00}; diff --git a/src/cluda_opencl.h b/src/cluda_opencl.h index 0d384429d4..4f66444924 100644 --- a/src/cluda_opencl.h +++ b/src/cluda_opencl.h @@ -170,7 +170,7 @@ gen_atom64_xchg(atom_xchg_dl, ga_double, local) return o.h[idx]; \ } -gen_atomh_add(atom_add_hg, global) -gen_atomh_add(atom_add_hl, local) -gen_atomh_xchg(atom_xchg_hg, global) -gen_atomh_xchg(atom_xchg_hl, local) +gen_atomh_add(atom_add_eg, global) +gen_atomh_add(atom_add_el, local) +gen_atomh_xchg(atom_xchg_eg, global) +gen_atomh_xchg(atom_xchg_el, local) diff --git a/src/cluda_opencl.h.c b/src/cluda_opencl.h.c index b34a216bd8..9a184d94be 100644 --- a/src/cluda_opencl.h.c +++ b/src/cluda_opencl.h.c @@ -733,15 +733,15 @@ static const char cluda_opencl_h[] = { 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x68, 0x67, 0x2c, 0x20, 0x67, 0x6c, 0x6f, +0x61, 0x64, 0x64, 0x5f, 0x65, 0x67, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x68, 0x6c, 0x2c, 0x20, 0x6c, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x6c, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 
0x68, 0x67, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x68, 0x6c, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, +0x67, 0x5f, 0x65, 0x6c, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x00}; From 8dcb1867f3e163c1de913f4dc66b3c69738ebc84 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 16 Aug 2017 15:55:40 -0400 Subject: [PATCH 458/597] Small fixes --- pygpu/basic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pygpu/basic.py b/pygpu/basic.py index fe8a7bb1ea..51b92c5cc4 100644 --- a/pygpu/basic.py +++ b/pygpu/basic.py @@ -4,6 +4,7 @@ def _generate_kernel(ctx, cols, upper=True): tmpl = Template(""" + #include "cluda.h" KERNEL void extract_tri(GLOBAL_MEM ga_float *a, ga_size a_off, ga_uint N) { a = (GLOBAL_MEM ga_float *)(((char *)a) + a_off); unsigned int idx = GID_1 * LDIM_0 * GDIM_0 + From 2005304846344c5a61f011dafe5533e5cdb6b3bb Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 16 Aug 2017 19:14:46 -0400 Subject: [PATCH 459/597] Fix pygpu tests. 
--- pygpu/tests/test_basic.py | 4 ++-- pygpu/tests/test_gpu_ndarray.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pygpu/tests/test_basic.py b/pygpu/tests/test_basic.py index 17361278b4..040ea228fa 100644 --- a/pygpu/tests/test_basic.py +++ b/pygpu/tests/test_basic.py @@ -63,14 +63,14 @@ def run_3d_tril(self): def run_noncontiguous_tril(self): a = numpy.random.rand(5, 5) - a = a[::-1] b = pygpu.array(a, context=context) + b = b[::-1] assert b.flags.c_contiguous is b.flags.f_contiguous is False tril(b) def run_noncontiguous_triu(self): a = numpy.random.rand(5, 5) - a = a[::-1] b = pygpu.array(a, context=context) + b = b[::-1] assert b.flags.c_contiguous is b.flags.f_contiguous is False triu(b) diff --git a/pygpu/tests/test_gpu_ndarray.py b/pygpu/tests/test_gpu_ndarray.py index 562371a2fe..02e93ef001 100644 --- a/pygpu/tests/test_gpu_ndarray.py +++ b/pygpu/tests/test_gpu_ndarray.py @@ -811,8 +811,9 @@ def test_GpuContext(self): pickle.dumps(ctx, protocol=-1) def test_GpuKernel(self): - k = GpuKernel("KERNEL void nothing(GLOBAL_MEM ga_float *in) " - "{in[0] = 0;}", "nothing", [], context=ctx) + k = GpuKernel("#include \"cluda.h\"\nKERNEL void " + "k(GLOBAL_MEM ga_float *in)" + "{in[0] = 0;}", "k", [], context=ctx) with self.assertRaises(RuntimeError): pickle.dumps(k) with self.assertRaises(RuntimeError): From 9e8c4c0dfa11bb1e2143d04dbf0edfe5452dc2e7 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 16 Aug 2017 20:43:22 -0400 Subject: [PATCH 460/597] Remove GpuArray_from_host_data since it is broken. 
--- pygpu/gpuarray.pxd | 13 ------------- pygpu/gpuarray.pyx | 35 +++++++++-------------------------- pygpu/tests/support.py | 3 ++- src/gpuarray/array.h | 5 ----- src/gpuarray_array.c | 36 ------------------------------------ 5 files changed, 11 insertions(+), 81 deletions(-) diff --git a/pygpu/gpuarray.pxd b/pygpu/gpuarray.pxd index 5180f56d94..136ef52940 100644 --- a/pygpu/gpuarray.pxd +++ b/pygpu/gpuarray.pxd @@ -172,10 +172,6 @@ cdef extern from "gpuarray/array.h": gpudata *data, size_t offset, int typecode, unsigned int nd, const size_t *dims, const ssize_t *strides, int writable) - int GpuArray_copy_from_host(_GpuArray *a, - gpucontext *ctx, void *buf, int typecode, - unsigned int nd, const size_t *dims, - const ssize_t *strides) nogil int GpuArray_view(_GpuArray *v, _GpuArray *a) int GpuArray_sync(_GpuArray *a) nogil int GpuArray_index(_GpuArray *r, _GpuArray *a, const ssize_t *starts, @@ -241,10 +237,6 @@ cdef int array_fromdata(GpuArray a, gpudata *data, size_t offset, int typecode, unsigned int nd, const size_t *dims, const ssize_t *strides, int writeable) except -1 -cdef int array_copy_from_host(GpuArray a, - gpucontext *ctx, void *buf, int typecode, - unsigned int nd, const size_t *dims, - const ssize_t *strides) except -1 cdef int array_view(GpuArray v, GpuArray a) except -1 cdef int array_sync(GpuArray a) except -1 cdef int array_index(GpuArray r, GpuArray a, const ssize_t *starts, @@ -295,11 +287,6 @@ cdef api GpuArray pygpu_zeros(unsigned int nd, const size_t *dims, cdef api GpuArray pygpu_empty(unsigned int nd, const size_t *dims, int typecode, ga_order order, GpuContext context, object cls) -cdef api GpuArray pygpu_fromhostdata(void *buf, int typecode, unsigned int nd, - const size_t *dims, - const ssize_t *strides, - GpuContext context, object cls) - cdef api GpuArray pygpu_fromgpudata(gpudata *buf, size_t offset, int typecode, unsigned int nd, const size_t *dims, const ssize_t *strides, GpuContext context, diff --git a/pygpu/gpuarray.pyx 
b/pygpu/gpuarray.pyx index d505e70d5a..2f8338adb1 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -308,17 +308,6 @@ cdef int array_fromdata(GpuArray a, if err != GA_NO_ERROR: raise get_exc(err), gpucontext_error(gpudata_context(data), err) -cdef int array_copy_from_host(GpuArray a, - gpucontext *ctx, void *buf, int typecode, - unsigned int nd, const size_t *dims, - const ssize_t *strides) except -1: - cdef int err - with nogil: - err = GpuArray_copy_from_host(&a.ga, ctx, buf, typecode, nd, dims, - strides); - if err != GA_NO_ERROR: - raise get_exc(err), gpucontext_error(ctx, err) - cdef int array_view(GpuArray v, GpuArray a) except -1: cdef int err err = GpuArray_view(&v.ga, &a.ga) @@ -702,17 +691,6 @@ cdef GpuArray pygpu_empty(unsigned int nd, const size_t *dims, int typecode, array_empty(res, context.ctx, typecode, nd, dims, order) return res -cdef GpuArray pygpu_fromhostdata(void *buf, int typecode, unsigned int nd, - const size_t *dims, const ssize_t *strides, - GpuContext context, object cls): - cdef GpuArray res - context = ensure_context(context) - - res = new_GpuArray(cls, context, None) - array_copy_from_host(res, context.ctx, buf, typecode, nd, - dims, strides) - return res - cdef GpuArray pygpu_fromgpudata(gpudata *buf, size_t offset, int typecode, unsigned int nd, const size_t *dims, const ssize_t *strides, GpuContext context, @@ -984,7 +962,6 @@ cdef carray(proto, dtype, copy, order, unsigned int ndmin, cdef GpuArray arg cdef GpuArray tmp cdef np.ndarray a - cdef ga_order ord if isinstance(proto, GpuArray): arg = proto @@ -1028,12 +1005,18 @@ cdef carray(proto, dtype, copy, order, unsigned int ndmin, context = ensure_context(context) + # We need a contiguous array for the copy + if order != 'C' and order != 'F': + order = 'C' + a = numpy.array(proto, dtype=dtype_to_npdtype(dtype), order=order, ndmin=ndmin, copy=False) - return pygpu_fromhostdata(np.PyArray_DATA(a), dtype_to_typecode(a.dtype), - np.PyArray_NDIM(a), np.PyArray_DIMS(a), - 
np.PyArray_STRIDES(a), context, cls) + res = pygpu_empty(np.PyArray_NDIM(a), np.PyArray_DIMS(a), + dtype_to_typecode(a.dtype), to_ga_order(order), + context, cls) + array_write(res, np.PyArray_DATA(a), np.PyArray_NBYTES(a)) + return res cdef void (*cuda_enter)(gpucontext *) cdef void (*cuda_exit)(gpucontext *) diff --git a/pygpu/tests/support.py b/pygpu/tests/support.py index 611b68d24d..2eda88737f 100644 --- a/pygpu/tests/support.py +++ b/pygpu/tests/support.py @@ -137,10 +137,11 @@ def gen_gpuarray(shape_orig, dtype='float32', offseted_outer=False, a += incr a = numpy.asarray(a, dtype=dtype) + b = gpuarray.array(a, context=ctx, cls=cls) assert order in ['c', 'f'] if order == 'f' and len(shape) > 0: a = numpy.asfortranarray(a) - b = gpuarray.array(a, context=ctx, cls=cls) + b = gpuarray.asfortranarray(b) if order == 'f' and len(shape) > 0 and b.size > 1: assert b.flags['F_CONTIGUOUS'] diff --git a/src/gpuarray/array.h b/src/gpuarray/array.h index 7c659964cd..a99366a7c4 100644 --- a/src/gpuarray/array.h +++ b/src/gpuarray/array.h @@ -268,11 +268,6 @@ GPUARRAY_PUBLIC int GpuArray_fromdata(GpuArray *a, const size_t *dims, const ssize_t *strides, int writeable); -GPUARRAY_PUBLIC int GpuArray_copy_from_host(GpuArray *a, - gpucontext *ctx, void *buf, int typecode, - unsigned int nd, const size_t *dims, - const ssize_t *strides); - /** * Initialize an array structure to provide a view of another. 
* diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index 68fc52da36..56f217a158 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -217,42 +217,6 @@ int GpuArray_fromdata(GpuArray *a, gpudata *data, size_t offset, int typecode, return GA_NO_ERROR; } -int GpuArray_copy_from_host(GpuArray *a, gpucontext *ctx, void *buf, - int typecode, unsigned int nd, const size_t *dims, - const ssize_t *strides) { - char *base = (char *)buf; - size_t offset = 0; - size_t size = gpuarray_get_elsize(typecode); - gpudata *b; - int err; - unsigned int i; - - if (typecode == GA_SIZE || typecode == GA_SSIZE) - return error_set(ctx->err, GA_VALUE_ERROR, "Cannot create array with size type"); - - for (i = 0; i < nd; i++) { - if (dims[i] == 0) { - size = 0; - base = (char *)buf; - break; - } - - if (strides[i] < 0) - base += (dims[i]-1) * strides[i]; - else - size += (dims[i]-1) * strides[i]; - } - offset = (char *)buf - base; - size += offset; - - b = gpudata_alloc(ctx, size, base, GA_BUFFER_INIT, &err); - if (b == NULL) return err; - - err = GpuArray_fromdata(a, b, offset, typecode, nd, dims, strides, 1); - gpudata_release(b); - return err; -} - int GpuArray_view(GpuArray *v, const GpuArray *a) { gpucontext *ctx = GpuArray_context(a); v->data = a->data; From 2d12f1491a0e20759db786e5311fe0aad3f56fd3 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 17 Aug 2017 13:01:46 -0400 Subject: [PATCH 461/597] Handle offset on output. 
--- pygpu/reduction.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pygpu/reduction.py b/pygpu/reduction.py index 8726006d19..2880ff375b 100644 --- a/pygpu/reduction.py +++ b/pygpu/reduction.py @@ -37,7 +37,8 @@ def _ceil_log2(x): #define REDUCE(a, b) (${reduce_expr}) -KERNEL void ${name}(const unsigned int n, ${out_arg.decltype()} out +KERNEL void ${name}(const unsigned int n, ${out_arg.decltype()} out, + const unsigned int out_off % for d in range(nd): , const unsigned int dim${d} % endfor @@ -64,6 +65,8 @@ def _ceil_log2(x): ${arg.name}_data = (${arg.decltype()})tmp; % endif % endfor + tmp = (GLOBAL_MEM char *)out; tmp += out_off; + out = (${out_arg.decltype()})tmp; i = GID_0; % for i in range(nd-1, -1, -1): @@ -125,6 +128,7 @@ def _ceil_log2(x): ldata[lid] = REDUCE(ldata[lid], ldata[lid+${cur_size}]); } % endwhile + local_barrier(); if (lid == 0) out[GID_0] = ldata[0]; } """) @@ -224,7 +228,7 @@ def _gen_basic(self, ls, nd): redux=self.redux, neutral=self.neutral, map_expr=self.expression) - spec = ['uint32', gpuarray.GpuArray] + spec = ['uint32', gpuarray.GpuArray, 'uint32'] spec.extend('uint32' for _ in range(nd)) for i, arg in enumerate(self.arguments): spec.append(arg.spec()) @@ -274,7 +278,7 @@ def __call__(self, *args, **kwargs): else: k, _, _, ls = self._get_basic_kernel(2**_ceil_log2(n), nd) - kargs = [n, out] + kargs = [n, out, out.offset] kargs.extend(dims) for i, arg in enumerate(args): kargs.append(arg) From 4de2eed5b3507edb1217e3d981fa2bd4d17035dc Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 17 Aug 2017 13:41:16 -0400 Subject: [PATCH 462/597] Fix extcopy for float16. 
--- src/gpuarray_array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index 56f217a158..ef97bbd476 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -61,7 +61,7 @@ static int ga_extcopy(GpuArray *dst, const GpuArray *src) { gargs[1].name = "dst"; gargs[1].typecode = dst->typecode; gargs[1].flags = GE_WRITE; - k = GpuElemwise_new(ctx, "", "dst = src", 2, gargs, 0, 0); + k = GpuElemwise_new(ctx, "", "dst = src", 2, gargs, 0, GE_CONVERT_F16); if (k == NULL) return ctx->err->code; aa = memdup(&a, sizeof(a)); From 2b95688c2289580e1a40c92325309cc446485318 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 17 Aug 2017 15:38:17 -0400 Subject: [PATCH 463/597] Final test fixes. --- pygpu/tests/test_gpu_ndarray.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pygpu/tests/test_gpu_ndarray.py b/pygpu/tests/test_gpu_ndarray.py index 02e93ef001..a7fc03a611 100644 --- a/pygpu/tests/test_gpu_ndarray.py +++ b/pygpu/tests/test_gpu_ndarray.py @@ -100,8 +100,9 @@ def test_transfer_not_contiguous(): @guard_devsup def transfer_not_contiguous(shp, dtype): a = numpy.random.rand(*shp) * 10 - a = a[::-1] b = pygpu.array(a, context=ctx) + a = a[::-1] + b = b[::-1] c = numpy.asarray(b) assert numpy.allclose(c, a) @@ -121,11 +122,12 @@ def test_transfer_fortran(): @guard_devsup def transfer_fortran(shp, dtype): a = numpy.random.rand(*shp) * 10 + b = pygpu.array(a, context=ctx) a_ = numpy.asfortranarray(a) if len(shp) > 1: assert a_.strides != a.strides a = a_ - b = pygpu.array(a, context=ctx) + b = pygpu.asfortranarray(b) c = numpy.asarray(b) assert a.shape == b.shape == c.shape From 1e059f846ae6cb55da16072ca9ea153d912ededc Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 17 Aug 2017 22:40:23 -0400 Subject: [PATCH 464/597] Don't use unions for aliasing on CUDA. 
--- src/cluda_cuda.h | 69 +++-- src/cluda_cuda.h.c | 637 ++++++++++++++++++++------------------------- 2 files changed, 321 insertions(+), 385 deletions(-) diff --git a/src/cluda_cuda.h b/src/cluda_cuda.h index c32cbfd34c..197789e5b3 100644 --- a/src/cluda_cuda.h +++ b/src/cluda_cuda.h @@ -53,37 +53,6 @@ __device__ static inline ga_half ga_float2half(float f) { return r; } -#define gen_atom_add(name, argtype, wtype) \ - __device__ argtype name(argtype *addr, argtype val) { \ - union { \ - argtype a; \ - wtype w; \ - } p, n; \ - p.a = *addr; \ - do { \ - n.a = p.a + val; \ - p.w = atomicCAS((wtype *)addr, p.w, n.w); \ - } while (p.w != n.w); \ - return n.a; \ - } - -#define gen_atom32_add(name, argtype) gen_atom_add(name, argtype, unsigned int) -#define gen_atom64_add(name, argtype) gen_atom_add(name, argtype, unsigned long long) - -#define gen_atom_xchg(name, argtype, wtype) \ - __device__ argtype name(argtype *addr, argtype val) { \ - union { \ - argtype a; \ - wtype w; \ - } n, p; \ - n.a = val; \ - p.w = atomicExch((wtype *)addr, n.w); \ - return p.a; \ - } - -#define gen_atom32_xchg(name, argtype) gen_atom_xchg(name, argtype, unsigned int) -#define gen_atom64_xchg(name, argtype) gen_atom_xchg(name, argtype, unsigned long long) - /* ga_int */ #define atom_add_ig(a, b) atomicAdd(a, b) #define atom_add_il(a, b) atomicAdd(a, b) @@ -95,9 +64,22 @@ __device__ static inline ga_half ga_float2half(float f) { #define atom_xchg_Ig(a, b) atomicExch(a, b) #define atom_xchg_Il(a, b) atomicExch(a, b) /* ga_long */ -gen_atom64_add(atom_add_lg, ga_long) +__device__ ga_long atom_add_lg(ga_long *addr, ga_long val) { + unsigned long long *waddr = (unsigned long long *)addr; + unsigned long long old = *waddr; + unsigned long long assumed; + do { + assumed = old; + old = atomicCAS(waddr, assumed, (val + (ga_long)(assumed))); + } while (assumed != old); + return (ga_long)old; +} #define atom_add_ll(a, b) atom_add_lg(a, b) -gen_atom64_xchg(atom_xchg_lg, ga_long) +__device__ ga_long 
atom_xchg_lg(ga_long *addr, ga_long val) { + unsigned long long res; + res = atomicExch((unsigned long long *)addr, val); + return (ga_long)res; +} #define atom_xchg_ll(a, b) atom_xchg_lg(a, b) /* ga_ulong */ #define atom_add_Lg(a, b) atomicAdd(a, b) @@ -111,13 +93,26 @@ gen_atom64_xchg(atom_xchg_lg, ga_long) #define atom_xchg_fl(a, b) atomicExch(a, b) /* ga_double */ #if __CUDA_ARCH__ < 600 -gen_atom64_add(atom_add_dg, ga_double) +__device__ ga_double atom_add_dg(ga_double *addr, ga_double val) { + unsigned long long *waddr = (unsigned long long *)addr; + unsigned long long old = *waddr; + unsigned long long assumed; + do { + assumed = old; + old = atomicCAS(waddr, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} #define atom_add_dl(a, b) atom_add_dg(a, b) #else -#define atom_add_dg(a, b, c) atomicAdd(a, b, c) -#define atom_add_dl(a, b, c) atomicAdd(a, b, c) +#define atom_add_dg(a, b) atomicAdd(a, b) +#define atom_add_dl(a, b) atomicAdd(a, b) #endif -gen_atom64_xchg(atom_xchg_dg, ga_double) +__device__ ga_double atom_xchg_dg(ga_double *addr, ga_double val) { + unsigned long long res; + res = atomicExch((unsigned long long *)addr, __double_as_longlong(val)); + return __longlong_as_double(res); +} #define atom_xchg_dl(a, b) atom_xchg_dg(a, b) /* ga_half */ __device__ ga_half atom_add_eg(ga_half *addr, ga_half val) { diff --git a/src/cluda_cuda.h.c b/src/cluda_cuda.h.c index e70a5fe23f..a46acbd716 100644 --- a/src/cluda_cuda.h.c +++ b/src/cluda_cuda.h.c @@ -124,362 +124,303 @@ static const char cluda_cuda_h[] = { 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, 0x6e, 0x28, 0x66, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, -0x6e, 0x20, 0x72, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 
0x6e, 0x61, 0x6d, 0x65, -0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, -0x77, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, -0x5f, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, -0x61, 0x6d, 0x65, 0x28, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, -0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, -0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, -0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x77, 0x74, 0x79, 0x70, 0x65, 0x20, 0x77, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 
0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, -0x20, 0x3d, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, -0x70, 0x2e, 0x61, 0x20, 0x2b, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, -0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, -0x53, 0x28, 0x28, 0x77, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x29, -0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x70, 0x2e, 0x77, 0x2c, 0x20, -0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, -0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x70, 0x2e, 0x77, 0x20, -0x21, 0x3d, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x61, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, -0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, -0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x67, -0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, -0x79, 0x70, 0x65, 0x2c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, -0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x29, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, -0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, -0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, -0x29, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, -0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x75, 0x6e, 0x73, -0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, -0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, -0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, -0x77, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, -0x5f, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, -0x61, 0x6d, 0x65, 0x28, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, -0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, -0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, -0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x77, 0x74, 0x79, 0x70, 0x65, 0x20, 0x77, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x6e, 0x2c, 0x20, 0x70, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, -0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, -0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x28, 0x77, 0x74, 0x79, 0x70, -0x65, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x6e, -0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 
0x20, -0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x70, -0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, -0x6d, 0x33, 0x32, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, -0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, -0x29, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, -0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x75, 0x6e, -0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x29, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, -0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, -0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x20, 0x67, 0x65, 0x6e, 0x5f, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, -0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, -0x65, 0x2c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, -0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x29, -0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, -0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6c, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, -0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 
0x62, 0x29, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, 0x67, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, 0x6c, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, -0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x67, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x6c, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, -0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, +0x6e, 0x20, 0x72, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, +0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, -0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, -0x2a, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, -0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, -0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 
0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, -0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, -0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, -0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x6c, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, -0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, +0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, +0x61, 0x2c, 0x20, 0x62, 
0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, +0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, -0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x64, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, -0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, -0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, -0x6f, 0x61, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, -0x64, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 
0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, -0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, -0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, -0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, -0x20, 0x5f, 0x5f, 0x43, 0x55, 0x44, 0x41, 0x5f, 0x41, 0x52, 0x43, -0x48, 0x5f, 0x5f, 0x20, 0x3c, 0x20, 0x36, 0x30, 0x30, 0x0a, 0x67, -0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, -0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, -0x62, 0x6c, 0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, -0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, 0x6c, 0x73, 0x65, 0x0a, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, 0x5f, 0x64, +0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, +0x6c, 
0x6f, 0x6e, 0x67, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, +0x6e, 0x67, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, +0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61, 0x6c, 0x29, +0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, +0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, 0x20, 0x3d, 0x20, +0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, +0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, +0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, +0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, +0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, +0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x73, 0x73, 0x75, +0x6d, 0x65, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, +0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x77, 0x61, 0x64, 0x64, +0x72, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, +0x20, 0x28, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x28, 0x67, 0x61, +0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x28, 0x61, 0x73, 0x73, 0x75, +0x6d, 0x65, 0x64, 0x29, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, +0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, +0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, +0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, +0x20, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x6f, +0x6c, 0x64, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 
0x64, +0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, +0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, +0x6e, 0x67, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x5f, 0x6c, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, +0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, +0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, +0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x72, 0x65, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x73, +0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, +0x63, 0x68, 0x28, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, +0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x76, 0x61, +0x6c, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, +0x6e, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, +0x72, 0x65, 0x73, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, +0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, +0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, +0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, 0x61, 
0x74, 0x6f, 0x6d, -0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, -0x20, 0x63, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, -0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x2c, 0x20, 0x63, 0x29, 0x0a, 0x23, 0x65, 0x6e, -0x64, 0x69, 0x66, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, -0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x2c, -0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x29, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, -0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x65, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, -0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, -0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, -0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, -0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, -0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, -0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, -0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, -0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, -0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x73, -0x75, 0x6d, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, 0x20, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 
0x6c, 0x66, 0x20, 0x74, 0x6d, -0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, -0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, -0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, -0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, -0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, -0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, -0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, -0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, -0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, -0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x73, 0x75, 0x6d, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x66, -0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, -0x6e, 0x28, 0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, -0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, 0x2e, 0x64, 0x61, 0x74, -0x61, 0x29, 0x20, 0x2b, 0x20, 0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, -0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x74, 0x6d, 0x70, 0x2e, -0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, -0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, -0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x28, 0x28, 0x67, -0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, -0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x35, -0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, 0x35, -0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, -0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, -0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x73, -0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x29, -0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, -0x20, 0x28, 0x61, 0x73, 
0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, -0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x74, -0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, -0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, -0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, -0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, -0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, -0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, -0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, -0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, +0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, +0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, +0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, +0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x2a, +0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, +0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, +0x65, 0x66, 
0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, +0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, +0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, +0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, +0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x20, 0x5f, 0x5f, 0x43, 0x55, +0x44, 0x41, 0x5f, 0x41, 0x52, 0x43, 0x48, 0x5f, 0x5f, 0x20, 0x3c, +0x20, 0x36, 0x30, 0x30, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, +0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, +0x62, 0x6c, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x64, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, +0x62, 0x6c, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, +0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x76, +0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, +0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, +0x20, 0x3d, 0x20, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, +0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, +0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, +0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6f, 0x6c, 0x64, +0x20, 0x3d, 0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, +0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, +0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x64, 
+0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, +0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x77, +0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, +0x65, 0x64, 0x2c, 0x20, 0x5f, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, +0x65, 0x5f, 0x61, 0x73, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, +0x6e, 0x67, 0x28, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x5f, 0x5f, +0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, +0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x61, 0x73, 0x73, +0x75, 0x6d, 0x65, 0x64, 0x29, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, +0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, +0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, +0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, +0x6e, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, +0x67, 0x5f, 0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, +0x28, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, -0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x0a, 0x5f, +0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, +0x6c, 0x73, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 
0x29, 0x0a, +0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x5f, 0x5f, 0x64, 0x65, +0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x64, +0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x28, 0x67, 0x61, 0x5f, +0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, +0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, +0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, +0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, +0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x72, 0x65, 0x73, +0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x28, +0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, +0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, +0x64, 0x64, 0x72, 0x2c, 0x20, 0x5f, 0x5f, 0x64, 0x6f, 0x75, 0x62, +0x6c, 0x65, 0x5f, 0x61, 0x73, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, +0x6f, 0x6e, 0x67, 0x28, 0x76, 0x61, 0x6c, 0x29, 0x29, 0x3b, 0x0a, +0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x5f, 0x5f, +0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, +0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x72, 0x65, 0x73, +0x29, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x28, 0x67, 0x61, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, -0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 
0x6c, 0x66, 0x20, 0x76, -0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, -0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, -0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, -0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, -0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, -0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, -0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, -0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, 0x20, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74, 0x6d, -0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, -0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, -0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, -0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, -0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, -0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x2e, 0x64, -0x61, 0x74, 0x61, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, -0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, -0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, -0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, -0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, -0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, -0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, -0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, -0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, -0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, -0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, -0x2c, 0x20, 0x30, 0x2c, 0x20, 
0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, -0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, -0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, -0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, -0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, -0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x67, 0x28, 0x67, 0x61, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, +0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, +0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, +0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, +0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, +0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, +0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, +0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x6e, 0x65, 0x77, +0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, +0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, +0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, +0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6d, 0x70, +0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, +0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, +0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, +0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, +0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, +0x32, 0x20, 0x3a, 
0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, +0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x6d, 0x20, 0x3d, +0x20, 0x5f, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, +0x6c, 0x66, 0x5f, 0x72, 0x6e, 0x28, 0x5f, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, +0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x2b, 0x20, 0x5f, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, +0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, +0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, +0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, +0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, +0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, +0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, +0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, +0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, +0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, +0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, +0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, +0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, +0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, +0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, +0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, +0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, +0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, +0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, +0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 
0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x6c, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, +0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x00}; +0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, +0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, +0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, +0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, +0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, +0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, +0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, +0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, +0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, +0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, +0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, +0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, +0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, +0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, +0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x76, +0x61, 0x6c, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2c, 0x20, 0x28, 0x28, +0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, +0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, +0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, +0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, +0x64, 
0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, +0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, +0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, +0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, +0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, +0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, +0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, +0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, +0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, +0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, +0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, +0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, +0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, +0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 0x00}; From 6105ff2652cc83adea443303f736004f97b8e319 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 18 Aug 2017 12:36:52 -0400 Subject: [PATCH 465/597] Make head.py work on python2. 
--- src/head.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/head.py b/src/head.py index 2202ed542b..26cec125ea 100644 --- a/src/head.py +++ b/src/head.py @@ -18,7 +18,7 @@ def convert(src, dst): f.write(b'static const char %s[] = {\n' % (src_name.encode('utf-8'),)) first = True n = 0 - for b in src_data: + for b in bytearray(src_data): if b == 0: raise ValueError('NUL in file') if first: @@ -29,7 +29,6 @@ def convert(src, dst): wrt(f, n, 0) f.write(b'};\n') - if __name__ == '__main__': import sys convert(sys.argv[1], sys.argv[1] + '.c') From 1c2ba1b796f23ad0a56e0157213e8a75c5d0ea25 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 18 Aug 2017 13:14:48 -0400 Subject: [PATCH 466/597] Install mako on appveyor to be able to use the generation scripts. --- .appveyor.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.appveyor.yml b/.appveyor.yml index f419bc5480..2018f0d80a 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,4 +1,4 @@ -version: '0.6.9.{build}' +version: '0.7.0.{build}' pull_requests: do_not_increment_build_number: true @@ -26,6 +26,9 @@ environment: - PYTHON: "C:\\Python35" VS_PATH: "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC" +install: + - python -m pip install mako cython + build_script: - echo "Python:" "%PYTHON%" - echo "Config:" "%CONFIGURATION%" From 7f89cec6625beb21c539889a615cadaa6d131de5 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 18 Aug 2017 16:57:31 -0400 Subject: [PATCH 467/597] Add some math constants that are provided by OpenCL. 
--- src/cluda_cuda.h | 17 + src/cluda_cuda.h.c | 751 ++++++++++++++++++++++++--------------------- 2 files changed, 425 insertions(+), 343 deletions(-) diff --git a/src/cluda_cuda.h b/src/cluda_cuda.h index 197789e5b3..f02ef682af 100644 --- a/src/cluda_cuda.h +++ b/src/cluda_cuda.h @@ -4,6 +4,7 @@ #define GLOBAL_MEM /* empty */ #define LOCAL_MEM __shared__ #define LOCAL_MEM_ARG /* empty */ +#define MAXFLOAT 3.402823466E+38F #ifdef NAN #undef NAN #endif @@ -13,6 +14,22 @@ #undef INFINITY #endif #define INFINITY __int_as_float(0x7f800000) +#define HUGE_VALF INFINITY +#define HUGE_VAL __longlong_as_double(0x7ff0000000000000) + +#define M_E 2.7182818284590452354 +#define M_LOG2E 1.4426950408889634074 +#define M_LOG10E 0.43429448190325182765 +#define M_LN2 0.69314718055994530942 +#define M_LN10 2.30258509299404568402 +#define M_PI 3.14159265358979323846 +#define M_PI_2 1.57079632679489661923 +#define M_PI_4 0.78539816339744830962 +#define M_1_PI 0.31830988618379067154 +#define M_2_PI 0.63661977236758134308 +#define M_2_SQRTPI 1.12837916709551257390 +#define M_SQRT2 1.41421356237309504880 +#define M_SQRT1_2 0.70710678118654752440 #define LID_0 threadIdx.x #define LID_1 threadIdx.y #define LID_2 threadIdx.z diff --git a/src/cluda_cuda.h.c b/src/cluda_cuda.h.c index a46acbd716..58a8f65d68 100644 --- a/src/cluda_cuda.h.c +++ b/src/cluda_cuda.h.c @@ -18,153 +18,310 @@ static const char cluda_cuda_h[] = { 0x65, 0x64, 0x5f, 0x5f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x5f, 0x41, 0x52, 0x47, 0x20, 0x2f, 0x2a, 0x20, 0x65, 0x6d, 0x70, -0x74, 0x79, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, -0x66, 0x20, 0x4e, 0x41, 0x4e, 0x0a, 0x23, 0x75, 0x6e, 0x64, 0x65, -0x66, 0x20, 0x4e, 0x41, 0x4e, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, -0x66, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4e, -0x41, 0x4e, 0x20, 0x5f, 0x5f, 0x69, 0x6e, 0x74, 0x5f, 0x61, 0x73, -0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x30, 
0x78, 0x37, 0x66, -0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x29, 0x0a, 0x2f, 0x2a, 0x20, -0x4e, 0x55, 0x4c, 0x4c, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, -0x64, 0x65, 0x66, 0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, -0x59, 0x0a, 0x23, 0x75, 0x6e, 0x64, 0x65, 0x66, 0x20, 0x49, 0x4e, -0x46, 0x49, 0x4e, 0x49, 0x54, 0x59, 0x0a, 0x23, 0x65, 0x6e, 0x64, -0x69, 0x66, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x59, 0x20, 0x5f, 0x5f, -0x69, 0x6e, 0x74, 0x5f, 0x61, 0x73, 0x5f, 0x66, 0x6c, 0x6f, 0x61, -0x74, 0x28, 0x30, 0x78, 0x37, 0x66, 0x38, 0x30, 0x30, 0x30, 0x30, -0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x4c, 0x49, 0x44, 0x5f, 0x30, 0x20, 0x74, 0x68, 0x72, 0x65, 0x61, -0x64, 0x49, 0x64, 0x78, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, 0x31, 0x20, 0x74, -0x68, 0x72, 0x65, 0x61, 0x64, 0x49, 0x64, 0x78, 0x2e, 0x79, 0x0a, +0x74, 0x79, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x4d, 0x41, 0x58, 0x46, 0x4c, 0x4f, 0x41, 0x54, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x33, 0x2e, 0x34, +0x30, 0x32, 0x38, 0x32, 0x33, 0x34, 0x36, 0x36, 0x45, 0x2b, 0x33, +0x38, 0x46, 0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, 0x66, 0x20, 0x4e, +0x41, 0x4e, 0x0a, 0x23, 0x75, 0x6e, 0x64, 0x65, 0x66, 0x20, 0x4e, +0x41, 0x4e, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4e, 0x41, 0x4e, 0x20, +0x5f, 0x5f, 0x69, 0x6e, 0x74, 0x5f, 0x61, 0x73, 0x5f, 0x66, 0x6c, +0x6f, 0x61, 0x74, 0x28, 0x30, 0x78, 0x37, 0x66, 0x66, 0x66, 0x66, +0x66, 0x66, 0x66, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x4e, 0x55, 0x4c, +0x4c, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, 0x66, +0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x59, 0x0a, 0x23, +0x75, 0x6e, 0x64, 0x65, 0x66, 0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, +0x49, 0x54, 0x59, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 
0x65, 0x20, 0x49, 0x4e, 0x46, +0x49, 0x4e, 0x49, 0x54, 0x59, 0x20, 0x5f, 0x5f, 0x69, 0x6e, 0x74, +0x5f, 0x61, 0x73, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x30, +0x78, 0x37, 0x66, 0x38, 0x30, 0x30, 0x30, 0x30, 0x30, 0x29, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x48, 0x55, 0x47, +0x45, 0x5f, 0x56, 0x41, 0x4c, 0x46, 0x20, 0x49, 0x4e, 0x46, 0x49, +0x4e, 0x49, 0x54, 0x59, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x48, 0x55, 0x47, 0x45, 0x5f, 0x56, 0x41, 0x4c, 0x20, +0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, +0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x30, +0x78, 0x37, 0x66, 0x66, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, +0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x29, 0x0a, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x45, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x32, +0x2e, 0x37, 0x31, 0x38, 0x32, 0x38, 0x31, 0x38, 0x32, 0x38, 0x34, +0x35, 0x39, 0x30, 0x34, 0x35, 0x32, 0x33, 0x35, 0x34, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x4c, 0x4f, +0x47, 0x32, 0x45, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x31, 0x2e, 0x34, 0x34, 0x32, 0x36, 0x39, 0x35, 0x30, 0x34, 0x30, +0x38, 0x38, 0x38, 0x39, 0x36, 0x33, 0x34, 0x30, 0x37, 0x34, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x4c, +0x4f, 0x47, 0x31, 0x30, 0x45, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x30, 0x2e, 0x34, 0x33, 0x34, 0x32, 0x39, 0x34, 0x34, 0x38, +0x31, 0x39, 0x30, 0x33, 0x32, 0x35, 0x31, 0x38, 0x32, 0x37, 0x36, +0x35, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, +0x5f, 0x4c, 0x4e, 0x32, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x30, 0x2e, 0x36, 0x39, 0x33, 0x31, 0x34, 0x37, +0x31, 0x38, 0x30, 0x35, 0x35, 0x39, 0x39, 0x34, 0x35, 0x33, 0x30, +0x39, 0x34, 0x32, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x4d, 0x5f, 0x4c, 0x4e, 0x31, 0x30, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 
0x20, 0x32, 0x2e, 0x33, 0x30, 0x32, 0x35, +0x38, 0x35, 0x30, 0x39, 0x32, 0x39, 0x39, 0x34, 0x30, 0x34, 0x35, +0x36, 0x38, 0x34, 0x30, 0x32, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x50, 0x49, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x33, 0x2e, 0x31, 0x34, +0x31, 0x35, 0x39, 0x32, 0x36, 0x35, 0x33, 0x35, 0x38, 0x39, 0x37, +0x39, 0x33, 0x32, 0x33, 0x38, 0x34, 0x36, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x50, 0x49, 0x5f, 0x32, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x2e, +0x35, 0x37, 0x30, 0x37, 0x39, 0x36, 0x33, 0x32, 0x36, 0x37, 0x39, +0x34, 0x38, 0x39, 0x36, 0x36, 0x31, 0x39, 0x32, 0x33, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x50, 0x49, +0x5f, 0x34, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x30, 0x2e, 0x37, 0x38, 0x35, 0x33, 0x39, 0x38, 0x31, 0x36, 0x33, +0x33, 0x39, 0x37, 0x34, 0x34, 0x38, 0x33, 0x30, 0x39, 0x36, 0x32, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, +0x31, 0x5f, 0x50, 0x49, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x30, 0x2e, 0x33, 0x31, 0x38, 0x33, 0x30, 0x39, 0x38, +0x38, 0x36, 0x31, 0x38, 0x33, 0x37, 0x39, 0x30, 0x36, 0x37, 0x31, +0x35, 0x34, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x4d, 0x5f, 0x32, 0x5f, 0x50, 0x49, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x30, 0x2e, 0x36, 0x33, 0x36, 0x36, 0x31, +0x39, 0x37, 0x37, 0x32, 0x33, 0x36, 0x37, 0x35, 0x38, 0x31, 0x33, +0x34, 0x33, 0x30, 0x38, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x4d, 0x5f, 0x32, 0x5f, 0x53, 0x51, 0x52, 0x54, 0x50, +0x49, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x2e, 0x31, 0x32, 0x38, +0x33, 0x37, 0x39, 0x31, 0x36, 0x37, 0x30, 0x39, 0x35, 0x35, 0x31, +0x32, 0x35, 0x37, 0x33, 0x39, 0x30, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x53, 0x51, 0x52, 0x54, 0x32, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x2e, 0x34, +0x31, 0x34, 
0x32, 0x31, 0x33, 0x35, 0x36, 0x32, 0x33, 0x37, 0x33, +0x30, 0x39, 0x35, 0x30, 0x34, 0x38, 0x38, 0x30, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x53, 0x51, 0x52, +0x54, 0x31, 0x5f, 0x32, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, +0x2e, 0x37, 0x30, 0x37, 0x31, 0x30, 0x36, 0x37, 0x38, 0x31, 0x31, +0x38, 0x36, 0x35, 0x34, 0x37, 0x35, 0x32, 0x34, 0x34, 0x30, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, -0x5f, 0x32, 0x20, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x49, 0x64, -0x78, 0x2e, 0x7a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x30, 0x20, 0x62, 0x6c, 0x6f, -0x63, 0x6b, 0x44, 0x69, 0x6d, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x31, -0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x44, 0x69, 0x6d, 0x2e, 0x79, +0x5f, 0x30, 0x20, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x49, 0x64, +0x78, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x4c, 0x49, 0x44, 0x5f, 0x31, 0x20, 0x74, 0x68, 0x72, 0x65, +0x61, 0x64, 0x49, 0x64, 0x78, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, 0x32, 0x20, +0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x49, 0x64, 0x78, 0x2e, 0x7a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, -0x49, 0x4d, 0x5f, 0x32, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x44, -0x69, 0x6d, 0x2e, 0x7a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x30, 0x20, 0x62, 0x6c, 0x6f, -0x63, 0x6b, 0x49, 0x64, 0x78, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x31, 0x20, -0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x49, 0x64, 0x78, 0x2e, 0x79, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, -0x5f, 0x32, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x49, 0x64, 0x78, -0x2e, 0x7a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x47, 0x44, 0x49, 0x4d, 0x5f, 0x30, 0x20, 0x67, 0x72, 0x69, 0x64, 
-0x44, 0x69, 0x6d, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x67, -0x72, 0x69, 0x64, 0x44, 0x69, 0x6d, 0x2e, 0x79, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, -0x32, 0x20, 0x67, 0x72, 0x69, 0x64, 0x44, 0x69, 0x6d, 0x2e, 0x7a, +0x49, 0x4d, 0x5f, 0x30, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x44, +0x69, 0x6d, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x62, 0x6c, +0x6f, 0x63, 0x6b, 0x44, 0x69, 0x6d, 0x2e, 0x79, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, +0x32, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x44, 0x69, 0x6d, 0x2e, +0x7a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, +0x49, 0x44, 0x5f, 0x30, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x49, +0x64, 0x78, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x31, 0x20, 0x62, 0x6c, 0x6f, +0x63, 0x6b, 0x49, 0x64, 0x78, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x32, 0x20, +0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x49, 0x64, 0x78, 0x2e, 0x7a, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, +0x4d, 0x5f, 0x30, 0x20, 0x67, 0x72, 0x69, 0x64, 0x44, 0x69, 0x6d, +0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x47, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x67, 0x72, 0x69, 0x64, +0x44, 0x69, 0x6d, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 0x67, +0x72, 0x69, 0x64, 0x44, 0x69, 0x6d, 0x2e, 0x7a, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x6f, +0x6f, 0x6c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, +0x20, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x20, +0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x63, 0x68, 
0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, -0x5f, 0x62, 0x6f, 0x6f, 0x6c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, -0x6e, 0x65, 0x64, 0x20, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x79, -0x74, 0x65, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x63, -0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x67, 0x61, 0x5f, 0x75, 0x62, 0x79, 0x74, 0x65, 0x20, 0x75, -0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x63, 0x68, 0x61, -0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, -0x61, 0x5f, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x73, 0x68, 0x6f, -0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x75, -0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x73, 0x68, 0x6f, -0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x0a, +0x5f, 0x75, 0x62, 0x79, 0x74, 0x65, 0x20, 0x75, 0x6e, 0x73, 0x69, +0x67, 0x6e, 0x65, 0x64, 0x20, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, +0x68, 0x6f, 0x72, 0x74, 0x20, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, -0x75, 0x69, 0x6e, 0x74, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, -0x65, 0x64, 0x20, 0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, +0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x75, 0x6e, 0x73, 0x69, +0x67, 0x6e, 0x65, 0x64, 0x20, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, +0x69, 0x6e, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, +0x74, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, +0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 
0x66, 0x69, 0x6e, 0x65, +0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, +0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, +0x6e, 0x67, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, -0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, -0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, -0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6c, -0x6f, 0x61, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, -0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, -0x20, 0x73, 0x69, 0x7a, 0x65, 0x5f, 0x74, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x73, 0x69, -0x7a, 0x65, 0x20, 0x70, 0x74, 0x72, 0x64, 0x69, 0x66, 0x66, 0x5f, -0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, -0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, -0x45, 0x44, 0x5f, 0x50, 0x41, 0x52, 0x41, 0x4d, 0x28, 0x74, 0x79, -0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x23, +0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, +0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x64, 0x6f, 0x75, +0x62, 0x6c, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x73, 0x69, +0x7a, 0x65, 0x5f, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x73, 0x69, 0x7a, 0x65, 0x20, +0x70, 0x74, 0x72, 0x64, 0x69, 0x66, 0x66, 0x5f, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 
0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, -0x42, 0x4f, 0x44, 0x59, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, -0x6e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x65, 0x78, 0x74, 0x65, 0x72, -0x6e, 0x20, 0x5f, 0x5f, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x5f, -0x5f, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, -0x5b, 0x5d, 0x3b, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, -0x5a, 0x45, 0x20, 0x77, 0x61, 0x72, 0x70, 0x53, 0x69, 0x7a, 0x65, -0x0a, 0x0a, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x67, 0x61, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, -0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x64, 0x61, -0x74, 0x61, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x70, 0x29, 0x20, -0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, -0x74, 0x28, 0x28, 0x70, 0x29, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, -0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, -0x20, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20, 0x69, 0x6e, 0x6c, -0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, -0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, -0x61, 0x6c, 0x66, 0x28, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, -0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x2e, 0x64, -0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x66, 0x6c, 0x6f, -0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, 0x6e, 0x28, -0x66, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, -0x6e, 0x20, 0x72, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, -0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, +0x50, 0x41, 0x52, 0x41, 0x4d, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, +0x20, 0x6e, 0x61, 
0x6d, 0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, +0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x42, 0x4f, 0x44, +0x59, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, +0x65, 0x29, 0x20, 0x65, 0x78, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x5f, +0x5f, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x5f, 0x5f, 0x20, 0x74, +0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x5b, 0x5d, 0x3b, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, +0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x20, +0x77, 0x61, 0x72, 0x70, 0x53, 0x69, 0x7a, 0x65, 0x0a, 0x0a, 0x73, +0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, +0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, +0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, +0x6c, 0x6f, 0x61, 0x74, 0x28, 0x70, 0x29, 0x20, 0x5f, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x28, +0x70, 0x29, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x0a, 0x5f, 0x5f, +0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x73, 0x74, +0x61, 0x74, 0x69, 0x63, 0x20, 0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x67, 0x61, +0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, +0x28, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x29, 0x20, 0x7b, +0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, +0x72, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x2e, 0x64, 0x61, 0x74, 0x61, +0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, +0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, 0x6e, 0x28, 0x66, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, +0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, +0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 
0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, +0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, +0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, +0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, +0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x67, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 
0x6c, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, +0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, +0x6c, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, +0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x6c, +0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, +0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, +0x77, 0x61, 0x64, 0x64, 0x72, 0x20, 0x3d, 0x20, 0x28, 0x75, 0x6e, +0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, +0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, +0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x77, 0x61, +0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, +0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, +0x6f, 0x6e, 0x67, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, +0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, +0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, +0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x43, 0x41, 0x53, 0x28, 0x77, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, +0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x28, 0x76, +0x61, 0x6c, 0x20, 0x2b, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, +0x6e, 0x67, 0x29, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, +0x29, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, +0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 
0x75, 0x6d, 0x65, +0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, +0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x28, 0x67, +0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x6f, 0x6c, 0x64, 0x3b, +0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x6c, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, +0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, +0x67, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, +0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, +0x6e, 0x67, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, +0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, +0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x72, 0x65, +0x73, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x73, 0x20, 0x3d, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, +0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, +0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, +0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x28, +0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x72, 0x65, 0x73, +0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, +0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, +0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 
0x67, 0x28, 0x61, 0x2c, 0x20, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, -0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, -0x64, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 
0x78, 0x63, 0x68, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, 0x5f, 0x64, -0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, -0x6e, 0x67, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, -0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61, 0x6c, 0x29, +0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, +0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, +0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x2f, 0x0a, +0x23, 0x69, 0x66, 0x20, 
0x5f, 0x5f, 0x43, 0x55, 0x44, 0x41, 0x5f, +0x41, 0x52, 0x43, 0x48, 0x5f, 0x5f, 0x20, 0x3c, 0x20, 0x36, 0x30, +0x30, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, +0x5f, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, +0x67, 0x28, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, +0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, +0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, 0x20, 0x3d, 0x20, @@ -182,225 +339,90 @@ static const char cluda_cuda_h[] = { 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x77, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, -0x20, 0x28, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x28, 0x67, 0x61, -0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x28, 0x61, 0x73, 0x73, 0x75, -0x6d, 0x65, 0x64, 0x29, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, -0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, -0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, -0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, -0x20, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x6f, -0x6c, 0x64, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x20, 0x5f, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x5f, 0x61, +0x73, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x28, +0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x6e, +0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, 0x5f, 0x64, 0x6f, +0x75, 0x62, 0x6c, 0x65, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, +0x64, 0x29, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, +0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, +0x65, 0x64, 0x20, 0x21, 
0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x5f, +0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, +0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x6f, 0x6c, +0x64, 0x29, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, -0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, -0x6e, 0x67, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x6c, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, -0x67, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, -0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, -0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, -0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, -0x20, 0x72, 0x65, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x73, -0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, -0x63, 0x68, 0x28, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, -0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, -0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x76, 0x61, -0x6c, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, -0x6e, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, -0x72, 0x65, 0x73, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, -0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, -0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, -0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 
0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, -0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, -0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, -0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, -0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, -0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x2a, -0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, -0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, -0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, -0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 
-0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, -0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, -0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, -0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x20, 0x5f, 0x5f, 0x43, 0x55, -0x44, 0x41, 0x5f, 0x41, 0x52, 0x43, 0x48, 0x5f, 0x5f, 0x20, 0x3c, -0x20, 0x36, 0x30, 0x30, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, -0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, -0x62, 0x6c, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, -0x64, 0x5f, 0x64, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, +0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, 0x6c, 0x73, 0x65, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, +0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, 0x6e, +0x64, 0x69, 0x66, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, +0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, +0x6c, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x5f, 0x64, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, -0x20, 0x3d, 0x20, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 
0x65, -0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, -0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, -0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, -0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6f, 0x6c, 0x64, -0x20, 0x3d, 0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, -0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, -0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x64, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x72, 0x65, 0x73, 0x3b, 0x0a, 0x20, +0x20, 0x72, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x28, 0x75, 0x6e, 0x73, +0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, +0x2c, 0x20, 0x5f, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x5f, +0x61, 0x73, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, +0x28, 0x76, 0x61, 0x6c, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, +0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x6e, +0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, 0x5f, 0x64, 0x6f, +0x75, 0x62, 0x6c, 0x65, 0x28, 0x72, 0x65, 0x73, 0x29, 0x3b, 0x0a, +0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, 0x5f, 0x64, 0x65, +0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x65, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 
0x6c, 0x29, 0x20, +0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, +0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x67, +0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, +0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, +0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, +0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, 0x64, +0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, +0x73, 0x75, 0x6d, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, +0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74, +0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, +0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x77, -0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, -0x65, 0x64, 0x2c, 0x20, 0x5f, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, -0x65, 0x5f, 0x61, 0x73, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, -0x6e, 0x67, 0x28, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x5f, 0x5f, -0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, -0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x61, 0x73, 0x73, -0x75, 0x6d, 0x65, 0x64, 0x29, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, -0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, -0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, -0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, -0x6e, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, -0x67, 0x5f, 0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, -0x28, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 
0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, -0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, -0x6c, 0x73, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, -0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x5f, 0x5f, 0x64, 0x65, -0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x64, -0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x28, 0x67, 0x61, 0x5f, -0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, -0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, -0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, -0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, -0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x72, 0x65, 0x73, -0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x28, -0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, -0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, -0x64, 0x64, 0x72, 0x2c, 0x20, 0x5f, 0x5f, 0x64, 0x6f, 0x75, 0x62, -0x6c, 0x65, 0x5f, 0x61, 0x73, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, -0x6f, 0x6e, 0x67, 0x28, 0x76, 0x61, 0x6c, 0x29, 0x29, 0x3b, 0x0a, -0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x5f, 0x5f, -0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, -0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x72, 0x65, 0x73, -0x29, 0x3b, 0x0a, 0x7d, 
0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, -0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x67, 0x28, 0x67, 0x61, 0x5f, -0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, -0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, -0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, -0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, -0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, -0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, -0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, -0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, -0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x6e, 0x65, 0x77, -0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, -0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, -0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, -0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6d, 0x70, -0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, -0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, -0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, -0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, -0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, -0x32, 0x20, 
0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, -0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x6d, 0x20, 0x3d, -0x20, 0x5f, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, -0x6c, 0x66, 0x5f, 0x72, 0x6e, 0x28, 0x5f, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, -0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x2b, 0x20, 0x5f, 0x5f, -0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, -0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x3b, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, -0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, -0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, -0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, -0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, -0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, -0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, -0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, -0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, -0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, -0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, -0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, -0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, -0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, -0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, -0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, -0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, -0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, -0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, -0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 
-0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x6c, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, -0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, -0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, -0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, -0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, -0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, -0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, -0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, -0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, -0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, -0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, -0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, -0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, -0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, -0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, -0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, -0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x76, -0x61, 0x6c, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2c, 0x20, 0x28, 0x28, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, +0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, +0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, +0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, +0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 
0x32, 0x29, +0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, +0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x73, 0x75, 0x6d, 0x20, 0x3d, 0x20, 0x5f, 0x5f, +0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x5f, +0x72, 0x6e, 0x28, 0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, +0x6c, 0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, 0x2e, 0x64, 0x61, +0x74, 0x61, 0x29, 0x20, 0x2b, 0x20, 0x5f, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x74, 0x6d, 0x70, +0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, +0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, +0x6c, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, @@ -420,7 +442,50 @@ static const char cluda_cuda_h[] = { 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x0a, 0x00}; +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x5f, 0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x0a, +0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x28, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, +0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 
0x61, 0x6c, 0x66, 0x20, +0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, +0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, +0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, +0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, +0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, +0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, +0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, +0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, +0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74, +0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, +0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, +0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, +0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, +0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, +0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x2e, +0x64, 0x61, 0x74, 0x61, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, +0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, +0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, +0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29, +0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, +0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, +0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, +0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, +0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, +0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70, +0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, +0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, +0x64, 0x2c, 0x20, 0x30, 0x2c, 
0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, +0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, +0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, +0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, +0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, +0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, +0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x00}; From d35e1503f7c6c2ad80613adbc866dd0c12dd123e Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 18 Aug 2017 17:35:04 -0400 Subject: [PATCH 468/597] Set the default of max_cache_size to effectively infinite. --- pygpu/gpuarray.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 2f8338adb1..301cd2e058 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -6,6 +6,8 @@ from libc.string cimport strncmp cimport numpy as np import numpy as np +import sys + from cpython cimport Py_INCREF, PyNumber_Index from cpython.object cimport Py_EQ, Py_NE @@ -586,10 +588,10 @@ cdef GpuContext pygpu_init(dev, gpucontext_props *p): return res def init(dev, sched='default', single_stream=False, kernel_cache_path=None, - max_cache_size=0, initial_cache_size=0): + max_cache_size=sys.maxsize, initial_cache_size=0): """ init(dev, sched='default', single_stream=False, kernel_cache_path=None, - max_cache_size=0, initial_cache_size=0) + max_cache_size=sys.maxsize, initial_cache_size=0) Creates a context from a device specifier. From 4db646e9636aff34ee49b3e0d92b626966a8547b Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 18 Aug 2017 18:02:55 -0400 Subject: [PATCH 469/597] Changes from review. 
--- src/cluda_cuda.h | 3 +++ src/cluda_opencl.h | 4 ++++ src/gpuarray_buffer.c | 14 ++++++++------ src/gpuarray_buffer_cuda.c | 7 +++++-- src/head.py | 2 ++ 5 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/cluda_cuda.h b/src/cluda_cuda.h index f02ef682af..7e8cc201b9 100644 --- a/src/cluda_cuda.h +++ b/src/cluda_cuda.h @@ -1,3 +1,5 @@ +#ifndef CLUDA_H +#define CLUDA_H #define local_barrier() __syncthreads() #define WITHIN_KERNEL extern "C" __device__ #define KERNEL extern "C" __global__ @@ -163,3 +165,4 @@ __device__ ga_half atom_xchg_eg(ga_half *addr, ga_half val) { return tmp; } #define atom_xchg_el(a, b) atom_xchg_eg(a, b) +#endif diff --git a/src/cluda_opencl.h b/src/cluda_opencl.h index 4f66444924..18d65528d7 100644 --- a/src/cluda_opencl.h +++ b/src/cluda_opencl.h @@ -1,3 +1,5 @@ +#ifndef CLUDA_H +#define CLUDA_H #define local_barrier() barrier(CLK_LOCAL_MEM_FENCE) #define WITHIN_KERNEL /* empty */ #define KERNEL __kernel @@ -174,3 +176,5 @@ gen_atomh_add(atom_add_eg, global) gen_atomh_add(atom_add_el, local) gen_atomh_xchg(atom_xchg_eg, global) gen_atomh_xchg(atom_xchg_el, local) + +#endif diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index 537cd53e6e..e38c19795f 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -61,20 +61,22 @@ int gpucontext_props_opencl_dev(gpucontext_props *p, int platno, int devno) { } int gpucontext_props_sched(gpucontext_props *p, int sched) { - if (sched == GA_CTX_SCHED_MULTI) - FLSET(p->flags, GA_CTX_MULTI_THREAD); - else - FLCLR(p->flags, GA_CTX_MULTI_THREAD); - switch (sched) { case GA_CTX_SCHED_MULTI: case GA_CTX_SCHED_AUTO: case GA_CTX_SCHED_SINGLE: p->sched = sched; - return GA_NO_ERROR; + break; default: return error_fmt(global_err, GA_INVALID_ERROR, "Invalid value for sched: %d", sched); } + + if (sched == GA_CTX_SCHED_MULTI) + FLSET(p->flags, GA_CTX_MULTI_THREAD); + else + FLCLR(p->flags, GA_CTX_MULTI_THREAD); + + return GA_NO_ERROR; } int 
gpucontext_props_set_single_stream(gpucontext_props *p) { diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 86e7b4d2cb..cdbe9278b2 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -322,8 +322,11 @@ cuda_context *cuda_make_ctx(CUcontext ctx, gpucontext_props *p) { } res->errbuf->flags |= CUDA_MAPPED_PTR; /* Prime the cache */ - if (p->initial_cache_size) - cuda_alloc((gpucontext *)res, p->initial_cache_size, NULL, 0); + if (p->initial_cache_size) { + gpudata *tmp = cuda_alloc((gpucontext *)res, p->initial_cache_size, NULL, 0); + if (tmp != NULL) + cuda_free(tmp); + } return res; fail_end: cuMemFreeHost(pp); diff --git a/src/head.py b/src/head.py index 26cec125ea..04b1b5dad3 100644 --- a/src/head.py +++ b/src/head.py @@ -1,3 +1,5 @@ +# Used to generate the string tables to embed the cluda headers. + def wrt(f, n, b): f.write(b',') n += 1 From 2740c253032e09d94125b5970c1351ce422aad6d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 21 Aug 2017 14:47:39 -0400 Subject: [PATCH 470/597] Final compile fix. 
--- pygpu/gpuarray.pyx | 2 +- src/cluda_cuda.h.c | 930 ++++++++++++++++----------------- src/cluda_opencl.h.c | 1001 ++++++++++++++++++------------------ src/gpuarray_buffer_cuda.c | 2 +- 4 files changed, 971 insertions(+), 964 deletions(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 301cd2e058..44608e1e4f 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -642,7 +642,7 @@ def init(dev, sched='default', single_stream=False, kernel_cache_path=None, if kernel_cache_path: kernel_cache_path_b = _s(kernel_cache_path) gpucontext_props_kernel_cache(p, kernel_cache_path_b) - gpucontext_props_alloc_cache(p, max_cache_size, initial_cache_size) + gpucontext_props_alloc_cache(p, initial_cache_size, max_cache_size) if single_stream: gpucontext_props_set_single_stream(p); except: diff --git a/src/cluda_cuda.h.c b/src/cluda_cuda.h.c index 58a8f65d68..797cef438c 100644 --- a/src/cluda_cuda.h.c +++ b/src/cluda_cuda.h.c @@ -1,491 +1,495 @@ static const char cluda_cuda_h[] = { -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x63, -0x61, 0x6c, 0x5f, 0x62, 0x61, 0x72, 0x72, 0x69, 0x65, 0x72, 0x28, -0x29, 0x20, 0x5f, 0x5f, 0x73, 0x79, 0x6e, 0x63, 0x74, 0x68, 0x72, -0x65, 0x61, 0x64, 0x73, 0x28, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x57, 0x49, 0x54, 0x48, 0x49, 0x4e, 0x5f, -0x4b, 0x45, 0x52, 0x4e, 0x45, 0x4c, 0x20, 0x65, 0x78, 0x74, 0x65, -0x72, 0x6e, 0x20, 0x22, 0x43, 0x22, 0x20, 0x5f, 0x5f, 0x64, 0x65, -0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x4b, 0x45, 0x52, 0x4e, 0x45, 0x4c, 0x20, -0x65, 0x78, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x22, 0x43, 0x22, 0x20, -0x5f, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x5f, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x4c, 0x4f, -0x42, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x20, 0x2f, 0x2a, 0x20, -0x65, 0x6d, 0x70, 0x74, 0x79, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x4f, 0x43, 0x41, 
0x4c, -0x5f, 0x4d, 0x45, 0x4d, 0x20, 0x5f, 0x5f, 0x73, 0x68, 0x61, 0x72, -0x65, 0x64, 0x5f, 0x5f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, -0x5f, 0x41, 0x52, 0x47, 0x20, 0x2f, 0x2a, 0x20, 0x65, 0x6d, 0x70, -0x74, 0x79, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x4d, 0x41, 0x58, 0x46, 0x4c, 0x4f, 0x41, 0x54, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x33, 0x2e, 0x34, -0x30, 0x32, 0x38, 0x32, 0x33, 0x34, 0x36, 0x36, 0x45, 0x2b, 0x33, -0x38, 0x46, 0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, 0x66, 0x20, 0x4e, -0x41, 0x4e, 0x0a, 0x23, 0x75, 0x6e, 0x64, 0x65, 0x66, 0x20, 0x4e, -0x41, 0x4e, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4e, 0x41, 0x4e, 0x20, -0x5f, 0x5f, 0x69, 0x6e, 0x74, 0x5f, 0x61, 0x73, 0x5f, 0x66, 0x6c, -0x6f, 0x61, 0x74, 0x28, 0x30, 0x78, 0x37, 0x66, 0x66, 0x66, 0x66, -0x66, 0x66, 0x66, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x4e, 0x55, 0x4c, -0x4c, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, 0x66, -0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x59, 0x0a, 0x23, -0x75, 0x6e, 0x64, 0x65, 0x66, 0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, -0x49, 0x54, 0x59, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x49, 0x4e, 0x46, -0x49, 0x4e, 0x49, 0x54, 0x59, 0x20, 0x5f, 0x5f, 0x69, 0x6e, 0x74, -0x5f, 0x61, 0x73, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x30, -0x78, 0x37, 0x66, 0x38, 0x30, 0x30, 0x30, 0x30, 0x30, 0x29, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x48, 0x55, 0x47, -0x45, 0x5f, 0x56, 0x41, 0x4c, 0x46, 0x20, 0x49, 0x4e, 0x46, 0x49, -0x4e, 0x49, 0x54, 0x59, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x48, 0x55, 0x47, 0x45, 0x5f, 0x56, 0x41, 0x4c, 0x20, -0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, -0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x30, -0x78, 0x37, 0x66, 0x66, 0x30, 0x30, 0x30, 0x30, 
0x30, 0x30, 0x30, -0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x29, 0x0a, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x45, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x32, -0x2e, 0x37, 0x31, 0x38, 0x32, 0x38, 0x31, 0x38, 0x32, 0x38, 0x34, -0x35, 0x39, 0x30, 0x34, 0x35, 0x32, 0x33, 0x35, 0x34, 0x0a, 0x23, +0x23, 0x69, 0x66, 0x6e, 0x64, 0x65, 0x66, 0x20, 0x43, 0x4c, 0x55, +0x44, 0x41, 0x5f, 0x48, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x43, 0x4c, 0x55, 0x44, 0x41, 0x5f, 0x48, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x63, 0x61, +0x6c, 0x5f, 0x62, 0x61, 0x72, 0x72, 0x69, 0x65, 0x72, 0x28, 0x29, +0x20, 0x5f, 0x5f, 0x73, 0x79, 0x6e, 0x63, 0x74, 0x68, 0x72, 0x65, +0x61, 0x64, 0x73, 0x28, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x57, 0x49, 0x54, 0x48, 0x49, 0x4e, 0x5f, 0x4b, +0x45, 0x52, 0x4e, 0x45, 0x4c, 0x20, 0x65, 0x78, 0x74, 0x65, 0x72, +0x6e, 0x20, 0x22, 0x43, 0x22, 0x20, 0x5f, 0x5f, 0x64, 0x65, 0x76, +0x69, 0x63, 0x65, 0x5f, 0x5f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x4b, 0x45, 0x52, 0x4e, 0x45, 0x4c, 0x20, 0x65, +0x78, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x22, 0x43, 0x22, 0x20, 0x5f, +0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x5f, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x4c, 0x4f, 0x42, +0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x20, 0x2f, 0x2a, 0x20, 0x65, +0x6d, 0x70, 0x74, 0x79, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, +0x4d, 0x45, 0x4d, 0x20, 0x5f, 0x5f, 0x73, 0x68, 0x61, 0x72, 0x65, +0x64, 0x5f, 0x5f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x5f, +0x41, 0x52, 0x47, 0x20, 0x2f, 0x2a, 0x20, 0x65, 0x6d, 0x70, 0x74, +0x79, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x4d, 0x41, 0x58, 0x46, 0x4c, 0x4f, 0x41, 0x54, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x33, 0x2e, 0x34, 0x30, +0x32, 0x38, 0x32, 0x33, 0x34, 0x36, 0x36, 0x45, 0x2b, 0x33, 0x38, +0x46, 0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, 0x66, 0x20, 0x4e, 0x41, +0x4e, 0x0a, 0x23, 0x75, 0x6e, 0x64, 0x65, 0x66, 0x20, 0x4e, 0x41, +0x4e, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4e, 0x41, 0x4e, 0x20, 0x5f, +0x5f, 0x69, 0x6e, 0x74, 0x5f, 0x61, 0x73, 0x5f, 0x66, 0x6c, 0x6f, +0x61, 0x74, 0x28, 0x30, 0x78, 0x37, 0x66, 0x66, 0x66, 0x66, 0x66, +0x66, 0x66, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x4e, 0x55, 0x4c, 0x4c, +0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, 0x66, 0x20, +0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x59, 0x0a, 0x23, 0x75, +0x6e, 0x64, 0x65, 0x66, 0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, +0x54, 0x59, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x49, 0x4e, 0x46, 0x49, +0x4e, 0x49, 0x54, 0x59, 0x20, 0x5f, 0x5f, 0x69, 0x6e, 0x74, 0x5f, +0x61, 0x73, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x30, 0x78, +0x37, 0x66, 0x38, 0x30, 0x30, 0x30, 0x30, 0x30, 0x29, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x48, 0x55, 0x47, 0x45, +0x5f, 0x56, 0x41, 0x4c, 0x46, 0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, +0x49, 0x54, 0x59, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x48, 0x55, 0x47, 0x45, 0x5f, 0x56, 0x41, 0x4c, 0x20, 0x5f, +0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, +0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x30, 0x78, +0x37, 0x66, 0x66, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, +0x30, 0x30, 0x30, 0x30, 0x30, 0x29, 0x0a, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x45, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x32, 0x2e, +0x37, 0x31, 0x38, 0x32, 0x38, 0x31, 0x38, 0x32, 0x38, 0x34, 0x35, +0x39, 0x30, 0x34, 0x35, 0x32, 0x33, 0x35, 0x34, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x4c, 0x4f, 0x47, +0x32, 0x45, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, +0x2e, 0x34, 0x34, 0x32, 0x36, 0x39, 0x35, 0x30, 0x34, 0x30, 0x38, +0x38, 0x38, 0x39, 0x36, 0x33, 0x34, 0x30, 0x37, 0x34, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x4c, 0x4f, -0x47, 0x32, 0x45, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x31, 0x2e, 0x34, 0x34, 0x32, 0x36, 0x39, 0x35, 0x30, 0x34, 0x30, -0x38, 0x38, 0x38, 0x39, 0x36, 0x33, 0x34, 0x30, 0x37, 0x34, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x4c, -0x4f, 0x47, 0x31, 0x30, 0x45, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x30, 0x2e, 0x34, 0x33, 0x34, 0x32, 0x39, 0x34, 0x34, 0x38, -0x31, 0x39, 0x30, 0x33, 0x32, 0x35, 0x31, 0x38, 0x32, 0x37, 0x36, -0x35, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, -0x5f, 0x4c, 0x4e, 0x32, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x30, 0x2e, 0x36, 0x39, 0x33, 0x31, 0x34, 0x37, -0x31, 0x38, 0x30, 0x35, 0x35, 0x39, 0x39, 0x34, 0x35, 0x33, 0x30, -0x39, 0x34, 0x32, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x4d, 0x5f, 0x4c, 0x4e, 0x31, 0x30, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x32, 0x2e, 0x33, 0x30, 0x32, 0x35, -0x38, 0x35, 0x30, 0x39, 0x32, 0x39, 0x39, 0x34, 0x30, 0x34, 0x35, -0x36, 0x38, 0x34, 0x30, 0x32, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x50, 0x49, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x33, 0x2e, 0x31, 0x34, -0x31, 0x35, 0x39, 0x32, 0x36, 0x35, 0x33, 0x35, 0x38, 0x39, 0x37, -0x39, 0x33, 0x32, 0x33, 0x38, 0x34, 0x36, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x50, 0x49, 0x5f, 0x32, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x2e, -0x35, 0x37, 0x30, 0x37, 0x39, 0x36, 0x33, 0x32, 0x36, 0x37, 0x39, -0x34, 0x38, 0x39, 0x36, 0x36, 0x31, 0x39, 0x32, 0x33, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x50, 0x49, -0x5f, 0x34, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x30, 0x2e, 
0x37, 0x38, 0x35, 0x33, 0x39, 0x38, 0x31, 0x36, 0x33, -0x33, 0x39, 0x37, 0x34, 0x34, 0x38, 0x33, 0x30, 0x39, 0x36, 0x32, +0x47, 0x31, 0x30, 0x45, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x30, 0x2e, 0x34, 0x33, 0x34, 0x32, 0x39, 0x34, 0x34, 0x38, 0x31, +0x39, 0x30, 0x33, 0x32, 0x35, 0x31, 0x38, 0x32, 0x37, 0x36, 0x35, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, -0x31, 0x5f, 0x50, 0x49, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x30, 0x2e, 0x33, 0x31, 0x38, 0x33, 0x30, 0x39, 0x38, -0x38, 0x36, 0x31, 0x38, 0x33, 0x37, 0x39, 0x30, 0x36, 0x37, 0x31, -0x35, 0x34, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x4d, 0x5f, 0x32, 0x5f, 0x50, 0x49, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x30, 0x2e, 0x36, 0x33, 0x36, 0x36, 0x31, -0x39, 0x37, 0x37, 0x32, 0x33, 0x36, 0x37, 0x35, 0x38, 0x31, 0x33, -0x34, 0x33, 0x30, 0x38, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x4d, 0x5f, 0x32, 0x5f, 0x53, 0x51, 0x52, 0x54, 0x50, -0x49, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x2e, 0x31, 0x32, 0x38, -0x33, 0x37, 0x39, 0x31, 0x36, 0x37, 0x30, 0x39, 0x35, 0x35, 0x31, -0x32, 0x35, 0x37, 0x33, 0x39, 0x30, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x53, 0x51, 0x52, 0x54, 0x32, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x2e, 0x34, -0x31, 0x34, 0x32, 0x31, 0x33, 0x35, 0x36, 0x32, 0x33, 0x37, 0x33, -0x30, 0x39, 0x35, 0x30, 0x34, 0x38, 0x38, 0x30, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x53, 0x51, 0x52, -0x54, 0x31, 0x5f, 0x32, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, -0x2e, 0x37, 0x30, 0x37, 0x31, 0x30, 0x36, 0x37, 0x38, 0x31, 0x31, -0x38, 0x36, 0x35, 0x34, 0x37, 0x35, 0x32, 0x34, 0x34, 0x30, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, -0x5f, 0x30, 0x20, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x49, 0x64, -0x78, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x4c, 0x49, 0x44, 0x5f, 0x31, 0x20, 0x74, 0x68, 0x72, 0x65, 
-0x61, 0x64, 0x49, 0x64, 0x78, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, 0x32, 0x20, -0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x49, 0x64, 0x78, 0x2e, 0x7a, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, -0x49, 0x4d, 0x5f, 0x30, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x44, -0x69, 0x6d, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x62, 0x6c, -0x6f, 0x63, 0x6b, 0x44, 0x69, 0x6d, 0x2e, 0x79, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, -0x32, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x44, 0x69, 0x6d, 0x2e, -0x7a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, -0x49, 0x44, 0x5f, 0x30, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x49, -0x64, 0x78, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x31, 0x20, 0x62, 0x6c, 0x6f, -0x63, 0x6b, 0x49, 0x64, 0x78, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x32, 0x20, -0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x49, 0x64, 0x78, 0x2e, 0x7a, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, -0x4d, 0x5f, 0x30, 0x20, 0x67, 0x72, 0x69, 0x64, 0x44, 0x69, 0x6d, +0x4c, 0x4e, 0x32, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x30, 0x2e, 0x36, 0x39, 0x33, 0x31, 0x34, 0x37, 0x31, +0x38, 0x30, 0x35, 0x35, 0x39, 0x39, 0x34, 0x35, 0x33, 0x30, 0x39, +0x34, 0x32, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x4d, 0x5f, 0x4c, 0x4e, 0x31, 0x30, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x32, 0x2e, 0x33, 0x30, 0x32, 0x35, 0x38, +0x35, 0x30, 0x39, 0x32, 0x39, 0x39, 0x34, 0x30, 0x34, 0x35, 0x36, +0x38, 0x34, 0x30, 0x32, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x4d, 0x5f, 0x50, 0x49, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x33, 0x2e, 0x31, 0x34, 0x31, +0x35, 0x39, 0x32, 0x36, 0x35, 0x33, 0x35, 0x38, 0x39, 
0x37, 0x39, +0x33, 0x32, 0x33, 0x38, 0x34, 0x36, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x50, 0x49, 0x5f, 0x32, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x2e, 0x35, +0x37, 0x30, 0x37, 0x39, 0x36, 0x33, 0x32, 0x36, 0x37, 0x39, 0x34, +0x38, 0x39, 0x36, 0x36, 0x31, 0x39, 0x32, 0x33, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x50, 0x49, 0x5f, +0x34, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, +0x2e, 0x37, 0x38, 0x35, 0x33, 0x39, 0x38, 0x31, 0x36, 0x33, 0x33, +0x39, 0x37, 0x34, 0x34, 0x38, 0x33, 0x30, 0x39, 0x36, 0x32, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x31, +0x5f, 0x50, 0x49, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x30, 0x2e, 0x33, 0x31, 0x38, 0x33, 0x30, 0x39, 0x38, 0x38, +0x36, 0x31, 0x38, 0x33, 0x37, 0x39, 0x30, 0x36, 0x37, 0x31, 0x35, +0x34, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, +0x5f, 0x32, 0x5f, 0x50, 0x49, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x30, 0x2e, 0x36, 0x33, 0x36, 0x36, 0x31, 0x39, +0x37, 0x37, 0x32, 0x33, 0x36, 0x37, 0x35, 0x38, 0x31, 0x33, 0x34, +0x33, 0x30, 0x38, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x4d, 0x5f, 0x32, 0x5f, 0x53, 0x51, 0x52, 0x54, 0x50, 0x49, +0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x2e, 0x31, 0x32, 0x38, 0x33, +0x37, 0x39, 0x31, 0x36, 0x37, 0x30, 0x39, 0x35, 0x35, 0x31, 0x32, +0x35, 0x37, 0x33, 0x39, 0x30, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x53, 0x51, 0x52, 0x54, 0x32, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x31, 0x2e, 0x34, 0x31, +0x34, 0x32, 0x31, 0x33, 0x35, 0x36, 0x32, 0x33, 0x37, 0x33, 0x30, +0x39, 0x35, 0x30, 0x34, 0x38, 0x38, 0x30, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x4d, 0x5f, 0x53, 0x51, 0x52, 0x54, +0x31, 0x5f, 0x32, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x2e, +0x37, 0x30, 0x37, 0x31, 0x30, 0x36, 0x37, 0x38, 0x31, 0x31, 0x38, +0x36, 0x35, 0x34, 0x37, 0x35, 0x32, 0x34, 
0x34, 0x30, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, +0x30, 0x20, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x49, 0x64, 0x78, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x47, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x67, 0x72, 0x69, 0x64, -0x44, 0x69, 0x6d, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 0x67, -0x72, 0x69, 0x64, 0x44, 0x69, 0x6d, 0x2e, 0x7a, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x6f, -0x6f, 0x6c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, -0x20, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x20, -0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x63, 0x68, 0x61, 0x72, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, -0x5f, 0x75, 0x62, 0x79, 0x74, 0x65, 0x20, 0x75, 0x6e, 0x73, 0x69, -0x67, 0x6e, 0x65, 0x64, 0x20, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, -0x68, 0x6f, 0x72, 0x74, 0x20, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, -0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x75, 0x6e, 0x73, 0x69, -0x67, 0x6e, 0x65, 0x64, 0x20, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, +0x4c, 0x49, 0x44, 0x5f, 0x31, 0x20, 0x74, 0x68, 0x72, 0x65, 0x61, +0x64, 0x49, 0x64, 0x78, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, 0x32, 0x20, 0x74, +0x68, 0x72, 0x65, 0x61, 0x64, 0x49, 0x64, 0x78, 0x2e, 0x7a, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x44, 0x49, +0x4d, 0x5f, 0x30, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x44, 0x69, +0x6d, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x62, 0x6c, 0x6f, +0x63, 0x6b, 0x44, 0x69, 0x6d, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 
0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x32, +0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x44, 0x69, 0x6d, 0x2e, 0x7a, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, +0x44, 0x5f, 0x30, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x49, 0x64, +0x78, 0x2e, 0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x47, 0x49, 0x44, 0x5f, 0x31, 0x20, 0x62, 0x6c, 0x6f, 0x63, +0x6b, 0x49, 0x64, 0x78, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x32, 0x20, 0x62, +0x6c, 0x6f, 0x63, 0x6b, 0x49, 0x64, 0x78, 0x2e, 0x7a, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, +0x5f, 0x30, 0x20, 0x67, 0x72, 0x69, 0x64, 0x44, 0x69, 0x6d, 0x2e, +0x78, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, +0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x67, 0x72, 0x69, 0x64, 0x44, +0x69, 0x6d, 0x2e, 0x79, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 0x67, 0x72, +0x69, 0x64, 0x44, 0x69, 0x6d, 0x2e, 0x7a, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x6f, 0x6f, +0x6c, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, +0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x20, 0x73, +0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, -0x69, 0x6e, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, -0x74, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, -0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, -0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, -0x6e, 0x67, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, -0x20, 0x6c, 0x6f, 
0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, +0x75, 0x62, 0x79, 0x74, 0x65, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, +0x6e, 0x65, 0x64, 0x20, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x68, +0x6f, 0x72, 0x74, 0x20, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, +0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, +0x6e, 0x65, 0x64, 0x20, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x69, +0x6e, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, +0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x69, +0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x66, +0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, -0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, -0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x64, 0x6f, 0x75, -0x62, 0x6c, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x73, 0x69, -0x7a, 0x65, 0x5f, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x73, 0x69, 0x7a, 0x65, 0x20, -0x70, 0x74, 0x72, 0x64, 0x69, 0x66, 0x66, 0x5f, 0x74, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, -0x45, 
0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, -0x50, 0x41, 0x52, 0x41, 0x4d, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, -0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, -0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x42, 0x4f, 0x44, -0x59, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, -0x65, 0x29, 0x20, 0x65, 0x78, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x5f, -0x5f, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x5f, 0x5f, 0x20, 0x74, -0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x5b, 0x5d, 0x3b, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, -0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x20, -0x77, 0x61, 0x72, 0x70, 0x53, 0x69, 0x7a, 0x65, 0x0a, 0x0a, 0x73, -0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, -0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, -0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, -0x6c, 0x6f, 0x61, 0x74, 0x28, 0x70, 0x29, 0x20, 0x5f, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x28, -0x70, 0x29, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x0a, 0x5f, 0x5f, -0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x73, 0x74, -0x61, 0x74, 0x69, 0x63, 0x20, 0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x67, 0x61, -0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, -0x28, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x29, 0x20, 0x7b, -0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, -0x72, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x2e, 0x64, 0x61, 0x74, 0x61, -0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, -0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, 0x6e, 0x28, 0x66, 0x29, 0x3b, -0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 
0x72, -0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, -0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, -0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, -0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, -0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, +0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x64, 0x6f, 0x75, 0x62, +0x6c, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x73, 0x69, 0x7a, +0x65, 0x5f, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x67, 0x61, 0x5f, 0x73, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x70, +0x74, 0x72, 0x64, 0x69, 0x66, 0x66, 0x5f, 0x74, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, +0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x50, +0x41, 0x52, 0x41, 0x4d, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, +0x6e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, +0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x42, 
0x4f, 0x44, 0x59, +0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, +0x29, 0x20, 0x65, 0x78, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x5f, 0x5f, +0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x5f, 0x5f, 0x20, 0x74, 0x79, +0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x5b, 0x5d, 0x3b, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, +0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x20, 0x77, +0x61, 0x72, 0x70, 0x53, 0x69, 0x7a, 0x65, 0x0a, 0x0a, 0x73, 0x74, +0x72, 0x75, 0x63, 0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, +0x68, 0x6f, 0x72, 0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, +0x7d, 0x3b, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, +0x6f, 0x61, 0x74, 0x28, 0x70, 0x29, 0x20, 0x5f, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x28, 0x70, +0x29, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x0a, 0x5f, 0x5f, 0x64, +0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x73, 0x74, 0x61, +0x74, 0x69, 0x63, 0x20, 0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x20, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x67, 0x61, 0x5f, +0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x28, +0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x29, 0x20, 0x7b, 0x0a, +0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x72, +0x3b, 0x0a, 0x20, 0x20, 0x72, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, +0x3d, 0x20, 0x5f, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, +0x61, 0x6c, 0x66, 0x5f, 0x72, 0x6e, 0x28, 0x66, 0x29, 0x3b, 0x0a, +0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, 0x3b, +0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 
0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x67, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x6c, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, -0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, -0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, -0x67, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, -0x6c, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, -0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x6c, -0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, -0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, -0x77, 0x61, 0x64, 0x64, 0x72, 0x20, 0x3d, 0x20, 0x28, 0x75, 0x6e, -0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, -0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, -0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, -0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 
0x67, 0x20, 0x6c, 0x6f, 0x6e, -0x67, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x77, 0x61, -0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, -0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, -0x6f, 0x6e, 0x67, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, -0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, -0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, -0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x43, 0x41, 0x53, 0x28, 0x77, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, -0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x28, 0x76, -0x61, 0x6c, 0x20, 0x2b, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, -0x6e, 0x67, 0x29, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, -0x29, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, -0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, -0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, -0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x28, 0x67, -0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x6f, 0x6c, 0x64, 0x3b, -0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x6c, +0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, +0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, +0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, -0x5f, 0x5f, 0x20, 0x67, 
0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, +0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x67, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x6c, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x2a, 0x2f, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, +0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, -0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x72, 0x65, -0x73, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x73, 0x20, 0x3d, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, -0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, -0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, -0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, -0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x28, -0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x72, 0x65, 0x73, -0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, -0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 
0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, -0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, -0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, -0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, -0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 
0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, -0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x2f, 0x0a, -0x23, 0x69, 0x66, 0x20, 0x5f, 0x5f, 0x43, 0x55, 0x44, 0x41, 0x5f, -0x41, 0x52, 0x43, 0x48, 0x5f, 0x5f, 0x20, 0x3c, 0x20, 0x36, 0x30, -0x30, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, -0x5f, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, -0x67, 0x28, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, -0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, -0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, -0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, -0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, -0x67, 0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, 0x20, 0x3d, 0x20, -0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, -0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, -0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, -0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, -0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, -0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, -0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x73, 0x73, 0x75, -0x6d, 0x65, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, -0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 
0x6f, -0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x77, 0x61, 0x64, 0x64, -0x72, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, -0x20, 0x5f, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x5f, 0x61, -0x73, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x28, -0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x6e, -0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, 0x5f, 0x64, 0x6f, -0x75, 0x62, 0x6c, 0x65, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, -0x64, 0x29, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, -0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, -0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, -0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x5f, -0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, -0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x6f, 0x6c, -0x64, 0x29, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, 0x6c, 0x73, 0x65, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, -0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, 0x6e, -0x64, 0x69, 0x66, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, -0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, -0x6c, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x64, 0x67, 0x28, 0x67, 0x61, 0x5f, 
0x64, 0x6f, 0x75, -0x62, 0x6c, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, -0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x76, -0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, -0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x72, 0x65, 0x73, 0x3b, 0x0a, 0x20, -0x20, 0x72, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x28, 0x75, 0x6e, 0x73, +0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x77, +0x61, 0x64, 0x64, 0x72, 0x20, 0x3d, 0x20, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, -0x2c, 0x20, 0x5f, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x5f, -0x61, 0x73, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, -0x28, 0x76, 0x61, 0x6c, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, -0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x6e, -0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, 0x5f, 0x64, 0x6f, -0x75, 0x62, 0x6c, 0x65, 0x28, 0x72, 0x65, 0x73, 0x29, 0x3b, 0x0a, +0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, +0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x77, 0x61, 0x64, +0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, +0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, +0x6e, 0x67, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x3b, +0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, +0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, +0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, +0x41, 0x53, 0x28, 0x77, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, +0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x28, 0x76, 0x61, +0x6c, 0x20, 0x2b, 0x20, 0x28, 0x67, 
0x61, 0x5f, 0x6c, 0x6f, 0x6e, +0x67, 0x29, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x29, +0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, +0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, +0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, +0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x28, 0x67, 0x61, +0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, 0x5f, 0x64, 0x65, -0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, -0x64, 0x5f, 0x65, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, -0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, -0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x67, -0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, -0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, -0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, -0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, 0x64, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, +0x5f, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, +0x28, 0x67, 0x61, 0x5f, 
0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x61, +0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, +0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, +0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x72, 0x65, 0x73, +0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x28, +0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, +0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, +0x64, 0x64, 0x72, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x0a, +0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x28, 0x67, +0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x72, 0x65, 0x73, 0x3b, +0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, +0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, +0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 
0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, +0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, +0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x2f, 0x0a, 0x23, +0x69, 0x66, 0x20, 0x5f, 0x5f, 0x43, 0x55, 0x44, 0x41, 0x5f, 0x41, +0x52, 0x43, 0x48, 0x5f, 0x5f, 0x20, 0x3c, 0x20, 0x36, 0x30, 0x30, +0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, +0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, +0x28, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, +0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, +0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 
+0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, +0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, 0x20, 0x3d, 0x20, 0x28, +0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, +0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, +0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, +0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, +0x6f, 0x6e, 0x67, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, +0x77, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, +0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, +0x65, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, +0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x77, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, -0x73, 0x75, 0x6d, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, -0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74, -0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, -0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, -0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, -0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, -0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, -0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, -0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, -0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, -0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, -0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 
0x0a, 0x20, -0x20, 0x20, 0x20, 0x73, 0x75, 0x6d, 0x20, 0x3d, 0x20, 0x5f, 0x5f, -0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x5f, -0x72, 0x6e, 0x28, 0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, -0x6c, 0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, 0x2e, 0x64, 0x61, -0x74, 0x61, 0x29, 0x20, 0x2b, 0x20, 0x5f, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x74, 0x6d, 0x70, -0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, -0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, -0x6c, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x28, 0x28, -0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, -0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, -0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, -0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, -0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, -0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, -0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, -0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, -0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, -0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, -0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, -0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, -0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, -0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, -0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, -0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, -0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, -0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x6c, 
0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x0a, -0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, -0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x28, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, -0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, -0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, -0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, -0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, -0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, -0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, -0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, -0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, -0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, -0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74, -0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, -0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, -0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, -0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, -0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, -0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x2e, -0x64, 0x61, 0x74, 0x61, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, -0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, -0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, -0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29, -0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, -0x62, 0x61, 0x73, 0x65, 0x2c, 
0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, -0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, -0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, -0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, -0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70, -0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, +0x5f, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x5f, 0x61, 0x73, +0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x28, 0x76, +0x61, 0x6c, 0x20, 0x2b, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, +0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, +0x62, 0x6c, 0x65, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, +0x29, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, +0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, +0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, +0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x5f, 0x5f, +0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, +0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x6f, 0x6c, 0x64, +0x29, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, +0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, 0x6c, 0x73, 0x65, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, +0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, +0x69, 0x66, 0x0a, 
0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, +0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x64, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, +0x6c, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, +0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x76, 0x61, +0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, +0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, +0x6f, 0x6e, 0x67, 0x20, 0x72, 0x65, 0x73, 0x3b, 0x0a, 0x20, 0x20, +0x72, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x28, 0x75, 0x6e, 0x73, 0x69, +0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, +0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, +0x20, 0x5f, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x5f, 0x61, +0x73, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x28, +0x76, 0x61, 0x6c, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, +0x74, 0x75, 0x72, 0x6e, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, +0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, +0x62, 0x6c, 0x65, 0x28, 0x72, 0x65, 0x73, 0x29, 0x3b, 0x0a, 0x7d, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, +0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x5f, 0x65, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, +0x0a, 
0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, +0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, +0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, +0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, +0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, +0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, +0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x73, +0x75, 0x6d, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, 0x20, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74, 0x6d, +0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, +0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, +0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, +0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, +0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, +0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, +0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, +0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, +0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, +0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x73, 0x75, 0x6d, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x66, +0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, +0x6e, 0x28, 0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, +0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, 0x2e, 0x64, 0x61, 0x74, +0x61, 0x29, 0x20, 0x2b, 0x20, 0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x74, 0x6d, 0x70, 0x2e, +0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, -0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 
0x5f, -0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, -0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, -0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, -0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, -0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, -0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x00}; +0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x28, 0x28, 0x67, +0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, +0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x35, +0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, 0x35, +0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, +0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, +0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x73, +0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x29, +0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, +0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, +0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x74, +0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, +0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, +0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, +0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, +0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, +0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, +0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, +0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 
0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, +0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x0a, 0x5f, +0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x28, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, +0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, +0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, +0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, +0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, +0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, +0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, +0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, +0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, +0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, 0x20, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74, 0x6d, +0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, +0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, +0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, +0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, +0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, +0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x2e, 0x64, +0x61, 0x74, 0x61, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, +0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, +0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, +0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, +0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, 
0x73, 0x75, 0x6d, 0x65, +0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, +0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, +0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, +0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, +0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, +0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, +0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, +0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, +0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, +0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, +0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, 0x6e, +0x64, 0x69, 0x66, 0x0a, 0x00}; diff --git a/src/cluda_opencl.h.c b/src/cluda_opencl.h.c index 9a184d94be..54d2b4eff4 100644 --- a/src/cluda_opencl.h.c +++ b/src/cluda_opencl.h.c @@ -1,747 +1,750 @@ static const char cluda_opencl_h[] = { -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x63, -0x61, 0x6c, 0x5f, 0x62, 0x61, 0x72, 0x72, 0x69, 0x65, 0x72, 0x28, -0x29, 0x20, 0x62, 0x61, 0x72, 0x72, 0x69, 0x65, 0x72, 0x28, 0x43, -0x4c, 0x4b, 0x5f, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, 0x45, -0x4d, 0x5f, 0x46, 0x45, 0x4e, 0x43, 0x45, 0x29, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x57, 0x49, 0x54, 0x48, 0x49, -0x4e, 0x5f, 0x4b, 0x45, 0x52, 0x4e, 0x45, 0x4c, 0x20, 0x2f, 0x2a, -0x20, 0x65, 0x6d, 0x70, 0x74, 0x79, 0x20, 0x2a, 0x2f, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4b, 0x45, 0x52, 0x4e, -0x45, 0x4c, 0x20, 0x5f, 0x5f, 0x6b, 0x65, 0x72, 0x6e, 0x65, 
0x6c, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x4c, -0x4f, 0x42, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x20, 0x5f, 0x5f, -0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, -0x45, 0x4d, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x4f, 0x43, -0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x5f, 0x41, 0x52, 0x47, 0x20, -0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x0a, 0x2f, 0x2a, 0x20, -0x4e, 0x41, 0x4e, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x6e, -0x64, 0x65, 0x66, 0x20, 0x4e, 0x55, 0x4c, 0x4c, 0x0a, 0x20, 0x20, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4e, 0x55, 0x4c, -0x4c, 0x20, 0x28, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x2a, 0x29, 0x30, -0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x2f, 0x2a, -0x20, 0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x59, 0x20, 0x2a, -0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, -0x49, 0x44, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, -0x63, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, -0x31, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, -0x5f, 0x69, 0x64, 0x28, 0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, 0x32, 0x20, 0x67, -0x65, 0x74, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x69, 0x64, -0x28, 0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, -0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x69, 0x7a, 0x65, -0x28, 0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x67, 0x65, 0x74, -0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x69, 0x7a, 0x65, -0x28, 0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 
0x67, 0x65, 0x74, -0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x69, 0x7a, 0x65, -0x28, 0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x47, 0x49, 0x44, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, 0x5f, -0x67, 0x72, 0x6f, 0x75, 0x70, 0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, -0x44, 0x5f, 0x31, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x72, 0x6f, -0x75, 0x70, 0x5f, 0x69, 0x64, 0x28, 0x31, 0x29, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x32, -0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x5f, -0x69, 0x64, 0x28, 0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x30, 0x20, 0x67, -0x65, 0x74, 0x5f, 0x6e, 0x75, 0x6d, 0x5f, 0x67, 0x72, 0x6f, 0x75, -0x70, 0x73, 0x28, 0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x67, -0x65, 0x74, 0x5f, 0x6e, 0x75, 0x6d, 0x5f, 0x67, 0x72, 0x6f, 0x75, -0x70, 0x73, 0x28, 0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 0x67, -0x65, 0x74, 0x5f, 0x6e, 0x75, 0x6d, 0x5f, 0x67, 0x72, 0x6f, 0x75, -0x70, 0x73, 0x28, 0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x6f, 0x6f, 0x6c, 0x20, -0x75, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x20, +0x23, 0x69, 0x66, 0x6e, 0x64, 0x65, 0x66, 0x20, 0x43, 0x4c, 0x55, +0x44, 0x41, 0x5f, 0x48, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x43, 0x4c, 0x55, 0x44, 0x41, 0x5f, 0x48, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x6c, 0x6f, 0x63, 0x61, +0x6c, 0x5f, 0x62, 0x61, 0x72, 0x72, 0x69, 0x65, 0x72, 0x28, 0x29, +0x20, 0x62, 0x61, 0x72, 0x72, 0x69, 0x65, 0x72, 0x28, 0x43, 0x4c, +0x4b, 0x5f, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, +0x5f, 0x46, 0x45, 0x4e, 0x43, 0x45, 
0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x57, 0x49, 0x54, 0x48, 0x49, 0x4e, +0x5f, 0x4b, 0x45, 0x52, 0x4e, 0x45, 0x4c, 0x20, 0x2f, 0x2a, 0x20, +0x65, 0x6d, 0x70, 0x74, 0x79, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4b, 0x45, 0x52, 0x4e, 0x45, +0x4c, 0x20, 0x5f, 0x5f, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x4c, 0x4f, +0x42, 0x41, 0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x20, 0x5f, 0x5f, 0x67, +0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x4c, 0x4f, 0x43, 0x41, 0x4c, 0x5f, 0x4d, 0x45, +0x4d, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x4f, 0x43, 0x41, +0x4c, 0x5f, 0x4d, 0x45, 0x4d, 0x5f, 0x41, 0x52, 0x47, 0x20, 0x5f, +0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x0a, 0x2f, 0x2a, 0x20, 0x4e, +0x41, 0x4e, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x6e, 0x64, +0x65, 0x66, 0x20, 0x4e, 0x55, 0x4c, 0x4c, 0x0a, 0x20, 0x20, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4e, 0x55, 0x4c, 0x4c, +0x20, 0x28, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x2a, 0x29, 0x30, 0x29, +0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x2f, 0x2a, 0x20, +0x49, 0x4e, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x59, 0x20, 0x2a, 0x2f, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, +0x44, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, 0x63, +0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, 0x31, +0x20, 0x67, 0x65, 0x74, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, +0x69, 0x64, 0x28, 0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x4c, 0x49, 0x44, 0x5f, 0x32, 0x20, 0x67, 0x65, +0x74, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, +0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, 0x5f, +0x6c, 0x6f, 0x63, 0x61, 
0x6c, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x28, +0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x67, 0x65, 0x74, 0x5f, +0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x28, +0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x4c, 0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 0x67, 0x65, 0x74, 0x5f, +0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x28, +0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x47, 0x49, 0x44, 0x5f, 0x30, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, +0x72, 0x6f, 0x75, 0x70, 0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, +0x5f, 0x31, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x72, 0x6f, 0x75, +0x70, 0x5f, 0x69, 0x64, 0x28, 0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x49, 0x44, 0x5f, 0x32, 0x20, +0x67, 0x65, 0x74, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x5f, 0x69, +0x64, 0x28, 0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x30, 0x20, 0x67, 0x65, +0x74, 0x5f, 0x6e, 0x75, 0x6d, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, +0x73, 0x28, 0x30, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x31, 0x20, 0x67, 0x65, +0x74, 0x5f, 0x6e, 0x75, 0x6d, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, +0x73, 0x28, 0x31, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x47, 0x44, 0x49, 0x4d, 0x5f, 0x32, 0x20, 0x67, 0x65, +0x74, 0x5f, 0x6e, 0x75, 0x6d, 0x5f, 0x67, 0x72, 0x6f, 0x75, 0x70, +0x73, 0x28, 0x32, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x6f, 0x6f, 0x6c, 0x20, 0x75, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x62, 0x79, 0x74, 0x65, 0x20, -0x75, 0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x68, 0x6f, 0x72, 0x74, -0x20, 0x73, 
0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, -0x72, 0x74, 0x20, 0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x69, -0x6e, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, -0x20, 0x75, 0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, +0x65, 0x20, 0x67, 0x61, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x20, 0x63, +0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x67, 0x61, 0x5f, 0x75, 0x62, 0x79, 0x74, 0x65, 0x20, 0x75, +0x63, 0x68, 0x61, 0x72, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, +0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, 0x72, +0x74, 0x20, 0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, +0x74, 0x20, 0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, +0x75, 0x69, 0x6e, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, +0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, -0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, -0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, -0x6c, 0x65, 0x20, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x0a, 0x23, 
-0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, -0x69, 0x7a, 0x65, 0x20, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, -0x73, 0x69, 0x7a, 0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, -0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, -0x50, 0x41, 0x52, 0x41, 0x4d, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, -0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x2c, 0x20, 0x5f, 0x5f, -0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, -0x2a, 0x6e, 0x61, 0x6d, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, -0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x42, 0x4f, 0x44, 0x59, -0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, -0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, -0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, -0x20, 0x5f, 0x5f, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, -0x53, 0x49, 0x5a, 0x45, 0x0a, 0x0a, 0x74, 0x79, 0x70, 0x65, 0x64, -0x65, 0x66, 0x20, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x5f, -0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, -0x20, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, -0x0a, 0x7d, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x3b, -0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, -0x74, 0x28, 0x70, 0x29, 0x20, 0x76, 0x6c, 0x6f, 0x61, 0x64, 0x5f, -0x68, 0x61, 0x6c, 0x66, 0x28, 0x30, 0x2c, 0x20, 0x26, 0x28, 0x28, -0x70, 0x29, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x0a, 0x73, -0x74, 0x61, 0x74, 0x69, 0x63, 0x20, 0x69, 0x6e, 0x6c, 0x69, 0x6e, -0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x67, -0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, -0x66, 0x28, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 
0x74, 0x20, -0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x76, 0x73, -0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, -0x74, 0x6e, 0x28, 0x66, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x26, 0x72, -0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, -0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, 0x3b, 0x0a, 0x7d, 0x0a, -0x0a, 0x23, 0x70, 0x72, 0x61, 0x67, 0x6d, 0x61, 0x20, 0x4f, 0x50, -0x45, 0x4e, 0x43, 0x4c, 0x5f, 0x45, 0x58, 0x54, 0x45, 0x4e, 0x53, -0x49, 0x4f, 0x4e, 0x20, 0x63, 0x6c, 0x5f, 0x6b, 0x68, 0x72, 0x5f, -0x69, 0x6e, 0x74, 0x36, 0x34, 0x5f, 0x62, 0x61, 0x73, 0x65, 0x5f, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x73, 0x3a, 0x20, 0x65, 0x6e, -0x61, 0x62, 0x6c, 0x65, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, -0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, -0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, -0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, -0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, -0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, -0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, -0x70, 0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, -0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, -0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, -0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, -0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, -0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, -0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, -0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x65, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, +0x66, 0x6c, 0x6f, 0x61, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, +0x65, 0x20, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x69, +0x7a, 0x65, 0x20, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x73, +0x69, 0x7a, 0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, +0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x50, +0x41, 0x52, 0x41, 0x4d, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, +0x6e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x2c, 0x20, 0x5f, 0x5f, 0x6c, +0x6f, 0x63, 0x61, 0x6c, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, +0x6e, 0x61, 0x6d, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, +0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x42, 0x4f, 0x44, 0x59, 0x28, +0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, +0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x20, +0x5f, 0x5f, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, +0x49, 0x5a, 0x45, 0x0a, 0x0a, 0x74, 0x79, 0x70, 0x65, 0x64, 0x65, +0x66, 0x20, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x5f, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, +0x68, 0x61, 0x6c, 0x66, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, +0x7d, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x3b, 0x0a, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, +0x28, 0x70, 0x29, 0x20, 0x76, 
0x6c, 0x6f, 0x61, 0x64, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x28, 0x30, 0x2c, 0x20, 0x26, 0x28, 0x28, 0x70, +0x29, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x0a, 0x73, 0x74, +0x61, 0x74, 0x69, 0x63, 0x20, 0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x67, 0x61, +0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, +0x28, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, +0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x76, 0x73, 0x74, +0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, 0x74, +0x6e, 0x28, 0x66, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x26, 0x72, 0x2e, +0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, +0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, +0x23, 0x70, 0x72, 0x61, 0x67, 0x6d, 0x61, 0x20, 0x4f, 0x50, 0x45, +0x4e, 0x43, 0x4c, 0x5f, 0x45, 0x58, 0x54, 0x45, 0x4e, 0x53, 0x49, +0x4f, 0x4e, 0x20, 0x63, 0x6c, 0x5f, 0x6b, 0x68, 0x72, 0x5f, 0x69, +0x6e, 0x74, 0x36, 0x34, 0x5f, 0x62, 0x61, 0x73, 0x65, 0x5f, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x73, 0x3a, 0x20, 0x65, 0x6e, 0x61, +0x62, 0x6c, 0x65, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, +0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x61, +0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, +0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, +0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, +0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, +0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, +0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, +0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, +0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, +0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, -0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, +0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x69, 0x6e, 0x74, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, +0x6e, 0x74, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, +0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x69, 0x6e, 0x74, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x2a, -0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x2a, 0x61, +0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, -0x20, 0x3d, 0x20, 0x70, 0x2e, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x20, +0x3d, 0x20, 0x70, 0x2e, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x61, 0x20, -0x2b, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x2b, +0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, -0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, -0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, -0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x2c, -0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, -0x28, 0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x29, 0x3b, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 
0x63, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, +0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, +0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, +0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x2c, 0x20, +0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, +0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, -0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, +0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, -0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, -0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, -0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, +0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, +0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, +0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, -0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, -0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, -0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61, 0x72, 0x67, -0x74, 0x79, 0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, -0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, -0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, -0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, -0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, -0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, -0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, +0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, +0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, +0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, +0x79, 0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, +0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, +0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, +0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, +0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, +0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 
0x6e, +0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, +0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x77, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 
-0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x3b, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, +0x2a, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, -0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, -0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x77, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, -0x61, 0x20, 0x2b, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x61, +0x20, 0x2b, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, -0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, -0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x6c, 0x6f, 0x6e, -0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, -0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, -0x65, 0x20, 0x28, 0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, 0x61, -0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, +0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, +0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x2c, +0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, +0x20, 0x28, 0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, 0x61, 
0x29, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, -0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, +0x6e, 0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, -0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, -0x79, 0x70, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, -0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, -0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, -0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, -0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61, 0x72, 0x67, -0x74, 0x79, 0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, -0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, -0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, -0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, -0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 
0x76, 0x61, -0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, +0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, +0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, +0x70, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, +0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, +0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, +0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, +0x79, 0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, +0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, +0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, +0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, +0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, +0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, -0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 
0x0a, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, +0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x77, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x76, 0x61, -0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x28, 0x76, 0x6f, -0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, -0x63, 0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, -0x64, 0x64, 0x72, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x70, -0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, +0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, +0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, +0x64, 0x72, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x70, 0x2e, +0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x7d, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x69, -0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x5c, 0x0a, 0x20, 0x20, +0x7d, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, +0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, -0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, +0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, -0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, -0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, -0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x2c, 0x20, 0x62, 0x29, 0x0a, 
0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, +0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, -0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 
0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, +0x67, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, +0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, -0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x65, 0x20, 
0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, +0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, +0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, +0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, -0x20, 0x2a, 0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, -0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x2c, 0x20, 0x67, -0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2c, 0x20, 0x67, 0x6c, -0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, -0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x2c, -0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2c, 0x20, 
-0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, +0x2a, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, +0x2a, 0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, +0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x2c, 0x20, 0x67, 0x61, +0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2c, 0x20, 0x67, 0x6c, 0x6f, +0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, +0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x2c, 0x20, +0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2c, 0x20, 0x6c, +0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, -0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, -0x2a, 0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 
0x6f, 0x6d, -0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, -0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x67, 0x6c, -0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, -0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x2c, -0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, -0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, -0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, -0x62, 0x6c, 0x65, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, -0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, -0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, 0x2c, 0x20, 0x67, -0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x6c, -0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, -0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, -0x6d, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, +0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, +0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, +0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, +0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x67, 0x6c, 0x6f, +0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, +0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 
0x64, 0x6c, 0x2c, 0x20, +0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, +0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, +0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, +0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, +0x6c, 0x65, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, +0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, 0x2c, 0x20, 0x67, 0x61, +0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x6c, 0x6f, +0x63, 0x61, 0x6c, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, +0x6f, 0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, +0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, -0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, -0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, -0x61, 0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, -0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, -0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, +0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, +0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, +0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x76, 
0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, +0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, +0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, +0x69, 0x64, 0x78, 0x20, 0x3d, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, +0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, +0x20, 0x32, 0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, +0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, +0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, -0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, -0x20, 0x69, 0x64, 0x78, 0x20, 0x3d, 0x20, 0x28, 0x28, 0x67, 0x61, +0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, -0x26, 0x20, 0x32, 
0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, 0x3b, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, -0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, -0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, 0x6c, -0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, -0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, -0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, -0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, 0x32, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, 0x32, 0x5d, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, 0x20, 0x61, -0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, 0x20, 0x61, 0x2c, +0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6f, 0x3b, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6f, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, -0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x76, 0x61, 0x6c, 0x3b, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, +0x6f, 0x61, 0x74, 0x20, 0x66, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x76, 0x61, 0x6c, -0x20, 0x3d, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, -0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, 0x29, 0x3b, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x76, 0x61, 0x6c, 0x20, +0x3d, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, +0x6c, 0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x2a, -0x62, 0x61, 0x73, 0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x2a, 0x62, +0x61, 0x73, 0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x61, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, +0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x20, 0x3d, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, +0x6f, 0x61, 0x74, 0x28, 0x6f, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, +0x5d, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x20, -0x3d, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, -0x6c, 0x6f, 0x61, 0x74, 0x28, 0x6f, 0x2e, 0x68, 0x5b, 0x69, 0x64, -0x78, 0x5d, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, +0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x69, 0x20, 0x3d, 0x20, -0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x20, 0x3d, +0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, +0x61, 0x6c, 0x66, 0x28, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, +0x66, 0x6f, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x20, -0x3d, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, -0x68, 0x61, 0x6c, 0x66, 0x28, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x2b, -0x20, 0x66, 0x6f, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, +0x69, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, +0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x62, 0x61, 0x73, +0x65, 0x2c, 0x20, 0x61, 0x2e, 0x69, 0x2c, 0x20, 0x6e, 0x2e, 0x69, +0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, -0x2e, 0x69, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x62, 0x61, -0x73, 0x65, 0x2c, 0x20, 0x61, 0x2e, 0x69, 0x2c, 0x20, 0x6e, 0x2e, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, +0x20, 0x28, 0x6f, 0x2e, 0x69, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, -0x65, 0x20, 
0x28, 0x6f, 0x2e, 0x69, 0x20, 0x21, 0x3d, 0x20, 0x61, -0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x68, +0x5b, 0x69, 0x64, 0x78, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, -0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, +0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, +0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, +0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, -0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, -0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, +0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, +0x70, 0x61, 0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, +0x66, 
0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, -0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, -0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, -0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, -0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, -0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, +0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, +0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, +0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, +0x74, 0x20, 0x69, 0x64, 0x78, 0x20, 0x3d, 0x20, 0x28, 0x28, 0x67, +0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, +0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, -0x6e, 0x74, 0x20, 0x69, 0x64, 0x78, 0x20, 0x3d, 0x20, 0x28, 
0x28, -0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, -0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, -0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, -0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, -0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, -0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, -0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, -0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, -0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, +0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, +0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, +0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, +0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, +0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, +0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, -0x32, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, 0x32, +0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, -0x20, 0x61, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, 0x20, +0x61, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x2a, -0x62, 0x61, 0x73, 0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 
0x2a, 0x62, +0x61, 0x73, 0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x61, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, +0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x69, -0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x69, 0x20, +0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, -0x78, 0x5d, 0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, +0x5d, 0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, -0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x2e, 0x69, 0x2c, -0x20, 0x6e, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x69, 0x63, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, +0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x2e, 0x69, 0x2c, 0x20, +0x6e, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, -0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x6f, 0x2e, 0x69, 0x20, 0x21, -0x3d, 0x20, 0x61, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, +0x69, 0x6c, 0x65, 0x20, 0x28, 0x6f, 0x2e, 0x69, 0x20, 0x21, 0x3d, +0x20, 0x61, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, -0x20, 0x6f, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x3b, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, +0x6f, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x7d, 0x0a, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, -0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x65, 0x67, 0x2c, 0x20, 0x67, 0x6c, 0x6f, -0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, -0x6f, 0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x6c, 0x2c, 0x20, 0x6c, -0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, -0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, -0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, -0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x65, 0x6c, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, -0x29, 0x0a, 0x00}; +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, +0x0a, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, +0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x5f, 0x65, 0x67, 
0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, +0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, +0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x6c, 0x2c, 0x20, 0x6c, 0x6f, +0x63, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, +0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x2c, +0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, +0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x65, 0x6c, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, +0x0a, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x00}; diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index cdbe9278b2..a19c4260a6 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -49,6 +49,7 @@ static int cuda_property(gpucontext *, gpudata *, gpukernel *, int, void *); static int cuda_waits(gpudata *, int, CUstream); static int cuda_records(gpudata *, int, CUstream); static gpudata *cuda_alloc(gpucontext *c, size_t size, void *data, int flags); +static void cuda_free(gpudata *); static int detect_arch(const char *prefix, char *ret, error *e); static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size); @@ -698,7 +699,6 @@ static int extract(gpudata *curr, gpudata *prev, size_t size) { return GA_NO_ERROR; } -static void cuda_free(gpudata *); static int cuda_write(gpudata *dst, size_t dstoff, const void *src, size_t sz); From 47bb6726025c5badcd03c03549c5ee23603a6aaa Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Sat, 22 Jul 2017 14:23:01 -0700 Subject: [PATCH 471/597] NCCL 2.0 fix --- src/gpuarray_collectives_cuda_nccl.c | 32 ++++++++++++++-------------- src/loaders/libnccl.fn | 11 +++++----- src/loaders/libnccl.h | 22 ++++++++++--------- 3 files changed, 33 insertions(+), 32 deletions(-) diff 
--git a/src/gpuarray_collectives_cuda_nccl.c b/src/gpuarray_collectives_cuda_nccl.c index 99cd5f7e38..de80b715b6 100644 --- a/src/gpuarray_collectives_cuda_nccl.c +++ b/src/gpuarray_collectives_cuda_nccl.c @@ -153,7 +153,7 @@ static int get_rank(const gpucomm *comm, int *rank) { * \ref * ncclRedOp_t. * - * If invalid, return `nccl_NUM_OPS`. + * If invalid, return `ncclNumOps`. */ static inline ncclRedOp_t convert_reduce_op(int opcode) { switch (opcode) { @@ -162,14 +162,14 @@ static inline ncclRedOp_t convert_reduce_op(int opcode) { case GA_MAX: return ncclMax; case GA_MIN: return ncclMin; } - return nccl_NUM_OPS; + return ncclNumOps; } /** * \brief Helper function to try to convert \ref enum GPUARRAY_TYPES to \ref * ncclDataType_t. * - * If invalid, return `nccl_NUM_TYPES`. + * If invalid, return `ncclNumTypes`. */ static inline ncclDataType_t convert_data_type(int typecode) { switch (typecode) { @@ -181,7 +181,7 @@ static inline ncclDataType_t convert_data_type(int typecode) { case GA_ULONG: return ncclUint64; case GA_HALF: return ncclHalf; } - return nccl_NUM_TYPES; + return ncclNumTypes; } /** @@ -208,13 +208,13 @@ static inline int check_restrictions(gpudata *src, size_t offsrc, // typecode must correspond to a valid ncclDataType_t if (datatype != NULL) { *datatype = convert_data_type(typecode); - if (*datatype == nccl_NUM_TYPES) + if (*datatype == ncclNumTypes) return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid data type"); } // opcode must correspond to a valid ncclRedOp_t if (op != NULL) { *op = convert_reduce_op(opcode); - if (*op == nccl_NUM_OPS) + if (*op == ncclNumOps) return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid reduce op"); } // offsets must not be larger than gpudata's size itself @@ -237,8 +237,8 @@ static int reduce(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, int opcode, int root, gpucomm *comm) { // need dummy init so that compiler shuts up - ncclRedOp_t op = nccl_NUM_OPS; - 
ncclDataType_t datatype = nccl_NUM_TYPES; + ncclRedOp_t op = ncclNumOps; + ncclDataType_t datatype = ncclNumTypes; gpudata *dst = NULL; int rank = 0; cuda_context *ctx; @@ -287,8 +287,8 @@ static int all_reduce(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, int opcode, gpucomm *comm) { // need dummy init so that compiler shuts up - ncclRedOp_t op = nccl_NUM_OPS; - ncclDataType_t datatype = nccl_NUM_TYPES; + ncclRedOp_t op = ncclNumOps; + ncclDataType_t datatype = ncclNumTypes; cuda_context *ctx; ASSERT_BUF(src); @@ -325,8 +325,8 @@ static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, int opcode, gpucomm *comm) { // need dummy init so that compiler shuts up - ncclRedOp_t op = nccl_NUM_OPS; - ncclDataType_t datatype = nccl_NUM_TYPES; + ncclRedOp_t op = ncclNumOps; + ncclDataType_t datatype = ncclNumTypes; int ndev = 0; size_t resc_size; cuda_context *ctx; @@ -371,7 +371,7 @@ static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest, static int broadcast(gpudata *array, size_t offset, size_t count, int typecode, int root, gpucomm *comm) { // need dummy init so that compiler shuts up - ncclDataType_t datatype = nccl_NUM_TYPES; + ncclDataType_t datatype = ncclNumTypes; int rank = 0; cuda_context *ctx; @@ -411,7 +411,7 @@ static int all_gather(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, gpucomm *comm) { // need dummy init so that compiler shuts up - ncclDataType_t datatype = nccl_NUM_TYPES; + ncclDataType_t datatype = ncclNumTypes; int ndev = 0; size_t resc_size; cuda_context *ctx; @@ -439,8 +439,8 @@ static int all_gather(gpudata *src, size_t offsrc, gpudata *dest, // change stream of nccl ops to enable concurrency NCCL_EXIT_ON_ERROR( - ctx, ncclAllGather((void *)(src->ptr + offsrc), count, datatype, - (void *)(dest->ptr + offdest), comm->c, ctx->s)); + ctx, ncclAllGather((void *)(src->ptr + offsrc), + (void 
*)(dest->ptr + offdest), count, datatype, comm->c, ctx->s)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE)); diff --git a/src/loaders/libnccl.fn b/src/loaders/libnccl.fn index 64de5dd88e..f60fc38c65 100644 --- a/src/loaders/libnccl.fn +++ b/src/loaders/libnccl.fn @@ -4,9 +4,8 @@ DEF_PROC(void, ncclCommDestroy, (ncclComm_t comm)); DEF_PROC(ncclResult_t, ncclCommCount, (const ncclComm_t comm, int* count)); DEF_PROC(ncclResult_t, ncclCommUserRank, (const ncclComm_t comm, int* rank)); DEF_PROC(const char*, ncclGetErrorString, (ncclResult_t result)); -DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)); -DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, int count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream)); -DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, -cudaStream_t stream)); -DEF_PROC(ncclResult_t, ncclBcast, (void* buff, int count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream)); -DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, int count, ncclDataType_t datatype, void* recvbuff, ncclComm_t comm, cudaStream_t stream)); \ No newline at end of file +DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)); +DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream )); +DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t 
stream)); +DEF_PROC(ncclResult_t, ncclBcast, (void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream )); +DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream)); \ No newline at end of file diff --git a/src/loaders/libnccl.h b/src/loaders/libnccl.h index 0139878c8f..7d70138e03 100644 --- a/src/loaders/libnccl.h +++ b/src/loaders/libnccl.h @@ -13,21 +13,23 @@ typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; typedef enum { ncclSuccess = 0 } ncclResult_t; +/* Reduction operation selector */ typedef enum { ncclSum = 0, ncclProd = 1, ncclMax = 2, ncclMin = 3, - nccl_NUM_OPS = 4 } ncclRedOp_t; - + ncclNumOps = 4 } ncclRedOp_t; /* Data types */ -typedef enum { ncclChar = 0, - ncclInt = 1, - ncclHalf = 2, - ncclFloat = 3, - ncclDouble = 4, - ncclInt64 = 5, - ncclUint64 = 6, - nccl_NUM_TYPES = 7 } ncclDataType_t; +typedef enum { ncclInt8 = 0, ncclChar = 0, + ncclUint8 = 1, + ncclInt32 = 2, ncclInt = 2, + ncclUint32 = 3, + ncclInt64 = 4, + ncclUint64 = 5, + ncclFloat16 = 6, ncclHalf = 6, + ncclFloat32 = 7, ncclFloat = 7, + ncclFloat64 = 8, ncclDouble = 8, + ncclNumTypes = 9 } ncclDataType_t; /** @endcond */ From c1d3e80c6a557d4a59746fcecbf0b5bfbc49e6f1 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 21 Aug 2017 16:06:07 -0400 Subject: [PATCH 472/597] Block loading of nccl 1.0 --- src/loaders/libnccl.fn | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/loaders/libnccl.fn b/src/loaders/libnccl.fn index f60fc38c65..bb9f3ddefd 100644 --- a/src/loaders/libnccl.fn +++ b/src/loaders/libnccl.fn @@ -8,4 +8,6 @@ DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuff, size_t DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream )); DEF_PROC(ncclResult_t, 
ncclReduceScatter, (const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream)); DEF_PROC(ncclResult_t, ncclBcast, (void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream )); -DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream)); \ No newline at end of file +DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream)); +// We don't need this but we use it as a sentinel to prevent nccl 1.0 from loading. +DEF_PROC(ncclResult_t, ncclGroupStart, ()); From a2b31c2d070b601e69f7216baaa4e3424d63cfa9 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 21 Aug 2017 18:15:19 -0400 Subject: [PATCH 473/597] Add some documentation. --- src/gpuarray/buffer.h | 7 ++++++- src/head.py | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index 5a242d83b3..2be9265d5b 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -100,11 +100,16 @@ GPUARRAY_PUBLIC void gpucontext_props_del(gpucontext_props *p); * * \warning This function is not thread-safe. * + * The passed-in properties pointer will be managed by this function + * and needs not be freed. This means that you shouldn't touch the + * properties object after passing it to this function. + * * \param res a pointer to a location that will be allocated * \param name the backend name. * \param dev the device number. The precise meaning of the device * number is backend-dependent - * \param props a properties object for the context. Can be NULL for defaults. + * \param props a properties object for the context. Can be NULL for + * defaults. * * \returns GA_NO_ERROR or an error code if an error occurred. 
*/ diff --git a/src/head.py b/src/head.py index 04b1b5dad3..ef27d89549 100644 --- a/src/head.py +++ b/src/head.py @@ -1,4 +1,6 @@ # Used to generate the string tables to embed the cluda headers. +# Usage: python head.py +# This will output .c def wrt(f, n, b): f.write(b',') From d2308b37aaaebd09c7686b0fa6d00dcb835b6ebe Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 22 Aug 2017 13:09:38 -0400 Subject: [PATCH 474/597] Add more documentation --- src/gpuarray/buffer.h | 110 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 109 insertions(+), 1 deletion(-) diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index 2be9265d5b..caab56fa05 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -71,28 +71,136 @@ GPUARRAY_PUBLIC int gpu_get_device_count(const char* name, unsigned int platform, unsigned int* devcount); + +/** + * Opaque structure that holds properties for the context. + */ typedef struct _gpucontext_props gpucontext_props; +/** + * Allocate and initialized an instance of gpucontext_props. + * + * Initialization is done with default values. + * + * \param res pointer to storage space for the created object + * + * \returns GA_NO_ERROR or an error code if an error occurred. + */ GPUARRAY_PUBLIC int gpucontext_props_new(gpucontext_props **res); +/** + * Set the device number for a CUDA device. + * + * \param p properties object + * \param devno device number + * + * \returns GA_NO_ERROR or an error code if an error occurred. + */ GPUARRAY_PUBLIC int gpucontext_props_cuda_dev(gpucontext_props *p, int devno); + +/** + * Set the platform and device for OpenCL. + * + * \param p properties object + * \param platno platform number + * \param devno device number + * + * \returns GA_NO_ERROR or an error code if an error occurred. + */ GPUARRAY_PUBLIC int gpucontext_props_opencl_dev(gpucontext_props *p, int platno, int devno); +/** + * Set the scheduling mode for the device. 
+ * + * \param p properties object + * \param sched scheduling mode. One of \ref sched_modes "these". + * + * \returns GA_NO_ERROR or an error code if an error occurred. + */ +GPUARRAY_PUBLIC int gpucontext_props_sched(gpucontext_props *p, int sched); + +/** \defgroup sched_modes + * @{ + */ + +/** + * Automatic scheduling, decide what to do depending on the workload, + * number of cores in the computer and other relevant factors. (default) + */ #define GA_CTX_SCHED_AUTO 0 + +/** + * Single-work scheduling. Optimize for speed in a single process, + * with a single thread. This is the fastest mode, but it may keep + * the CPU busy more than necessary. + */ #define GA_CTX_SCHED_SINGLE 1 + +/** + * Multi-work scheduling. Try to not keep the CPU busy more than + * necessary and let other threads a chance at some CPU time. This + * may increase the latency when waiting for GPU operations. + */ #define GA_CTX_SCHED_MULTI 2 -GPUARRAY_PUBLIC int gpucontext_props_sched(gpucontext_props *p, int sched); +/** @}*/ + +/** + * Set single-stream mode. + * + * All operations on the device will be serialized on a single stream. + * This will also disable most of the interlocking normally done + * between multiple streams to keep everything in order. + * + * This mode can be faster if you don't have a lot of device-level + * parallelism in your workload. + * + * \param p properties object + * + * \returns GA_NO_ERROR or an error code if an error occurred. + */ GPUARRAY_PUBLIC int gpucontext_props_set_single_stream(gpucontext_props *p); +/** + * Set the path for the kernel cache. + * + * The cache can be shared with other running instances, even on + * shared drives. + * + * \param p properties object + * + * \returns GA_NO_ERROR or an error code if an error occurred. + */ GPUARRAY_PUBLIC int gpucontext_props_kernel_cache(gpucontext_props *p, const char *path); +/** + * Configure the allocation cache. 
+ * + * The maximum size is also a limit on the total amount of memory + * allocated on the device. + * + * \param p properties object + * \param initial initial size of the cache + * \param max maximum size of the cache + * + * \returns GA_NO_ERROR or an error code if an error occurred. + */ GPUARRAY_PUBLIC int gpucontext_props_alloc_cache(gpucontext_props *p, size_t initial, size_t max); +/** + * Free a properties object. + * + * This should not be called on a properties object that has been + * passed to gpucontext_init(). + * + * \param p properties object + * + * This function does not return a value. + */ GPUARRAY_PUBLIC void gpucontext_props_del(gpucontext_props *p); /** From e5705134ac07bd2e506cbda18bc19536ed3c2b66 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 22 Aug 2017 17:41:31 -0400 Subject: [PATCH 475/597] Fix collectives test. --- tests/check_collectives.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/check_collectives.c b/tests/check_collectives.c index 4868f48fc9..c739bd8a28 100644 --- a/tests/check_collectives.c +++ b/tests/check_collectives.c @@ -75,7 +75,9 @@ extern void teardown_comm(void); for (j = 0; j < indims[1]; ++j) \ A[i][j] = comm_rank + 2; \ \ - err = GpuArray_copy_from_host(&Adev, ctx, A, GA_INT, ND, indims, instrds); \ + err = GpuArray_empty(&Adev, ctx, GA_INT, ND, indims, GA_C_ORDER); \ + ck_assert_int_eq(err, GA_NO_ERROR); \ + err = GpuArray_write(&Adev, A, sizeof(*A) * inrows); \ ck_assert_int_eq(err, GA_NO_ERROR); \ err = GpuArray_empty(&RESdev, ctx, GA_INT, ND, outdims, GA_C_ORDER); \ ck_assert_int_eq(err, GA_NO_ERROR); From 981ff8ef4273b92d740617b2dac3767f06c5cfdd Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 22 Aug 2017 17:54:56 -0400 Subject: [PATCH 476/597] Better error message for old NCCL.
--- src/loaders/libnccl.c | 3 +++ src/loaders/libnccl.fn | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/loaders/libnccl.c b/src/loaders/libnccl.c index 4ef247d117..08d5643330 100644 --- a/src/loaders/libnccl.c +++ b/src/loaders/libnccl.c @@ -40,6 +40,9 @@ int load_libnccl(error *e) { #include "libnccl.fn" + if (ga_func_ptr(lib, "ncclGroupStart", e) == NULL) + return error_set(e, GA_LOAD_ERROR, "Found NCCL 1.0 but NCCL 2.0 required"); + loaded = 1; return GA_NO_ERROR; } diff --git a/src/loaders/libnccl.fn b/src/loaders/libnccl.fn index bb9f3ddefd..caf365b849 100644 --- a/src/loaders/libnccl.fn +++ b/src/loaders/libnccl.fn @@ -9,5 +9,3 @@ DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, siz DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream)); DEF_PROC(ncclResult_t, ncclBcast, (void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream )); DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream)); -// We don't need this but we use it as a sentinel to prevent nccl 1.0 from loading. 
-DEF_PROC(ncclResult_t, ncclGroupStart, ()); From 33f14c18d9ee91a794d5b1cd7c7761a50c900a6d Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Fri, 21 Jul 2017 00:16:33 -0700 Subject: [PATCH 477/597] Fixed fp16 breakage caused by CUDA9 changes --- src/cluda_cuda.h | 12 +- src/cluda_cuda.h.c | 607 +++++++++++++++++++------------------ src/gpuarray/ext_cuda.h | 1 + src/gpuarray_buffer_cuda.c | 13 +- 4 files changed, 323 insertions(+), 310 deletions(-) diff --git a/src/cluda_cuda.h b/src/cluda_cuda.h index 7e8cc201b9..ed20a8eb1c 100644 --- a/src/cluda_cuda.h +++ b/src/cluda_cuda.h @@ -65,10 +65,14 @@ struct ga_half { ga_ushort data; }; -#define ga_half2float(p) __half2float((p).data) -__device__ static inline ga_half ga_float2half(float f) { +static __device__ inline float ga_half2float(ga_half h) { + float r; + asm("{ cvt.f32.f16 %0, %1; }\n" : "=f"(r) : "h"(h.data)); + return r; +} +static __device__ inline ga_half ga_float2half(float f) { ga_half r; - r.data = __float2half_rn(f); + asm("{ cvt.rn.f16.f32 %0, %1; }\n" : "=h"(r.data) : "f"(f)); return r; } @@ -142,7 +146,7 @@ __device__ ga_half atom_add_eg(ga_half *addr, ga_half val) { do { assumed = old; tmp.data = __byte_perm(old, 0, ((ga_size)addr & 2) ? 0x4432 : 0x4410); - sum = __float2half_rn(__half2float(val.data) + __half2float(tmp.data)); + sum = ga_float2half(ga_half2float(val) + ga_half2float(tmp)).data; new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 
0x5410 : 0x3254); old = atomicCAS(base, assumed, new_); } while (assumed != old); diff --git a/src/cluda_cuda.h.c b/src/cluda_cuda.h.c index 797cef438c..ba3f88cadc 100644 --- a/src/cluda_cuda.h.c +++ b/src/cluda_cuda.h.c @@ -178,318 +178,329 @@ static const char cluda_cuda_h[] = { 0x72, 0x75, 0x63, 0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, 0x72, 0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, -0x7d, 0x3b, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, -0x6f, 0x61, 0x74, 0x28, 0x70, 0x29, 0x20, 0x5f, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x28, 0x70, -0x29, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x0a, 0x5f, 0x5f, 0x64, -0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x73, 0x74, 0x61, -0x74, 0x69, 0x63, 0x20, 0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x20, -0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x67, 0x61, 0x5f, -0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x28, -0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x29, 0x20, 0x7b, 0x0a, -0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x72, -0x3b, 0x0a, 0x20, 0x20, 0x72, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, -0x3d, 0x20, 0x5f, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, -0x61, 0x6c, 0x66, 0x5f, 0x72, 0x6e, 0x28, 0x66, 0x29, 0x3b, 0x0a, -0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, 0x3b, -0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x69, -0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, -0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 
0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, 0x67, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, 0x6c, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, -0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, -0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x6c, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x67, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, -0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x7d, 0x3b, 0x0a, 0x0a, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20, +0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, +0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x66, 0x6c, 0x6f, 0x61, +0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, +0x6c, 0x6f, 0x61, 0x74, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x20, 0x68, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x66, 0x6c, +0x6f, 0x61, 0x74, 0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x61, 0x73, +0x6d, 0x28, 0x22, 0x7b, 0x20, 
0x63, 0x76, 0x74, 0x2e, 0x66, 0x33, +0x32, 0x2e, 0x66, 0x31, 0x36, 0x20, 0x25, 0x30, 0x2c, 0x20, 0x25, +0x31, 0x3b, 0x20, 0x7d, 0x5c, 0x6e, 0x22, 0x20, 0x3a, 0x20, 0x22, +0x3d, 0x66, 0x22, 0x28, 0x72, 0x29, 0x20, 0x3a, 0x20, 0x22, 0x68, +0x22, 0x28, 0x68, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, +0x3b, 0x0a, 0x7d, 0x0a, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20, +0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, +0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, +0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x66, 0x6c, 0x6f, 0x61, +0x74, 0x20, 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20, +0x61, 0x73, 0x6d, 0x28, 0x22, 0x7b, 0x20, 0x63, 0x76, 0x74, 0x2e, +0x72, 0x6e, 0x2e, 0x66, 0x31, 0x36, 0x2e, 0x66, 0x33, 0x32, 0x20, +0x25, 0x30, 0x2c, 0x20, 0x25, 0x31, 0x3b, 0x20, 0x7d, 0x5c, 0x6e, +0x22, 0x20, 0x3a, 0x20, 0x22, 0x3d, 0x68, 0x22, 0x28, 0x72, 0x2e, +0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x3a, 0x20, 0x22, 0x66, 0x22, +0x28, 0x66, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, +0x75, 0x72, 0x6e, 0x20, 0x72, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, +0x2a, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x6c, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, -0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, -0x20, 0x2a, 0x2f, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, -0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, -0x67, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, -0x61, 0x64, 0x64, 
0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, -0x6e, 0x67, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, -0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, -0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x77, -0x61, 0x64, 0x64, 0x72, 0x20, 0x3d, 0x20, 0x28, 0x75, 0x6e, 0x73, -0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, -0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, -0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, -0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x77, 0x61, 0x64, -0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, -0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, -0x6e, 0x67, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x3b, -0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, -0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, -0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, -0x41, 0x53, 0x28, 0x77, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, -0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x28, 0x76, 0x61, -0x6c, 0x20, 0x2b, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, -0x67, 0x29, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x29, -0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, -0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, -0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, -0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x28, 0x67, 0x61, -0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, -0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 
0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, -0x5f, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, -0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x61, -0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, -0x67, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, -0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, -0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x72, 0x65, 0x73, -0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x28, -0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, -0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, -0x64, 0x64, 0x72, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x0a, -0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x28, 0x67, -0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x72, 0x65, 0x73, 0x3b, -0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, -0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, -0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, -0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 
0x67, -0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, -0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, -0x64, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, -0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x2f, 0x0a, 0x23, -0x69, 0x66, 0x20, 0x5f, 0x5f, 0x43, 0x55, 0x44, 0x41, 0x5f, 0x41, -0x52, 0x43, 0x48, 0x5f, 0x5f, 0x20, 0x3c, 0x20, 0x36, 0x30, 0x30, -0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, -0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 
0x5f, 0x64, 0x67, -0x28, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, -0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, -0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, -0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, -0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, -0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, 0x20, 0x3d, 0x20, 0x28, -0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, -0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, -0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, -0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, -0x6f, 0x6e, 0x67, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, -0x77, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, -0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, -0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, -0x65, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, -0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x77, 0x61, 0x64, 0x64, 0x72, -0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, -0x5f, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x5f, 0x61, 0x73, -0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x28, 0x76, -0x61, 0x6c, 0x20, 0x2b, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, -0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, -0x62, 0x6c, 0x65, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, -0x29, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, -0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, -0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, -0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x5f, 0x5f, -0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 
0x6e, 0x67, 0x5f, 0x61, 0x73, -0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x6f, 0x6c, 0x64, -0x29, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, -0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, 0x6c, 0x73, 0x65, 0x0a, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, +0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, +0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, +0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, +0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x61, 0x64, 0x64, 0x5f, 0x49, 
0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, -0x69, 0x66, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, -0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x64, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, -0x6c, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, -0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x76, 0x61, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, +0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, +0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, +0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x67, 0x61, 0x5f, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, +0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, -0x6f, 0x6e, 0x67, 0x20, 0x72, 0x65, 0x73, 0x3b, 0x0a, 0x20, 0x20, -0x72, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, -0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x28, 0x75, 0x6e, 0x73, 0x69, -0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, -0x6f, 0x6e, 0x67, 
0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, -0x20, 0x5f, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x5f, 0x61, -0x73, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x28, -0x76, 0x61, 0x6c, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, -0x74, 0x75, 0x72, 0x6e, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, -0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, -0x62, 0x6c, 0x65, 0x28, 0x72, 0x65, 0x73, 0x29, 0x3b, 0x0a, 0x7d, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, -0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x65, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, -0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, -0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, -0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, -0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, -0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, -0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, -0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, -0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, -0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x73, -0x75, 0x6d, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, 0x20, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74, 0x6d, -0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, -0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, +0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, 0x20, +0x3d, 
0x20, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, +0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, +0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, +0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6f, 0x6c, 0x64, 0x20, +0x3d, 0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, +0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, +0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x73, +0x73, 0x75, 0x6d, 0x65, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, -0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, -0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, -0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, -0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, -0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, -0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x73, 0x75, 0x6d, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x66, -0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, -0x6e, 0x28, 0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, -0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, 0x2e, 0x64, 0x61, 0x74, -0x61, 0x29, 0x20, 0x2b, 0x20, 0x5f, 0x5f, 0x68, 0x61, 0x6c, 0x66, -0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x74, 0x6d, 0x70, 0x2e, -0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, -0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, -0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x28, 0x28, 0x67, -0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, -0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 
0x35, -0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, 0x35, -0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, -0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, -0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x73, -0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x29, -0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, -0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, -0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x74, +0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x77, 0x61, +0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, +0x64, 0x2c, 0x20, 0x28, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x28, +0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x28, 0x61, 0x73, +0x73, 0x75, 0x6d, 0x65, 0x64, 0x29, 0x29, 0x29, 0x3b, 0x0a, 0x20, +0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, +0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, +0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, +0x72, 0x6e, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, +0x29, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x5f, 0x5f, 0x64, +0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x6c, +0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, +0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61, 0x6c, +0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, +0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 
0x20, 0x6c, 0x6f, +0x6e, 0x67, 0x20, 0x72, 0x65, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x72, +0x65, 0x73, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x45, 0x78, 0x63, 0x68, 0x28, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, +0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, +0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, +0x76, 0x61, 0x6c, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, +0x75, 0x72, 0x6e, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, +0x67, 0x29, 0x72, 0x65, 0x73, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 
0x66, 0x6c, 0x6f, 0x61, 0x74, +0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, +0x65, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x20, 0x5f, 0x5f, +0x43, 0x55, 0x44, 0x41, 0x5f, 0x41, 0x52, 0x43, 0x48, 0x5f, 0x5f, +0x20, 0x3c, 0x20, 0x36, 0x30, 0x30, 0x0a, 0x5f, 0x5f, 0x64, 0x65, +0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x64, +0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x64, +0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, +0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, +0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, +0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x77, 0x61, 0x64, +0x64, 0x72, 0x20, 0x3d, 0x20, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, +0x6e, 0x65, 0x64, 0x20, 
0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, +0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, +0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6f, +0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, +0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, +0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x3b, 0x0a, 0x20, +0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, +0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, +0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, +0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, +0x28, 0x77, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x73, 0x73, +0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x5f, 0x5f, 0x64, 0x6f, 0x75, +0x62, 0x6c, 0x65, 0x5f, 0x61, 0x73, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, +0x6c, 0x6f, 0x6e, 0x67, 0x28, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, +0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, +0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x61, +0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x29, 0x29, 0x29, 0x3b, 0x0a, +0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, +0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, +0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, +0x75, 0x72, 0x6e, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, +0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, +0x6c, 0x65, 0x28, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x7d, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x23, 0x65, 0x6c, 0x73, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 
0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, +0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x5f, 0x5f, +0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, +0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x28, 0x67, +0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x61, +0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, +0x62, 0x6c, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, +0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x72, +0x65, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x73, 0x20, 0x3d, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, +0x28, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, +0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x5f, 0x5f, 0x64, 0x6f, +0x75, 0x62, 0x6c, 0x65, 0x5f, 0x61, 0x73, 0x5f, 0x6c, 0x6f, 0x6e, +0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x28, 0x76, 0x61, 0x6c, 0x29, 0x29, +0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, +0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, +0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x72, +0x65, 0x73, 0x29, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 
+0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, +0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x67, 0x28, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, +0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, +0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, +0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, +0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, +0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, +0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, +0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, +0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, +0x6d, 0x65, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x6e, +0x65, 0x77, 0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, +0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, +0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, +0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, -0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, -0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 
0x5f, -0x61, 0x64, 0x64, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, -0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x0a, 0x5f, -0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x28, 0x67, 0x61, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, -0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, -0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, -0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, -0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, -0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, -0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, -0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, -0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, -0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, 0x20, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74, 0x6d, -0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, -0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, -0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, -0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, +0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x6d, +0x20, 0x3d, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, +0x32, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, +0x6c, 0x29, 0x20, 0x2b, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x74, 0x6d, 0x70, +0x29, 0x29, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 
0x20, 0x5f, 0x5f, +0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, +0x6c, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x28, 0x28, +0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, +0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, +0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, +0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, +0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, +0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, +0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, +0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, +0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, +0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, +0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, -0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x2e, 0x64, -0x61, 0x74, 0x61, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, -0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, -0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, -0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, -0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, -0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, -0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, -0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, -0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, -0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, -0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, -0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, -0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 
0x64, 0x72, 0x20, 0x26, 0x20, -0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, -0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, -0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, -0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, 0x6e, -0x64, 0x69, 0x66, 0x0a, 0x00}; +0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, +0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, +0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, +0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, +0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, +0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x5f, 0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x0a, +0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x28, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, +0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, +0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, +0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, +0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, +0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, +0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, +0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 
0x6e, +0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, +0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, +0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74, +0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, +0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, +0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, +0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, +0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, +0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x2e, +0x64, 0x61, 0x74, 0x61, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, +0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, +0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, +0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29, +0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, +0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, +0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, +0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, +0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, +0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70, +0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, +0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, +0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, +0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, +0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, +0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, +0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, +0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 
0x5f, 0x78, 0x63, +0x68, 0x67, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, +0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, +0x6e, 0x64, 0x69, 0x66, 0x0a, 0x00}; diff --git a/src/gpuarray/ext_cuda.h b/src/gpuarray/ext_cuda.h index 4b6377fa2b..4231c4f455 100644 --- a/src/gpuarray/ext_cuda.h +++ b/src/gpuarray/ext_cuda.h @@ -2,6 +2,7 @@ #define LIBGPU_EXT_CUDA #include +#include #include #include diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index a19c4260a6..9a99d749c9 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -2,6 +2,7 @@ #include "private.h" #include "private_cuda.h" + #include "loaders/libnvrtc.h" #include "loaders/libcublas.h" @@ -1087,9 +1088,11 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { size_t buflen; const char *heads[1] = {"cluda.h"}; const char *hsrc[1]; - const char *opts[4] = { + const char *opts[] = { "-arch", "" +#ifdef DEBUG , "-G", "-lineinfo" +#endif }; nvrtcResult err; @@ -1100,13 +1103,7 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { if (err != NVRTC_SUCCESS) return error_nvrtc(ctx->err, "nvrtcCreateProgram", err); - err = nvrtcCompileProgram(prog, -#ifdef DEBUG - 4, -#else - 2, -#endif - opts); + err = nvrtcCompileProgram(prog, sizeof(opts)/sizeof(char *), opts); /* Get the log before handling the error */ if (nvrtcGetProgramLogSize(prog, &buflen) == NVRTC_SUCCESS) { From b73a39fffa77337d45fe1afe7846c9c420024f02 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 21 Aug 2017 15:34:07 -0400 Subject: [PATCH 478/597] Remove unused error messages in gpuarray_elemwise.c --- src/gpuarray_elemwise.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index 2c51ffa723..3ba31d30fe 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -547,7 
+547,6 @@ static int gen_elemwise_contig_kernel(GpuKernel *k, static int check_contig(GpuElemwise *ge, void **args, size_t *_n, int *contig) { GpuArray *a = NULL, *v; - gpucontext *ctx = GpuKernel_context(&ge->k_contig); size_t n = 1; unsigned int i, j; int c_contig = 1, f_contig = 1; @@ -563,10 +562,10 @@ static int check_contig(GpuElemwise *ge, void **args, f_contig &= GpuArray_IS_F_CONTIGUOUS(v); if (a != v) { if (a->nd != v->nd) - return error_fmt(ctx->err, GA_INVALID_ERROR, "Mismatched nd for input %u (expected %u, got %u)", i, a->nd, v->nd); + return -1; /* We don't check the value of the error code */ for (j = 0; j < a->nd; j++) { if (v->dimensions[j] != a->dimensions[j]) - return error_fmt(ctx->err, GA_VALUE_ERROR, "Mismatched dimension %u (expected %" SPREFIX "u, got %" SPREFIX "u)", j, a->dimensions[j], v->dimensions[j]); + return -1; /* We don't check the value of the error code */ } } } From 44104d28b498a7b40c533dcb165745f8997a6fbb Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 23 Aug 2017 14:53:20 -0400 Subject: [PATCH 479/597] Initial recipes for pygpu/libgpuarray. 
--- conda/libgpuarray/build.sh | 2 ++ conda/libgpuarray/meta.yaml | 39 ++++++++++++++++++++++++++++++++ conda/pygpu/meta.yaml | 44 +++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+) create mode 100644 conda/libgpuarray/build.sh create mode 100644 conda/libgpuarray/meta.yaml create mode 100644 conda/pygpu/meta.yaml diff --git a/conda/libgpuarray/build.sh b/conda/libgpuarray/build.sh new file mode 100644 index 0000000000..f55dc95448 --- /dev/null +++ b/conda/libgpuarray/build.sh @@ -0,0 +1,2 @@ +cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$PREFIX +make install diff --git a/conda/libgpuarray/meta.yaml b/conda/libgpuarray/meta.yaml new file mode 100644 index 0000000000..73145afa6b --- /dev/null +++ b/conda/libgpuarray/meta.yaml @@ -0,0 +1,39 @@ +{% set version = "0.7.0" %} + +package: + name: libgpuarray + version: {{ version }} + +source: + fn: libgpuarray-{{ version }}.tar.gz + url: https://github.com/Theano/libgpuarray/archive/v{{ version }}.tar.gz + +build: + number: 0 + skip: true # [win and py35] + features: + - vc9 # [win and py27] + - vc10 # [win and py34] + - vc14 # [win and py35] + - vc14 # [win and py36] + +requirements: + build: + - cmake + - python # [win] + +test: + requires: + - python {{ environ['PY_VER'] + '*' }} # [win] + commands: + - test -f ${PREFIX}/lib/libgpuarray.dylib # [osx] + - test -f ${PREFIX}/lib/libgpuarray.so # [linux] + - if not exist %PREFIX%\\Library\\lib\\gpuarray.lib exit 1 # [win] + +about: + home: http://github.com/Theano/libgpuarray + license: ISC + license_file: LICENSE + summary: 'Library to manipulate arrays on GPU' + doc_url: http://deeplearning.net/software/libgpuarray/ + dev_url: http://github.com/Theano/libgpuarray diff --git a/conda/pygpu/meta.yaml b/conda/pygpu/meta.yaml new file mode 100644 index 0000000000..bc34a9d6dd --- /dev/null +++ b/conda/pygpu/meta.yaml @@ -0,0 +1,44 @@ +{% set version = "0.7.0" %} + +package: + name: pygpu + version: {{ version }} + +source: + fn: libgpuarray-{{ 
version }}.tar.gz + url: https://github.com/Theano/libgpuarray/archive/v{{ version }}.tar.gz + +build: + number: 0 + script: + - export CFLAGS="${CFLAGS} -I${PREFIX}/include -L${PREFIX}/lib" # [unix] + - python setup.py install --single-version-externally-managed --record record.txt + +requirements: + build: + - python + - cython >=0.25 + - numpy x.x + - mako + - setuptools + - libgpuarray =={{ version }} + + run: + - python + - numpy x.x + - mako + - six + - libgpuarray =={{ version }} + +test: + imports: + - pygpu + - pygpu.gpuarray + +about: + home: http://github.com/Theano/libgpuarray + license: ISC + license_file: LICENSE + summary: 'Library to manipulate arrays on GPU' + doc_url: http://deeplearning.net/software/libgpuarray/ + dev_url: http://github.com/Theano/libgpuarray From d7b41e21379f3b7ebb6d8ee3fab3d00ea2ae0d27 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 23 Aug 2017 17:56:39 -0400 Subject: [PATCH 480/597] Fix collectives tests. --- pygpu/collectives.pyx | 2 +- pygpu/tests/collectives/test_collectives.py | 6 +++--- tests/check_collectives.c | 1 - 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pygpu/collectives.pyx b/pygpu/collectives.pyx index 39603e85d2..bb8037c5d4 100644 --- a/pygpu/collectives.pyx +++ b/pygpu/collectives.pyx @@ -48,7 +48,7 @@ cdef class GpuCommCliqueId: buffer.readonly = 0 buffer.itemsize = sizeof(char) if flags & PyBUF_FORMAT == PyBUF_FORMAT: - buffer.format = 'b' + buffer.format = 'B' else: buffer.format = NULL buffer.ndim = 1 diff --git a/pygpu/tests/collectives/test_collectives.py b/pygpu/tests/collectives/test_collectives.py index 9754873aab..2eebf00dff 100644 --- a/pygpu/tests/collectives/test_collectives.py +++ b/pygpu/tests/collectives/test_collectives.py @@ -293,19 +293,19 @@ def test_all_gather(self): a = cpu.reshape((5, 2), order='F') exp = texp.reshape((5, 2 * self.size), order='F') - gpu = gpuarray.asarray(a, context=self.ctx) + gpu = gpuarray.asarray(a, context=self.ctx, order='F') resgpu = 
self.gpucomm.all_gather(gpu, nd_up=0) check_all(resgpu, exp) a = cpu.reshape((5, 2), order='F') exp = texp.reshape((5, 2, self.size), order='F') - gpu = gpuarray.asarray(a, context=self.ctx) + gpu = gpuarray.asarray(a, context=self.ctx, order='F') resgpu = self.gpucomm.all_gather(gpu, nd_up=1) check_all(resgpu, exp) a = cpu.reshape((5, 2), order='F') exp = texp.reshape((5, 2, 1, 1, self.size), order='F') - gpu = gpuarray.asarray(a, context=self.ctx) + gpu = gpuarray.asarray(a, context=self.ctx, order='F') resgpu = self.gpucomm.all_gather(gpu, nd_up=3) check_all(resgpu, exp) diff --git a/tests/check_collectives.c b/tests/check_collectives.c index c739bd8a28..4db0614915 100644 --- a/tests/check_collectives.c +++ b/tests/check_collectives.c @@ -49,7 +49,6 @@ extern void teardown_comm(void); int(*EXP)[(outcols)]; \ size_t indims[ND]; \ size_t outdims[ND]; \ - const ssize_t instrds[ND] = {sizeof(*A), sizeof(int)}; \ const ssize_t outstrds[ND] = {sizeof(*RES), sizeof(int)}; \ int err; \ size_t i, j, outsize; \ From 61b7cac28ae9330e481bced06cb029afeed3009f Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 24 Aug 2017 09:46:56 -0400 Subject: [PATCH 481/597] Move tests to the regular directory. This make is installed as other tests. It wasn't the case. --- pygpu/tests/{collectives => }/test_collectives.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pygpu/tests/{collectives => }/test_collectives.py (100%) diff --git a/pygpu/tests/collectives/test_collectives.py b/pygpu/tests/test_collectives.py similarity index 100% rename from pygpu/tests/collectives/test_collectives.py rename to pygpu/tests/test_collectives.py From 8ac2448c6ca339a72a894fdc72f2de8b60ee01d5 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 24 Aug 2017 17:05:58 -0400 Subject: [PATCH 482/597] Remove the buffer functionality for GpuCommCliqueId since it crashes hard in some case and isn't necessary. 
--- pygpu/collectives.pyx | 35 --------------------------------- pygpu/tests/test_collectives.py | 7 ------- 2 files changed, 42 deletions(-) diff --git a/pygpu/collectives.pyx b/pygpu/collectives.pyx index bb8037c5d4..c2ac3984c2 100644 --- a/pygpu/collectives.pyx +++ b/pygpu/collectives.pyx @@ -38,41 +38,6 @@ cdef class GpuCommCliqueId: if comm_id is not None: self.comm_id = comm_id - def __getbuffer__(self, Py_buffer* buffer, int flags): - if buffer == NULL: - raise BufferError, "NULL buffer view in getbuffer" - - buffer.buf = self.c_comm_id.internal - buffer.obj = self - buffer.len = GA_COMM_ID_BYTES * sizeof(char) - buffer.readonly = 0 - buffer.itemsize = sizeof(char) - if flags & PyBUF_FORMAT == PyBUF_FORMAT: - buffer.format = 'B' - else: - buffer.format = NULL - buffer.ndim = 1 - if flags & PyBUF_ND == PyBUF_ND: - buffer.shape = calloc(1, sizeof(Py_ssize_t)) - buffer.shape[0] = GA_COMM_ID_BYTES - else: - buffer.shape = NULL - if flags & PyBUF_STRIDES == PyBUF_STRIDES: - buffer.strides = &buffer.itemsize - else: - buffer.strides = NULL - buffer.suboffsets = NULL - buffer.internal = NULL - Py_INCREF(self) - - def __releasebuffer__(self, Py_buffer* buffer): - if buffer == NULL: - raise BufferError, "NULL buffer view in releasebuffer" - - if buffer.shape != NULL: - free(buffer.shape) - Py_DECREF(self) - def __richcmp__(this, that, int op): if type(this) != type(that): raise TypeError, "Cannot compare %s with %s" % (type(this), type(that)) diff --git a/pygpu/tests/test_collectives.py b/pygpu/tests/test_collectives.py index 2eebf00dff..a5f742f742 100644 --- a/pygpu/tests/test_collectives.py +++ b/pygpu/tests/test_collectives.py @@ -88,13 +88,6 @@ def test_richcmp(self): with self.assertRaises(TypeError): a = cid2 > "asdfasfa" - def test_as_buffer(self): - a = np.asarray(self.cid) - assert np.allclose(a, self.cid.comm_id) - a[:] = [ord(b'a')] * COMM_ID_BYTES - assert np.allclose(a, self.cid.comm_id) - - @unittest.skipUnless(MPI_IMPORTED, "Needs mpi4py module") 
@unittest.skipIf(get_user_gpu_rank() == -1, "Collective operations supported on CUDA devices only") class TestGpuComm(unittest.TestCase): From f2c36e11df980e54efa7179da1ce000718a186f9 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 25 Aug 2017 13:40:42 -0400 Subject: [PATCH 483/597] Remove tests with BYTE since that is not supported on many MPI implementations. --- tests/check_buffer_collectives.c | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/tests/check_buffer_collectives.c b/tests/check_buffer_collectives.c index 10c1f41a7c..135b8a76c0 100644 --- a/tests/check_buffer_collectives.c +++ b/tests/check_buffer_collectives.c @@ -171,10 +171,6 @@ TEST_REDUCE(int, INT, INT, SUM, 0, PRINTVI) TEST_REDUCE(int, INT, INT, PROD, 0, PRINTVI) TEST_REDUCE(int, INT, INT, MAX, 0, PRINTVI) TEST_REDUCE(int, INT, INT, MIN, 0, PRINTVI) -TEST_REDUCE(char, BYTE, BYTE, SUM, 0, PRINTVI) -TEST_REDUCE(char, BYTE, BYTE, PROD, 0, PRINTVI) -TEST_REDUCE(char, BYTE, BYTE, MAX, 0, PRINTVI) -TEST_REDUCE(char, BYTE, BYTE, MIN, 0, PRINTVI) TEST_REDUCE(float, FLOAT, FLOAT, SUM, EPS, PRINTVF) TEST_REDUCE(float, FLOAT, FLOAT, PROD, EPS, PRINTVF) TEST_REDUCE(float, FLOAT, FLOAT, MAX, EPS, PRINTVF) @@ -265,10 +261,6 @@ TEST_ALL_REDUCE(int, INT, INT, SUM, 0, PRINTVI) TEST_ALL_REDUCE(int, INT, INT, PROD, 0, PRINTVI) TEST_ALL_REDUCE(int, INT, INT, MAX, 0, PRINTVI) TEST_ALL_REDUCE(int, INT, INT, MIN, 0, PRINTVI) -TEST_ALL_REDUCE(char, BYTE, BYTE, SUM, 0, PRINTVI) -TEST_ALL_REDUCE(char, BYTE, BYTE, PROD, 0, PRINTVI) -TEST_ALL_REDUCE(char, BYTE, BYTE, MAX, 0, PRINTVI) -TEST_ALL_REDUCE(char, BYTE, BYTE, MIN, 0, PRINTVI) TEST_ALL_REDUCE(float, FLOAT, FLOAT, SUM, EPS, PRINTVF) TEST_ALL_REDUCE(float, FLOAT, FLOAT, PROD, EPS, PRINTVF) TEST_ALL_REDUCE(float, FLOAT, FLOAT, MAX, EPS, PRINTVF) @@ -373,10 +365,6 @@ TEST_REDUCE_SCATTER(int, INT, INT, SUM, 0, PRINTVI) TEST_REDUCE_SCATTER(int, INT, INT, PROD, 0, PRINTVI) TEST_REDUCE_SCATTER(int, INT, INT, MAX, 0, PRINTVI) 
TEST_REDUCE_SCATTER(int, INT, INT, MIN, 0, PRINTVI) -TEST_REDUCE_SCATTER(char, BYTE, BYTE, SUM, 0, PRINTVI) -TEST_REDUCE_SCATTER(char, BYTE, BYTE, PROD, 0, PRINTVI) -TEST_REDUCE_SCATTER(char, BYTE, BYTE, MAX, 0, PRINTVI) -TEST_REDUCE_SCATTER(char, BYTE, BYTE, MIN, 0, PRINTVI) TEST_REDUCE_SCATTER(float, FLOAT, FLOAT, SUM, EPS, PRINTVF) TEST_REDUCE_SCATTER(float, FLOAT, FLOAT, PROD, EPS, PRINTVF) TEST_REDUCE_SCATTER(float, FLOAT, FLOAT, MAX, EPS, PRINTVF) @@ -577,10 +565,6 @@ Suite* get_suite(void) { tcase_add_test(reds, test_gpucomm_reduce_INT_PROD); tcase_add_test(reds, test_gpucomm_reduce_INT_MAX); tcase_add_test(reds, test_gpucomm_reduce_INT_MIN); - tcase_add_test(reds, test_gpucomm_reduce_BYTE_SUM); - tcase_add_test(reds, test_gpucomm_reduce_BYTE_PROD); - tcase_add_test(reds, test_gpucomm_reduce_BYTE_MAX); - tcase_add_test(reds, test_gpucomm_reduce_BYTE_MIN); tcase_add_test(reds, test_gpucomm_reduce_FLOAT_SUM); tcase_add_test(reds, test_gpucomm_reduce_FLOAT_PROD); tcase_add_test(reds, test_gpucomm_reduce_FLOAT_MAX); @@ -611,10 +595,6 @@ Suite* get_suite(void) { tcase_add_test(areds, test_gpucomm_all_reduce_INT_PROD); tcase_add_test(areds, test_gpucomm_all_reduce_INT_MAX); tcase_add_test(areds, test_gpucomm_all_reduce_INT_MIN); - tcase_add_test(areds, test_gpucomm_all_reduce_BYTE_SUM); - tcase_add_test(areds, test_gpucomm_all_reduce_BYTE_PROD); - tcase_add_test(areds, test_gpucomm_all_reduce_BYTE_MAX); - tcase_add_test(areds, test_gpucomm_all_reduce_BYTE_MIN); tcase_add_test(areds, test_gpucomm_all_reduce_FLOAT_SUM); tcase_add_test(areds, test_gpucomm_all_reduce_FLOAT_PROD); tcase_add_test(areds, test_gpucomm_all_reduce_FLOAT_MAX); @@ -646,10 +626,6 @@ Suite* get_suite(void) { tcase_add_test(redscs, test_gpucomm_reduce_scatter_INT_PROD); tcase_add_test(redscs, test_gpucomm_reduce_scatter_INT_MAX); tcase_add_test(redscs, test_gpucomm_reduce_scatter_INT_MIN); - tcase_add_test(redscs, test_gpucomm_reduce_scatter_BYTE_SUM); - tcase_add_test(redscs, 
test_gpucomm_reduce_scatter_BYTE_PROD); - tcase_add_test(redscs, test_gpucomm_reduce_scatter_BYTE_MAX); - tcase_add_test(redscs, test_gpucomm_reduce_scatter_BYTE_MIN); tcase_add_test(redscs, test_gpucomm_reduce_scatter_FLOAT_SUM); tcase_add_test(redscs, test_gpucomm_reduce_scatter_FLOAT_PROD); tcase_add_test(redscs, test_gpucomm_reduce_scatter_FLOAT_MAX); From be5ccb5886b87b44f8c27907a547db644348c5dd Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 28 Aug 2017 13:56:14 -0400 Subject: [PATCH 484/597] Improve the package scripts. --- conda/libgpuarray/bld.bat | 8 ++++++++ conda/libgpuarray/build.sh | 11 +++++++++-- conda/libgpuarray/meta.yaml | 21 ++++++++++----------- conda/pygpu/bld.bat | 3 +++ conda/pygpu/build.sh | 4 ++++ conda/pygpu/meta.yaml | 12 ++++-------- 6 files changed, 38 insertions(+), 21 deletions(-) create mode 100644 conda/libgpuarray/bld.bat create mode 100644 conda/pygpu/bld.bat create mode 100644 conda/pygpu/build.sh diff --git a/conda/libgpuarray/bld.bat b/conda/libgpuarray/bld.bat new file mode 100644 index 0000000000..54be8a24bd --- /dev/null +++ b/conda/libgpuarray/bld.bat @@ -0,0 +1,8 @@ +cmake -G"NMake Makefiles" ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DCMAKE_PREFIX_PATH="%LIBRARY_PREFIX%" ^ + -DCMAKE_INSTALL_PREFIX="%LIBRARY_PREFIX%" ^ + -DCMAKE_C_FLAGS="-I%LIBRARY_PREFIX%\include" ^ + "%SRC_DIR%" +cmake --build . --config Release --target ALL_BUILD +cmake --build . --config Release --target install \ No newline at end of file diff --git a/conda/libgpuarray/build.sh b/conda/libgpuarray/build.sh index f55dc95448..7e2ea03787 100644 --- a/conda/libgpuarray/build.sh +++ b/conda/libgpuarray/build.sh @@ -1,2 +1,9 @@ -cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$PREFIX -make install +#!/bin/bash + +if [[ $(uname) == Darwin ]]; then + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$PREFIX -DCMAKE_OSX_DEPLOYMENT_TARGET= +else + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$PREFIX +fi +cmake --build . 
--config Release --target all +cmake --build . --config Release --target install diff --git a/conda/libgpuarray/meta.yaml b/conda/libgpuarray/meta.yaml index 73145afa6b..a6032b760c 100644 --- a/conda/libgpuarray/meta.yaml +++ b/conda/libgpuarray/meta.yaml @@ -14,21 +14,20 @@ build: features: - vc9 # [win and py27] - vc10 # [win and py34] - - vc14 # [win and py35] - - vc14 # [win and py36] + - vc14 # [win and (py35 or py36)] requirements: build: + - m2-git [win] + - m2-filesystem [win] + - git [not win] - cmake - - python # [win] - -test: - requires: - - python {{ environ['PY_VER'] + '*' }} # [win] - commands: - - test -f ${PREFIX}/lib/libgpuarray.dylib # [osx] - - test -f ${PREFIX}/lib/libgpuarray.so # [linux] - - if not exist %PREFIX%\\Library\\lib\\gpuarray.lib exit 1 # [win] + - mako + - python + run: + - vs2008_runtime [win and py27] + - vs2010_runtime [win and py34] + - vs2015_runtime [win and (py35 or py36)] about: home: http://github.com/Theano/libgpuarray diff --git a/conda/pygpu/bld.bat b/conda/pygpu/bld.bat new file mode 100644 index 0000000000..c20afe1dd4 --- /dev/null +++ b/conda/pygpu/bld.bat @@ -0,0 +1,3 @@ +set LIB=%LIBRARY_LIB%;%LIB% +set INCLUDE=%LIBRARY_INC%;%INCLUDE% +%PYTHON% setup.py install --single-version-externally-managed --record=record.txt \ No newline at end of file diff --git a/conda/pygpu/build.sh b/conda/pygpu/build.sh new file mode 100644 index 0000000000..9a446aa728 --- /dev/null +++ b/conda/pygpu/build.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +export CFLAGS=${CFLAGS}" -I${PREFIX}/include -L${PREFIX}/lib" +$PYTHON setup.py install --single-version-externally-managed --record=record.txt \ No newline at end of file diff --git a/conda/pygpu/meta.yaml b/conda/pygpu/meta.yaml index bc34a9d6dd..5ef71964ae 100644 --- a/conda/pygpu/meta.yaml +++ b/conda/pygpu/meta.yaml @@ -10,12 +10,13 @@ source: build: number: 0 - script: - - export CFLAGS="${CFLAGS} -I${PREFIX}/include -L${PREFIX}/lib" # [unix] - - python setup.py install 
--single-version-externally-managed --record record.txt + detect_binary_files_with_prefix: False requirements: build: + - m2-git [win] + - m2-filesystem [win] + - git [not win] - python - cython >=0.25 - numpy x.x @@ -30,11 +31,6 @@ requirements: - six - libgpuarray =={{ version }} -test: - imports: - - pygpu - - pygpu.gpuarray - about: home: http://github.com/Theano/libgpuarray license: ISC From ea1eeeaf41c175623092f20537a86b3bacafdb53 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 30 Aug 2017 12:16:18 -0400 Subject: [PATCH 485/597] Changes for release 0.7.0 --- doc/conf.py | 4 ++-- release.txt | 1 + setup.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index e8ee873f82..7768f2ed3b 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -57,9 +57,9 @@ # built documents. # # The short X.Y version. -version = '0.6' +version = '0.7' # The full version, including alpha/beta/rc tags. -release = '0.6.9' +release = '0.7.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/release.txt b/release.txt index a90f758d39..1d5922fe14 100644 --- a/release.txt +++ b/release.txt @@ -3,6 +3,7 @@ Release process: - Update the version in setup.py - Update the version in doc/conf.py - Update the version in .appveyor.yml +- Update the version in conda/{libgpuarray,pygpu}/meta.yaml - Commit the changes with message "Changes for release X.Y.Z" git commit -m "Changes for release X.Y.Z" - Make a git tag diff --git a/setup.py b/setup.py index dc8c1cbec4..29e9f60e77 100755 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ MAJOR = 0 MINOR = 7 PATCH = 0 -SUFFIX = '.dev0' # include the '.' +SUFFIX = '' # include the '.' 
FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX) try: From c1cd48f8c666134d2e5f309a5e5580054e571cfb Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 30 Aug 2017 12:51:53 -0400 Subject: [PATCH 486/597] Use conda_build_config to build for multiple python/numpy. --- conda/libgpuarray/conda_build_config.yaml | 4 ++++ conda/libgpuarray/meta.yaml | 2 +- conda/pygpu/conda_build_config.yaml | 8 ++++++++ conda/pygpu/meta.yaml | 8 ++++---- 4 files changed, 17 insertions(+), 5 deletions(-) create mode 100644 conda/libgpuarray/conda_build_config.yaml create mode 100644 conda/pygpu/conda_build_config.yaml diff --git a/conda/libgpuarray/conda_build_config.yaml b/conda/libgpuarray/conda_build_config.yaml new file mode 100644 index 0000000000..21ce93b651 --- /dev/null +++ b/conda/libgpuarray/conda_build_config.yaml @@ -0,0 +1,4 @@ +python: + - 2.7 + - 3.5 + - 3.6 diff --git a/conda/libgpuarray/meta.yaml b/conda/libgpuarray/meta.yaml index a6032b760c..cb3beb33fe 100644 --- a/conda/libgpuarray/meta.yaml +++ b/conda/libgpuarray/meta.yaml @@ -23,7 +23,7 @@ requirements: - git [not win] - cmake - mako - - python + - python # version doesn't matter here run: - vs2008_runtime [win and py27] - vs2010_runtime [win and py34] diff --git a/conda/pygpu/conda_build_config.yaml b/conda/pygpu/conda_build_config.yaml new file mode 100644 index 0000000000..cbc7b371b6 --- /dev/null +++ b/conda/pygpu/conda_build_config.yaml @@ -0,0 +1,8 @@ +python: + - 2.7 + - 3.5 + - 3.6 +numpy: + - 1.11 + - 1.12 + - 1.13 \ No newline at end of file diff --git a/conda/pygpu/meta.yaml b/conda/pygpu/meta.yaml index 5ef71964ae..203e241a96 100644 --- a/conda/pygpu/meta.yaml +++ b/conda/pygpu/meta.yaml @@ -17,16 +17,16 @@ requirements: - m2-git [win] - m2-filesystem [win] - git [not win] - - python + - python {{ python }} - cython >=0.25 - - numpy x.x + - numpy {{ numpy }} - mako - setuptools - libgpuarray =={{ version }} run: - - python - - numpy x.x + - python {{ python }} + - numpy {{ numpy }} - 
mako - six - libgpuarray =={{ version }} From 32d498d96ae5df4cfdd0abc2eef1f0b2d4c1037f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 30 Aug 2017 15:03:12 -0400 Subject: [PATCH 487/597] Fix the numpy dependency. --- conda/libgpuarray/meta.yaml | 1 + conda/pygpu/meta.yaml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/conda/libgpuarray/meta.yaml b/conda/libgpuarray/meta.yaml index cb3beb33fe..fd6d70a99f 100644 --- a/conda/libgpuarray/meta.yaml +++ b/conda/libgpuarray/meta.yaml @@ -11,6 +11,7 @@ source: build: number: 0 skip: true # [win and py35] + skip: true # [not win and not py27] features: - vc9 # [win and py27] - vc10 # [win and py34] diff --git a/conda/pygpu/meta.yaml b/conda/pygpu/meta.yaml index 203e241a96..bfb07adfca 100644 --- a/conda/pygpu/meta.yaml +++ b/conda/pygpu/meta.yaml @@ -26,7 +26,7 @@ requirements: run: - python {{ python }} - - numpy {{ numpy }} + - {{ pin_compatible('numpy') }} - mako - six - libgpuarray =={{ version }} From 9e1ebf44ff4a4bb4f26b06b1b2cc289c55ddf81d Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 31 Aug 2017 15:09:04 -0400 Subject: [PATCH 488/597] Remove include that break cuda8 and isn't need anymore in cuda9. --- src/gpuarray/ext_cuda.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/gpuarray/ext_cuda.h b/src/gpuarray/ext_cuda.h index 4231c4f455..4b6377fa2b 100644 --- a/src/gpuarray/ext_cuda.h +++ b/src/gpuarray/ext_cuda.h @@ -2,7 +2,6 @@ #define LIBGPU_EXT_CUDA #include -#include #include #include From 9f01cb4eb7825efcf2467a478bbadd4f0542b155 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 31 Aug 2017 16:14:00 -0400 Subject: [PATCH 489/597] Fix tests. 
--- pygpu/tests/test_collectives.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygpu/tests/test_collectives.py b/pygpu/tests/test_collectives.py index a5f742f742..4e688793d1 100644 --- a/pygpu/tests/test_collectives.py +++ b/pygpu/tests/test_collectives.py @@ -101,7 +101,7 @@ def setUpClass(cls): cls.ctx = gpuarray.init("cuda" + str(cls.rank)) print("*** Collectives testing for", cls.ctx.devname, file=sys.stderr) cls.cid = GpuCommCliqueId(context=cls.ctx) - cls.mpicomm.Bcast(cls.cid, root=0) + cls.mpicomm.Bcast(cls.cid.comm_id, root=0) cls.gpucomm = GpuComm(cls.cid, cls.size, cls.rank) def test_count(self): From d19813fd33f9a0fc11803f970dc5f6c51159aaaf Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 1 Sep 2017 11:27:03 -0400 Subject: [PATCH 490/597] Make appveyor build conda packages. --- .appveyor.yml | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 2018f0d80a..ee2feb3749 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -3,8 +3,6 @@ version: '0.7.0.{build}' pull_requests: do_not_increment_build_number: true -image: Visual Studio 2015 - init: - git config --global core.autocrlf input - cmd: cmake --version @@ -17,29 +15,21 @@ clone_folder: C:\projects\libgpuarray configuration: - Release - - Debug environment: matrix: - - PYTHON: "C:\\Python27" - VS_PATH: "C:\\Users\\appveyor\\AppData\\Local\\Programs\\Common\\Microsoft\\Visual C++ for Python\\9.0" - - PYTHON: "C:\\Python35" - VS_PATH: "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC" + - CONDA_LOC: "C:\\Miniconda-x64" install: - - python -m pip install mako cython + # This breaks conda-build because of git + - cmd: rmdir C:\cygwin /s /q + - cmd: call %CONDA_LOC%\Scripts\activate.bat + - cmd: set PYTHONUNBUFFERED=1 + - cmd: conda install -n root --yes conda conda-env conda-build anaconda-client + +build: script build_script: - - echo "Python:" "%PYTHON%" - - echo "Config:" 
"%CONFIGURATION%" - - echo "VS path:" "%VS_PATH%" - - cd "%VS_PATH%" - - vcvarsall x64 - - set - - cd C:\projects\libgpuarray - - md %CONFIGURATION% - - cd %CONFIGURATION% - - cmake .. -DCMAKE_BUILD_TYPE=%CONFIGURATION% -G "NMake Makefiles" - - cmake --build . --config %CONFIGURATION% + - conda build conda -build: script +#deploy_script: From 1d82d00cf15815211f7a2c97c704a63aa61bfcaa Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 1 Sep 2017 12:13:40 -0400 Subject: [PATCH 491/597] Add compiler requirement. --- conda/libgpuarray/meta.yaml | 1 + conda/pygpu/meta.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/conda/libgpuarray/meta.yaml b/conda/libgpuarray/meta.yaml index fd6d70a99f..9691d31120 100644 --- a/conda/libgpuarray/meta.yaml +++ b/conda/libgpuarray/meta.yaml @@ -22,6 +22,7 @@ requirements: - m2-git [win] - m2-filesystem [win] - git [not win] + - {{ compiler('c') }} - cmake - mako - python # version doesn't matter here diff --git a/conda/pygpu/meta.yaml b/conda/pygpu/meta.yaml index bfb07adfca..e47e1c571b 100644 --- a/conda/pygpu/meta.yaml +++ b/conda/pygpu/meta.yaml @@ -17,6 +17,7 @@ requirements: - m2-git [win] - m2-filesystem [win] - git [not win] + - {{ compiler('c') }} - python {{ python }} - cython >=0.25 - numpy {{ numpy }} From 2d608df327fcede32fed9527811bcbdef3138d98 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 1 Sep 2017 12:48:54 -0400 Subject: [PATCH 492/597] Attempt at fixing the compiler issue --- .appveyor.yml | 5 ----- conda/libgpuarray/bld.bat | 7 ++++++- conda/libgpuarray/meta.yaml | 4 +--- conda/pygpu/meta.yaml | 1 - 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index ee2feb3749..6e263169da 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -3,11 +3,6 @@ version: '0.7.0.{build}' pull_requests: do_not_increment_build_number: true -init: - - git config --global core.autocrlf input - - cmd: cmake --version - - cmd: msbuild /version - platform: - x64 diff --git 
a/conda/libgpuarray/bld.bat b/conda/libgpuarray/bld.bat index 54be8a24bd..4ce87d6f3e 100644 --- a/conda/libgpuarray/bld.bat +++ b/conda/libgpuarray/bld.bat @@ -1,8 +1,13 @@ +set + cmake -G"NMake Makefiles" ^ -DCMAKE_BUILD_TYPE=Release ^ -DCMAKE_PREFIX_PATH="%LIBRARY_PREFIX%" ^ -DCMAKE_INSTALL_PREFIX="%LIBRARY_PREFIX%" ^ -DCMAKE_C_FLAGS="-I%LIBRARY_PREFIX%\include" ^ "%SRC_DIR%" +if errorlevel 1 exit 1 cmake --build . --config Release --target ALL_BUILD -cmake --build . --config Release --target install \ No newline at end of file +if errorlevel 1 exit 1 +cmake --build . --config Release --target install +if errorlevel 1 exit 1 diff --git a/conda/libgpuarray/meta.yaml b/conda/libgpuarray/meta.yaml index 9691d31120..5763af8835 100644 --- a/conda/libgpuarray/meta.yaml +++ b/conda/libgpuarray/meta.yaml @@ -5,8 +5,7 @@ package: version: {{ version }} source: - fn: libgpuarray-{{ version }}.tar.gz - url: https://github.com/Theano/libgpuarray/archive/v{{ version }}.tar.gz + path: ../../ build: number: 0 @@ -22,7 +21,6 @@ requirements: - m2-git [win] - m2-filesystem [win] - git [not win] - - {{ compiler('c') }} - cmake - mako - python # version doesn't matter here diff --git a/conda/pygpu/meta.yaml b/conda/pygpu/meta.yaml index e47e1c571b..bfb07adfca 100644 --- a/conda/pygpu/meta.yaml +++ b/conda/pygpu/meta.yaml @@ -17,7 +17,6 @@ requirements: - m2-git [win] - m2-filesystem [win] - git [not win] - - {{ compiler('c') }} - python {{ python }} - cython >=0.25 - numpy {{ numpy }} From 388896d8312420196e9062cc288cba70ad015135 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 1 Sep 2017 12:55:22 -0400 Subject: [PATCH 493/597] Fix bld.bat --- conda/libgpuarray/bld.bat | 2 -- 1 file changed, 2 deletions(-) diff --git a/conda/libgpuarray/bld.bat b/conda/libgpuarray/bld.bat index 4ce87d6f3e..0811a25cb8 100644 --- a/conda/libgpuarray/bld.bat +++ b/conda/libgpuarray/bld.bat @@ -7,7 +7,5 @@ cmake -G"NMake Makefiles" ^ -DCMAKE_C_FLAGS="-I%LIBRARY_PREFIX%\include" ^ "%SRC_DIR%" 
if errorlevel 1 exit 1 -cmake --build . --config Release --target ALL_BUILD -if errorlevel 1 exit 1 cmake --build . --config Release --target install if errorlevel 1 exit 1 From 6ed12180488e2afd306ef0b375956b8e4a4244d1 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 1 Sep 2017 13:01:45 -0400 Subject: [PATCH 494/597] Maybe working windows build? --- conda/libgpuarray/bld.bat | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/conda/libgpuarray/bld.bat b/conda/libgpuarray/bld.bat index 0811a25cb8..8989a9c032 100644 --- a/conda/libgpuarray/bld.bat +++ b/conda/libgpuarray/bld.bat @@ -1,11 +1,10 @@ -set - -cmake -G"NMake Makefiles" ^ - -DCMAKE_BUILD_TYPE=Release ^ +cmake -DCMAKE_BUILD_TYPE=Release ^ -DCMAKE_PREFIX_PATH="%LIBRARY_PREFIX%" ^ -DCMAKE_INSTALL_PREFIX="%LIBRARY_PREFIX%" ^ -DCMAKE_C_FLAGS="-I%LIBRARY_PREFIX%\include" ^ "%SRC_DIR%" if errorlevel 1 exit 1 +cmake --build . --config Release --target ALL_BUILD +if errorlevel 1 exit 1 cmake --build . --config Release --target install if errorlevel 1 exit 1 From 1b2aa4cab3ccde2849c2c962cdd7a1def70ea1be Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 1 Sep 2017 13:42:49 -0400 Subject: [PATCH 495/597] Add versioneer for better versions. 
--- .gitattributes | 1 + MANIFEST.in | 2 + pygpu/__init__.py | 6 +- pygpu/_version.py | 520 +++++++++++++ setup.cfg | 7 + setup.py | 20 +- versioneer.py | 1822 +++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 2359 insertions(+), 19 deletions(-) create mode 100644 .gitattributes create mode 100644 MANIFEST.in create mode 100644 pygpu/_version.py create mode 100644 setup.cfg create mode 100644 versioneer.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000..3f619f34d6 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +pygpu/_version.py export-subst diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000000..4681f6b8ab --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include versioneer.py +include pygpu/_version.py diff --git a/pygpu/__init__.py b/pygpu/__init__.py index 566cd8d19b..efdbd7a8bb 100644 --- a/pygpu/__init__.py +++ b/pygpu/__init__.py @@ -12,11 +12,13 @@ def get_include(): concatenate, hstack, vstack, dstack) from ._array import ndgpuarray -from .version import fullversion as __version__ - +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions def test(): from . import tests from .tests import main if hasattr(main, "NoseTester"): main.NoseTester(package=tests).test() + diff --git a/pygpu/_version.py b/pygpu/_version.py new file mode 100644 index 0000000000..434e940fe1 --- /dev/null +++ b/pygpu/_version.py @@ -0,0 +1,520 @@ + +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. 
Generated by +# versioneer-0.18 (https://github.com/warner/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). + git_refnames = "$Format:%d$" + git_full = "$Format:%H$" + git_date = "$Format:%ci$" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "pep440" + cfg.tag_prefix = "v" + cfg.parentdir_prefix = "libgpuarray-" + cfg.versionfile_source = "pygpu/_version.py" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + 
stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. 
The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? 
+ pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. 
+ for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000000..2cc056015a --- /dev/null +++ b/setup.cfg @@ -0,0 +1,7 @@ +[versioneer] +VCS=git +style=pep440 +versionfile_source=pygpu/_version.py +versionfile_build=pygpu/_version.py +tag_prefix=v +parentdir_prefix=libgpuarray- \ No newline at end of file diff --git a/setup.py b/setup.py index 29e9f60e77..911a1540e9 100755 --- a/setup.py +++ b/setup.py @@ -1,14 +1,9 @@ import sys import os +import versioneer have_cython = False -MAJOR = 0 -MINOR = 7 -PATCH = 0 -SUFFIX = '' # include the '.' 
-FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX) - try: import Cython if Cython.__version__ < '0.25': @@ -87,16 +82,6 @@ def __init__(self, *args, **kwargs): raise RuntimeError('default binary dir {} does not exist, you may need to build the C library in release mode'.format(default_bin_dir)) library_dirs += [default_bin_dir] -with open('pygpu/version.py', 'w') as f: - f.write(""" -# File generated by setup.py -major = %d -minor = %d -patch = %d -suffix = "%s" -fullversion = "%s" -""" % (MAJOR, MINOR, PATCH, SUFFIX, FULLVERSION)) - ea = [] if sys.platform in ('darwin', 'linux'): # Silence unused stuff warnings @@ -136,7 +121,8 @@ def __init__(self, *args, **kwargs): )] setup(name='pygpu', - version=FULLVERSION, + version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), description='numpy-like wrapper on libgpuarray for GPU computations', packages=['pygpu', 'pygpu/tests'], include_package_data=True, diff --git a/versioneer.py b/versioneer.py new file mode 100644 index 0000000000..64fea1c892 --- /dev/null +++ b/versioneer.py @@ -0,0 +1,1822 @@ + +# Version: 0.18 + +"""The Versioneer - like a rocketeer, but for versions. + +The Versioneer +============== + +* like a rocketeer, but for versions! +* https://github.com/warner/python-versioneer +* Brian Warner +* License: Public Domain +* Compatible With: python2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, and pypy +* [![Latest Version] +(https://pypip.in/version/versioneer/badge.svg?style=flat) +](https://pypi.python.org/pypi/versioneer/) +* [![Build Status] +(https://travis-ci.org/warner/python-versioneer.png?branch=master) +](https://travis-ci.org/warner/python-versioneer) + +This is a tool for managing a recorded version number in distutils-based +python projects. The goal is to remove the tedious and error-prone "update +the embedded version string" step from your release process. 
Making a new +release should be as easy as recording a new tag in your version-control +system, and maybe making new tarballs. + + +## Quick Install + +* `pip install versioneer` to somewhere in your $PATH +* add a `[versioneer]` section to your setup.cfg (see below) +* run `versioneer install` in your source tree, commit the results + +## Version Identifiers + +Source trees come from a variety of places: + +* a version-control system checkout (mostly used by developers) +* a nightly tarball, produced by build automation +* a snapshot tarball, produced by a web-based VCS browser, like github's + "tarball from tag" feature +* a release tarball, produced by "setup.py sdist", distributed through PyPI + +Within each source tree, the version identifier (either a string or a number, +this tool is format-agnostic) can come from a variety of places: + +* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows + about recent "tags" and an absolute revision-id +* the name of the directory into which the tarball was unpacked +* an expanded VCS keyword ($Id$, etc) +* a `_version.py` created by some earlier build step + +For released software, the version identifier is closely related to a VCS +tag. Some projects use tag names that include more than just the version +string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool +needs to strip the tag prefix to extract the version identifier. For +unreleased software (between tags), the version identifier should provide +enough information to help developers recreate the same tree, while also +giving them an idea of roughly how old the tree is (after version 1.2, before +version 1.3). Many VCS systems can report a description that captures this, +for example `git describe --tags --dirty --always` reports things like +"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the +0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has +uncommitted changes). 
+ +The version identifier is used for multiple purposes: + +* to allow the module to self-identify its version: `myproject.__version__` +* to choose a name and prefix for a 'setup.py sdist' tarball + +## Theory of Operation + +Versioneer works by adding a special `_version.py` file into your source +tree, where your `__init__.py` can import it. This `_version.py` knows how to +dynamically ask the VCS tool for version information at import time. + +`_version.py` also contains `$Revision$` markers, and the installation +process marks `_version.py` to have this marker rewritten with a tag name +during the `git archive` command. As a result, generated tarballs will +contain enough information to get the proper version. + +To allow `setup.py` to compute a version too, a `versioneer.py` is added to +the top level of your source tree, next to `setup.py` and the `setup.cfg` +that configures it. This overrides several distutils/setuptools commands to +compute the version when invoked, and changes `setup.py build` and `setup.py +sdist` to replace `_version.py` with a small static file that contains just +the generated version data. + +## Installation + +See [INSTALL.md](./INSTALL.md) for detailed installation instructions. + +## Version-String Flavors + +Code which uses Versioneer can learn about its version string at runtime by +importing `_version` from your main `__init__.py` file and running the +`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can +import the top-level `versioneer.py` and run `get_versions()`. + +Both functions return a dictionary with different flavors of version +information: + +* `['version']`: A condensed version string, rendered using the selected + style. This is the most commonly used value for the project's version + string. The default "pep440" style yields strings like `0.11`, + `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section + below for alternative styles. 
+ +* `['full-revisionid']`: detailed revision identifier. For Git, this is the + full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". + +* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the + commit date in ISO 8601 format. This will be None if the date is not + available. + +* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that + this is only accurate if run in a VCS checkout, otherwise it is likely to + be False or None + +* `['error']`: if the version string could not be computed, this will be set + to a string describing the problem, otherwise it will be None. It may be + useful to throw an exception in setup.py if this is set, to avoid e.g. + creating tarballs with a version string of "unknown". + +Some variants are more useful than others. Including `full-revisionid` in a +bug report should allow developers to reconstruct the exact code being tested +(or indicate the presence of local changes that should be shared with the +developers). `version` is suitable for display in an "about" box or a CLI +`--version` output: it can be easily compared against release notes and lists +of bugs fixed in various releases. + +The installer adds the following text to your `__init__.py` to place a basic +version in `YOURPROJECT.__version__`: + + from ._version import get_versions + __version__ = get_versions()['version'] + del get_versions + +## Styles + +The setup.cfg `style=` configuration controls how the VCS information is +rendered into a version string. + +The default style, "pep440", produces a PEP440-compliant string, equal to the +un-prefixed tag name for actual releases, and containing an additional "local +version" section with more detail for in-between builds. For Git, this is +TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags +--dirty --always`. 
For example "0.11+2.g1076c97.dirty" indicates that the +tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and +that this commit is two revisions ("+2") beyond the "0.11" tag. For released +software (exactly equal to a known tag), the identifier will only contain the +stripped tag, e.g. "0.11". + +Other styles are available. See [details.md](details.md) in the Versioneer +source tree for descriptions. + +## Debugging + +Versioneer tries to avoid fatal errors: if something goes wrong, it will tend +to return a version of "0+unknown". To investigate the problem, run `setup.py +version`, which will run the version-lookup code in a verbose mode, and will +display the full contents of `get_versions()` (including the `error` string, +which may help identify what went wrong). + +## Known Limitations + +Some situations are known to cause problems for Versioneer. This details the +most significant ones. More can be found on Github +[issues page](https://github.com/warner/python-versioneer/issues). + +### Subprojects + +Versioneer has limited support for source trees in which `setup.py` is not in +the root directory (e.g. `setup.py` and `.git/` are *not* siblings). There are +two common reasons why `setup.py` might not be in the root: + +* Source trees which contain multiple subprojects, such as + [Buildbot](https://github.com/buildbot/buildbot), which contains both + "master" and "slave" subprojects, each with their own `setup.py`, + `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI + distributions (and upload multiple independently-installable tarballs). +* Source trees whose main purpose is to contain a C library, but which also + provide bindings to Python (and perhaps other languages) in subdirectories. + +Versioneer will look for `.git` in parent directories, and most operations +should get the right version string. 
However `pip` and `setuptools` have bugs +and implementation details which frequently cause `pip install .` from a +subproject directory to fail to find a correct version string (so it usually +defaults to `0+unknown`). + +`pip install --editable .` should work correctly. `setup.py install` might +work too. + +Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in +some later version. + +[Bug #38](https://github.com/warner/python-versioneer/issues/38) is tracking +this issue. The discussion in +[PR #61](https://github.com/warner/python-versioneer/pull/61) describes the +issue from the Versioneer side in more detail. +[pip PR#3176](https://github.com/pypa/pip/pull/3176) and +[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve +pip to let Versioneer work correctly. + +Versioneer-0.16 and earlier only looked for a `.git` directory next to the +`setup.cfg`, so subprojects were completely unsupported with those releases. + +### Editable installs with setuptools <= 18.5 + +`setup.py develop` and `pip install --editable .` allow you to install a +project into a virtualenv once, then continue editing the source code (and +test) without re-installing after every change. + +"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a +convenient way to specify executable scripts that should be installed along +with the python package. + +These both work as expected when using modern setuptools. When using +setuptools-18.5 or earlier, however, certain operations will cause +`pkg_resources.DistributionNotFound` errors when running the entrypoint +script, which must be resolved by re-installing the package. This happens +when the install happens with one version, then the egg_info data is +regenerated while a different version is checked out. Many setup.py commands +cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into +a different virtualenv), so this can be surprising. 
+ +[Bug #83](https://github.com/warner/python-versioneer/issues/83) describes +this one, but upgrading to a newer version of setuptools should probably +resolve it. + +### Unicode version strings + +While Versioneer works (and is continually tested) with both Python 2 and +Python 3, it is not entirely consistent with bytes-vs-unicode distinctions. +Newer releases probably generate unicode version strings on py2. It's not +clear that this is wrong, but it may be surprising for applications when they +write these strings to a network connection or include them in bytes-oriented +APIs like cryptographic checksums. + +[Bug #71](https://github.com/warner/python-versioneer/issues/71) investigates +this question. + + +## Updating Versioneer + +To upgrade your project to a new release of Versioneer, do the following: + +* install the new Versioneer (`pip install -U versioneer` or equivalent) +* edit `setup.cfg`, if necessary, to include any new configuration settings + indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. +* re-run `versioneer install` in your source tree, to replace + `SRC/_version.py` +* commit any changed files + +## Future Directions + +This tool is designed to be easily extended to other version-control +systems: all VCS-specific components are in separate directories like +src/git/ . The top-level `versioneer.py` script is assembled from these +components by running make-versioneer.py . In the future, make-versioneer.py +will take a VCS name as an argument, and will construct a version of +`versioneer.py` that is specific to the given VCS. It might also take the +configuration arguments that are currently provided manually during +installation by editing setup.py . Alternatively, it might go the other +direction and include code from all supported VCS systems, reducing the +number of intermediate scripts. + + +## License + +To make Versioneer easier to embed, all its code is dedicated to the public +domain. 
The `_version.py` that it creates is also in the public domain. +Specifically, both are released under the Creative Commons "Public Domain +Dedication" license (CC0-1.0), as described in +https://creativecommons.org/publicdomain/zero/1.0/ . + +""" + +from __future__ import print_function +try: + import configparser +except ImportError: + import ConfigParser as configparser +import errno +import json +import os +import re +import subprocess +import sys + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_root(): + """Get the project root directory. + + We require that all commands are run from the project root, i.e. the + directory that contains setup.py, setup.cfg, and versioneer.py . + """ + root = os.path.realpath(os.path.abspath(os.getcwd())) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + # allow 'python path/to/setup.py COMMAND' + root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + err = ("Versioneer was unable to run the project root directory. " + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND').") + raise VersioneerBadRootError(err) + try: + # Certain runtime workflows (setup.py install/develop in a setuptools + # tree) execute all dependencies in a single python process, so + # "versioneer" may be imported multiple times, and python's shared + # module-import table will cache the first one. So we can't use + # os.path.dirname(__file__), as that will find whichever + # versioneer.py was first imported, even in later projects. 
+ me = os.path.realpath(os.path.abspath(__file__)) + me_dir = os.path.normcase(os.path.splitext(me)[0]) + vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) + if me_dir != vsr_dir: + print("Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py)) + except NameError: + pass + return root + + +def get_config_from_root(root): + """Read the project setup.cfg file to determine Versioneer config.""" + # This might raise EnvironmentError (if setup.cfg is missing), or + # configparser.NoSectionError (if it lacks a [versioneer] section), or + # configparser.NoOptionError (if it lacks "VCS="). See the docstring at + # the top of versioneer.py for instructions on writing your setup.cfg . + setup_cfg = os.path.join(root, "setup.cfg") + parser = configparser.SafeConfigParser() + with open(setup_cfg, "r") as f: + parser.readfp(f) + VCS = parser.get("versioneer", "VCS") # mandatory + + def get(parser, name): + if parser.has_option("versioneer", name): + return parser.get("versioneer", name) + return None + cfg = VersioneerConfig() + cfg.VCS = VCS + cfg.style = get(parser, "style") or "" + cfg.versionfile_source = get(parser, "versionfile_source") + cfg.versionfile_build = get(parser, "versionfile_build") + cfg.tag_prefix = get(parser, "tag_prefix") + if cfg.tag_prefix in ("''", '""'): + cfg.tag_prefix = "" + cfg.parentdir_prefix = get(parser, "parentdir_prefix") + cfg.verbose = get(parser, "verbose") + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +# these dictionaries contain VCS-specific tools +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, 
args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +LONG_VERSION_PY['git'] = ''' +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.18 (https://github.com/warner/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
+ git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" + git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" + git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "%(STYLE)s" + cfg.tag_prefix = "%(TAG_PREFIX)s" + cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" + cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %%s" %% dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %%s" %% (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = 
stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %%s (error)" %% dispcmd) + print("stdout was %%s" %% stdout) + return None, p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %%s but none started with prefix %%s" %% + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. 
The old git %%d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%%s', no digits" %% ",".join(refs - tags)) + if verbose: + print("likely tags: %%s" %% ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %%s" %% r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %%s not under git control" %% root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%%s*" %% tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? 
+ pieces["error"] = ("unable to parse git-describe output: '%%s'" + %% describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%%s' doesn't start with prefix '%%s'" + print(fmt %% (full_tag, tag_prefix)) + pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" + %% (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%%d" %% pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%%d" %% pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%%s" %% pieces["short"] + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%%s" %% pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%%s'" %% style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. 
+ for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} +''' + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. + keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. 
However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. 
"2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. + """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse 
describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def do_vcs_install(manifest_in, versionfile_source, ipy): + """Git-specific installation logic for Versioneer. + + For Git, this means creating/changing .gitattributes to mark _version.py + for export-subst keyword substitution. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + files = [manifest_in, versionfile_source] + if ipy: + files.append(ipy) + try: + me = __file__ + if me.endswith(".pyc") or me.endswith(".pyo"): + me = os.path.splitext(me)[0] + ".py" + versioneer_file = os.path.relpath(me) + except NameError: + versioneer_file = "versioneer.py" + files.append(versioneer_file) + present = False + try: + f = open(".gitattributes", "r") + for line in f.readlines(): + if line.strip().startswith(versionfile_source): + if "export-subst" in line.strip().split()[1:]: + present = True + f.close() + except EnvironmentError: + pass + if not present: + f = open(".gitattributes", "a+") + f.write("%s export-subst\n" % versionfile_source) + f.close() + files.append(".gitattributes") + run_command(GITS, ["add", "--"] + files) + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +SHORT_VERSION_PY = """ +# This file was generated by 'versioneer.py' (0.18) from +# revision-control system data, or from the parent directory name of an +# unpacked source archive. Distribution tarballs contain a pre-generated copy +# of this file. 
+ +import json + +version_json = ''' +%s +''' # END VERSION_JSON + + +def get_versions(): + return json.loads(version_json) +""" + + +def versions_from_file(filename): + """Try to determine the version from _version.py if present.""" + try: + with open(filename) as f: + contents = f.read() + except EnvironmentError: + raise NotThisMethod("unable to read _version.py") + mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + raise NotThisMethod("no version_json in _version.py") + return json.loads(mo.group(1)) + + +def write_to_version_file(filename, versions): + """Write the given version number to the given _version.py file.""" + os.unlink(filename) + contents = json.dumps(versions, sort_keys=True, + indent=1, separators=(",", ": ")) + with open(filename, "w") as f: + f.write(SHORT_VERSION_PY % contents) + + print("set %s to '%s'" % (filename, versions["version"])) + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +class VersioneerBadRootError(Exception): + """The project root directory is unknown or missing key files.""" + + +def get_versions(verbose=False): + """Get the project version from whatever source is available. + + Returns dict with two keys: 'version' and 'full'. 
+ """ + if "versioneer" in sys.modules: + # see the discussion in cmdclass.py:get_cmdclass() + del sys.modules["versioneer"] + + root = get_root() + cfg = get_config_from_root(root) + + assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" + handlers = HANDLERS.get(cfg.VCS) + assert handlers, "unrecognized VCS '%s'" % cfg.VCS + verbose = verbose or cfg.verbose + assert cfg.versionfile_source is not None, \ + "please set versioneer.versionfile_source" + assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" + + versionfile_abs = os.path.join(root, cfg.versionfile_source) + + # extract version from first of: _version.py, VCS command (e.g. 'git + # describe'), parentdir. This is meant to work for developers using a + # source checkout, for users of a tarball created by 'setup.py sdist', + # and for users of a tarball/zipball created by 'git archive' or github's + # download-from-tag feature or the equivalent in other VCSes. + + get_keywords_f = handlers.get("get_keywords") + from_keywords_f = handlers.get("keywords") + if get_keywords_f and from_keywords_f: + try: + keywords = get_keywords_f(versionfile_abs) + ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) + if verbose: + print("got version from expanded keyword %s" % ver) + return ver + except NotThisMethod: + pass + + try: + ver = versions_from_file(versionfile_abs) + if verbose: + print("got version from file %s %s" % (versionfile_abs, ver)) + return ver + except NotThisMethod: + pass + + from_vcs_f = handlers.get("pieces_from_vcs") + if from_vcs_f: + try: + pieces = from_vcs_f(cfg.tag_prefix, root, verbose) + ver = render(pieces, cfg.style) + if verbose: + print("got version from VCS %s" % ver) + return ver + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + if verbose: + print("got version from parentdir %s" % ver) + return ver + except NotThisMethod: + pass + + if verbose: + 
print("unable to compute version") + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, "error": "unable to compute version", + "date": None} + + +def get_version(): + """Get the short version string for this project.""" + return get_versions()["version"] + + +def get_cmdclass(): + """Get the custom setuptools/distutils subclasses used by Versioneer.""" + if "versioneer" in sys.modules: + del sys.modules["versioneer"] + # this fixes the "python setup.py develop" case (also 'install' and + # 'easy_install .'), in which subdependencies of the main project are + # built (using setup.py bdist_egg) in the same python process. Assume + # a main project A and a dependency B, which use different versions + # of Versioneer. A's setup.py imports A's Versioneer, leaving it in + # sys.modules by the time B's setup.py is executed, causing B to run + # with the wrong versioneer. Setuptools wraps the sub-dep builds in a + # sandbox that restores sys.modules to it's pre-build state, so the + # parent is protected against the child's "import versioneer". By + # removing ourselves from sys.modules here, before the child build + # happens, we protect the child from the parent's versioneer too. 
+ # Also see https://github.com/warner/python-versioneer/issues/52 + + cmds = {} + + # we add "version" to both distutils and setuptools + from distutils.core import Command + + class cmd_version(Command): + description = "report generated version string" + user_options = [] + boolean_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + vers = get_versions(verbose=True) + print("Version: %s" % vers["version"]) + print(" full-revisionid: %s" % vers.get("full-revisionid")) + print(" dirty: %s" % vers.get("dirty")) + print(" date: %s" % vers.get("date")) + if vers["error"]: + print(" error: %s" % vers["error"]) + cmds["version"] = cmd_version + + # we override "build_py" in both distutils and setuptools + # + # most invocation pathways end up running build_py: + # distutils/build -> build_py + # distutils/install -> distutils/build ->.. + # setuptools/bdist_wheel -> distutils/install ->.. + # setuptools/bdist_egg -> distutils/install_lib -> build_py + # setuptools/install -> bdist_egg ->.. + # setuptools/develop -> ? + # pip install: + # copies source tree to a tempdir before running egg_info/etc + # if .git isn't copied too, 'git describe' will fail + # then does setup.py bdist_wheel, or sometimes setup.py install + # setup.py egg_info -> ? 
+ + # we override different "build_py" commands for both environments + if "setuptools" in sys.modules: + from setuptools.command.build_py import build_py as _build_py + else: + from distutils.command.build_py import build_py as _build_py + + class cmd_build_py(_build_py): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + _build_py.run(self) + # now locate _version.py in the new build/ directory and replace + # it with an updated value + if cfg.versionfile_build: + target_versionfile = os.path.join(self.build_lib, + cfg.versionfile_build) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + cmds["build_py"] = cmd_build_py + + if "cx_Freeze" in sys.modules: # cx_freeze enabled? + from cx_Freeze.dist import build_exe as _build_exe + # nczeczulin reports that py2exe won't like the pep440-style string + # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. + # setup(console=[{ + # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION + # "product_version": versioneer.get_version(), + # ... + + class cmd_build_exe(_build_exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _build_exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + cmds["build_exe"] = cmd_build_exe + del cmds["build_py"] + + if 'py2exe' in sys.modules: # py2exe enabled? 
+ try: + from py2exe.distutils_buildexe import py2exe as _py2exe # py3 + except ImportError: + from py2exe.build_exe import py2exe as _py2exe # py2 + + class cmd_py2exe(_py2exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _py2exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + cmds["py2exe"] = cmd_py2exe + + # we override different "sdist" commands for both environments + if "setuptools" in sys.modules: + from setuptools.command.sdist import sdist as _sdist + else: + from distutils.command.sdist import sdist as _sdist + + class cmd_sdist(_sdist): + def run(self): + versions = get_versions() + self._versioneer_generated_versions = versions + # unless we update this, the command will keep using the old + # version + self.distribution.metadata.version = versions["version"] + return _sdist.run(self) + + def make_release_tree(self, base_dir, files): + root = get_root() + cfg = get_config_from_root(root) + _sdist.make_release_tree(self, base_dir, files) + # now locate _version.py in the new base_dir directory + # (remembering that it may be a hardlink) and replace it with an + # updated value + target_versionfile = os.path.join(base_dir, cfg.versionfile_source) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, + self._versioneer_generated_versions) + cmds["sdist"] = cmd_sdist + + return cmds + + +CONFIG_ERROR = """ +setup.cfg is missing the necessary Versioneer configuration. 
You need +a section like: + + [versioneer] + VCS = git + style = pep440 + versionfile_source = src/myproject/_version.py + versionfile_build = myproject/_version.py + tag_prefix = + parentdir_prefix = myproject- + +You will also need to edit your setup.py to use the results: + + import versioneer + setup(version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), ...) + +Please read the docstring in ./versioneer.py for configuration instructions, +edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. +""" + +SAMPLE_CONFIG = """ +# See the docstring in versioneer.py for instructions. Note that you must +# re-run 'versioneer.py setup' after changing this section, and commit the +# resulting files. + +[versioneer] +#VCS = git +#style = pep440 +#versionfile_source = +#versionfile_build = +#tag_prefix = +#parentdir_prefix = + +""" + +INIT_PY_SNIPPET = """ +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions +""" + + +def do_setup(): + """Main VCS-independent setup function for installing Versioneer.""" + root = get_root() + try: + cfg = get_config_from_root(root) + except (EnvironmentError, configparser.NoSectionError, + configparser.NoOptionError) as e: + if isinstance(e, (EnvironmentError, configparser.NoSectionError)): + print("Adding sample versioneer config to setup.cfg", + file=sys.stderr) + with open(os.path.join(root, "setup.cfg"), "a") as f: + f.write(SAMPLE_CONFIG) + print(CONFIG_ERROR, file=sys.stderr) + return 1 + + print(" creating %s" % cfg.versionfile_source) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), + "__init__.py") + if os.path.exists(ipy): + try: + with open(ipy, "r") as f: + old = f.read() + 
except EnvironmentError: + old = "" + if INIT_PY_SNIPPET not in old: + print(" appending to %s" % ipy) + with open(ipy, "a") as f: + f.write(INIT_PY_SNIPPET) + else: + print(" %s unmodified" % ipy) + else: + print(" %s doesn't exist, ok" % ipy) + ipy = None + + # Make sure both the top-level "versioneer.py" and versionfile_source + # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so + # they'll be copied into source distributions. Pip won't be able to + # install the package without this. + manifest_in = os.path.join(root, "MANIFEST.in") + simple_includes = set() + try: + with open(manifest_in, "r") as f: + for line in f: + if line.startswith("include "): + for include in line.split()[1:]: + simple_includes.add(include) + except EnvironmentError: + pass + # That doesn't cover everything MANIFEST.in can do + # (http://docs.python.org/2/distutils/sourcedist.html#commands), so + # it might give some false negatives. Appending redundant 'include' + # lines is safe, though. + if "versioneer.py" not in simple_includes: + print(" appending 'versioneer.py' to MANIFEST.in") + with open(manifest_in, "a") as f: + f.write("include versioneer.py\n") + else: + print(" 'versioneer.py' already in MANIFEST.in") + if cfg.versionfile_source not in simple_includes: + print(" appending versionfile_source ('%s') to MANIFEST.in" % + cfg.versionfile_source) + with open(manifest_in, "a") as f: + f.write("include %s\n" % cfg.versionfile_source) + else: + print(" versionfile_source already in MANIFEST.in") + + # Make VCS-specific changes. For git, this means creating/changing + # .gitattributes to mark _version.py for export-subst keyword + # substitution. 
+ do_vcs_install(manifest_in, cfg.versionfile_source, ipy) + return 0 + + +def scan_setup_py(): + """Validate the contents of setup.py against Versioneer's expectations.""" + found = set() + setters = False + errors = 0 + with open("setup.py", "r") as f: + for line in f.readlines(): + if "import versioneer" in line: + found.add("import") + if "versioneer.get_cmdclass()" in line: + found.add("cmdclass") + if "versioneer.get_version()" in line: + found.add("get_version") + if "versioneer.VCS" in line: + setters = True + if "versioneer.versionfile_source" in line: + setters = True + if len(found) != 3: + print("") + print("Your setup.py appears to be missing some important items") + print("(but I might be wrong). Please make sure it has something") + print("roughly like the following:") + print("") + print(" import versioneer") + print(" setup( version=versioneer.get_version(),") + print(" cmdclass=versioneer.get_cmdclass(), ...)") + print("") + errors += 1 + if setters: + print("You should remove lines like 'versioneer.VCS = ' and") + print("'versioneer.versionfile_source = ' . This configuration") + print("now lives in setup.cfg, and should be removed from setup.py") + print("") + errors += 1 + return errors + + +if __name__ == "__main__": + cmd = sys.argv[1] + if cmd == "setup": + errors = do_setup() + errors += scan_setup_py() + if errors: + sys.exit(1) From ca7d1ba62668f14de98a8952123e7d45108fe4ea Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 1 Sep 2017 14:07:00 -0400 Subject: [PATCH 496/597] Use versioneer to reduce the version updating madness when doing a release. 
--- .appveyor.yml | 2 +- conda/libgpuarray/conda_build_config.yaml | 4 ---- conda/libgpuarray/meta.yaml | 6 +----- conda/pygpu/conda_build_config.yaml | 8 -------- conda/pygpu/meta.yaml | 13 ++++++------- doc/conf.py | 7 ++++--- release.txt | 4 ---- 7 files changed, 12 insertions(+), 32 deletions(-) delete mode 100644 conda/libgpuarray/conda_build_config.yaml delete mode 100644 conda/pygpu/conda_build_config.yaml diff --git a/.appveyor.yml b/.appveyor.yml index 6e263169da..6d2b0e0b3a 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,4 +1,4 @@ -version: '0.7.0.{build}' +version: '1.0.{build}' # This number doesn't matter pull_requests: do_not_increment_build_number: true diff --git a/conda/libgpuarray/conda_build_config.yaml b/conda/libgpuarray/conda_build_config.yaml deleted file mode 100644 index 21ce93b651..0000000000 --- a/conda/libgpuarray/conda_build_config.yaml +++ /dev/null @@ -1,4 +0,0 @@ -python: - - 2.7 - - 3.5 - - 3.6 diff --git a/conda/libgpuarray/meta.yaml b/conda/libgpuarray/meta.yaml index 5763af8835..f5069561dc 100644 --- a/conda/libgpuarray/meta.yaml +++ b/conda/libgpuarray/meta.yaml @@ -1,16 +1,12 @@ -{% set version = "0.7.0" %} - package: name: libgpuarray - version: {{ version }} + version: {{ environ.get('GIT_DESCRIBE_TAG')[1:] }} source: path: ../../ build: number: 0 - skip: true # [win and py35] - skip: true # [not win and not py27] features: - vc9 # [win and py27] - vc10 # [win and py34] diff --git a/conda/pygpu/conda_build_config.yaml b/conda/pygpu/conda_build_config.yaml deleted file mode 100644 index cbc7b371b6..0000000000 --- a/conda/pygpu/conda_build_config.yaml +++ /dev/null @@ -1,8 +0,0 @@ -python: - - 2.7 - - 3.5 - - 3.6 -numpy: - - 1.11 - - 1.12 - - 1.13 \ No newline at end of file diff --git a/conda/pygpu/meta.yaml b/conda/pygpu/meta.yaml index bfb07adfca..0c20677e02 100644 --- a/conda/pygpu/meta.yaml +++ b/conda/pygpu/meta.yaml @@ -1,12 +1,11 @@ -{% set version = "0.7.0" %} +{% set version = environ.get('GIT_DESCRIBE_TAG')[1:] 
%} package: name: pygpu version: {{ version }} source: - fn: libgpuarray-{{ version }}.tar.gz - url: https://github.com/Theano/libgpuarray/archive/v{{ version }}.tar.gz + path: ../../ build: number: 0 @@ -17,16 +16,16 @@ requirements: - m2-git [win] - m2-filesystem [win] - git [not win] - - python {{ python }} + - python - cython >=0.25 - - numpy {{ numpy }} + - numpy 1.11 - mako - setuptools - libgpuarray =={{ version }} run: - - python {{ python }} - - {{ pin_compatible('numpy') }} + - python + - {{ pin_compatible('numpy', '1.11') }} - mako - six - libgpuarray =={{ version }} diff --git a/doc/conf.py b/doc/conf.py index 7768f2ed3b..cbb0a7c135 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -17,6 +17,7 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. sys.path.insert(0, os.path.abspath('..')) +import versioneer # -- General configuration ----------------------------------------------------- @@ -56,10 +57,10 @@ # |version| and |release|, also used in various other places throughout the # built documents. # -# The short X.Y version. -version = '0.7' # The full version, including alpha/beta/rc tags. -release = '0.7.0' +release = versioneer.get_version() +# The short X.Y version. +version = '.'.join(release.split('.')[:2]) # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/release.txt b/release.txt index 1d5922fe14..913e40151f 100644 --- a/release.txt +++ b/release.txt @@ -1,9 +1,5 @@ Release process: - Make sure you are on the proper release branch -- Update the version in setup.py -- Update the version in doc/conf.py -- Update the version in .appveyor.yml -- Update the version in conda/{libgpuarray,pygpu}/meta.yaml - Commit the changes with message "Changes for release X.Y.Z" git commit -m "Changes for release X.Y.Z" - Make a git tag From 97fac8483d7456aea98e554d447c2b25dbc684a5 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 1 Sep 2017 14:07:52 -0400 Subject: [PATCH 497/597] Go back to build matrix for appveyor. --- .appveyor.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.appveyor.yml b/.appveyor.yml index 6d2b0e0b3a..984103da75 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -6,6 +6,8 @@ pull_requests: platform: - x64 +image: Visual Studio 2015 + clone_folder: C:\projects\libgpuarray configuration: @@ -14,6 +16,8 @@ configuration: environment: matrix: - CONDA_LOC: "C:\\Miniconda-x64" + - CONDA_LOC: "C:\\Miniconda35-x64" + - CONDA_LOC: "C:\\Miniconda36-x64" install: # This breaks conda-build because of git From c815a9ad262ea847140175953741cfd90b631538 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 1 Sep 2017 14:19:13 -0400 Subject: [PATCH 498/597] Set the version for conda packages automatically. 
--- .appveyor.yml | 3 ++- conda/libgpuarray/meta.yaml | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 984103da75..ddacffd0f1 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -29,6 +29,7 @@ install: build: script build_script: - - conda build conda + - cmd: for /f "tokens=*" %%i in ('python -c "import versioneer; print(versioneer.get_version())"') do set GPUARRAY_VERSION=%%i + - cmd: conda build conda #deploy_script: diff --git a/conda/libgpuarray/meta.yaml b/conda/libgpuarray/meta.yaml index f5069561dc..9c651b8277 100644 --- a/conda/libgpuarray/meta.yaml +++ b/conda/libgpuarray/meta.yaml @@ -1,6 +1,6 @@ package: name: libgpuarray - version: {{ environ.get('GIT_DESCRIBE_TAG')[1:] }} + version: {{ environ.get('GPUARRAY_VERSION') }} source: path: ../../ From 3e7f386b3c5e6ee20a42f25aa9db98d852afa4a8 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 1 Sep 2017 14:32:55 -0400 Subject: [PATCH 499/597] Beginings of upload stuff + fix for build errors. --- .appveyor.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index ddacffd0f1..0604a96ef8 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -26,10 +26,14 @@ install: - cmd: set PYTHONUNBUFFERED=1 - cmd: conda install -n root --yes conda conda-env conda-build anaconda-client -build: script +build: off -build_script: +test_script: - cmd: for /f "tokens=*" %%i in ('python -c "import versioneer; print(versioneer.get_version())"') do set GPUARRAY_VERSION=%%i - cmd: conda build conda -#deploy_script: +deploy_script: + - cmd: | + git describe --exact-match HEAD + if errorlevel 1 exit 0 + echo "anaconda upload" From 438aeadf227be96d99e83fbefc5ff223882abd25 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 1 Sep 2017 14:35:06 -0400 Subject: [PATCH 500/597] Fix parse problem. 
--- .appveyor.yml | 5 +---- .ci-support/upload.bat | 3 +++ 2 files changed, 4 insertions(+), 4 deletions(-) create mode 100644 .ci-support/upload.bat diff --git a/.appveyor.yml b/.appveyor.yml index 0604a96ef8..68631cd88f 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -33,7 +33,4 @@ test_script: - cmd: conda build conda deploy_script: - - cmd: | - git describe --exact-match HEAD - if errorlevel 1 exit 0 - echo "anaconda upload" + - cmd: .ci-support/upload.bat diff --git a/.ci-support/upload.bat b/.ci-support/upload.bat new file mode 100644 index 0000000000..a7197182da --- /dev/null +++ b/.ci-support/upload.bat @@ -0,0 +1,3 @@ +git describe --exact-match HEAD +if errorlevel 1 exit 0 +echo "anaconda upload" \ No newline at end of file From cf3361b36da0532afad714c7e26e184642468c5e Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 1 Sep 2017 14:41:58 -0400 Subject: [PATCH 501/597] Set the MSVC paths manually since conda doesn't look for "VC++ for python 2008" --- .appveyor.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.appveyor.yml b/.appveyor.yml index 68631cd88f..7dcee47a45 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -16,8 +16,9 @@ configuration: environment: matrix: - CONDA_LOC: "C:\\Miniconda-x64" + VS_PATH: "C:\\Users\\appveyor\\AppData\\Local\\Programs\\Common\\Microsoft\\Visual C++ for Python\\9.0" - CONDA_LOC: "C:\\Miniconda35-x64" - - CONDA_LOC: "C:\\Miniconda36-x64" + VS_PATH: "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC" install: # This breaks conda-build because of git @@ -30,6 +31,7 @@ build: off test_script: - cmd: for /f "tokens=*" %%i in ('python -c "import versioneer; print(versioneer.get_version())"') do set GPUARRAY_VERSION=%%i + - cmd: "%VS_PATH%"\vcvarsall x64 - cmd: conda build conda deploy_script: From b909a816b6837a2c8b4231e58a54bc05e5585f9c Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 1 Sep 2017 14:43:21 -0400 Subject: [PATCH 502/597] Fix pygpu version. 
--- conda/pygpu/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/pygpu/meta.yaml b/conda/pygpu/meta.yaml index 0c20677e02..2876889eab 100644 --- a/conda/pygpu/meta.yaml +++ b/conda/pygpu/meta.yaml @@ -1,4 +1,4 @@ -{% set version = environ.get('GIT_DESCRIBE_TAG')[1:] %} +{% set version = environ.get('GPUARRAY_VERSION') %} package: name: pygpu From 0f363e12755bdae77114ed9a1f7608462fe8654c Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 1 Sep 2017 14:46:33 -0400 Subject: [PATCH 503/597] Fix syntax? --- .appveyor.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.appveyor.yml b/.appveyor.yml index 7dcee47a45..70f6bd549b 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -31,7 +31,9 @@ build: off test_script: - cmd: for /f "tokens=*" %%i in ('python -c "import versioneer; print(versioneer.get_version())"') do set GPUARRAY_VERSION=%%i - - cmd: "%VS_PATH%"\vcvarsall x64 + - cd %VS_PATH% + - vcvarsall x64 + - cd C:\projects\libgpuarray - cmd: conda build conda deploy_script: From 61785cb381c5888bad383c8cf99daaf0f9ad1d6e Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 1 Sep 2017 14:55:19 -0400 Subject: [PATCH 504/597] Fix cmake not picking up the right compiler. --- conda/libgpuarray/bld.bat | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conda/libgpuarray/bld.bat b/conda/libgpuarray/bld.bat index 8989a9c032..65b9509101 100644 --- a/conda/libgpuarray/bld.bat +++ b/conda/libgpuarray/bld.bat @@ -1,4 +1,5 @@ -cmake -DCMAKE_BUILD_TYPE=Release ^ +cmake -G "%CMAKE_GENERATOR%" ^ + -DCMAKE_BUILD_TYPE=Release ^ -DCMAKE_PREFIX_PATH="%LIBRARY_PREFIX%" ^ -DCMAKE_INSTALL_PREFIX="%LIBRARY_PREFIX%" ^ -DCMAKE_C_FLAGS="-I%LIBRARY_PREFIX%\include" ^ From 94461172b8af48a8937705cf5165533216128e89 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 1 Sep 2017 15:15:00 -0400 Subject: [PATCH 505/597] Fix brokenness with vs2008 through a conda-forge trick.
--- .appveyor.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 70f6bd549b..997cc966e7 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -16,9 +16,7 @@ configuration: environment: matrix: - CONDA_LOC: "C:\\Miniconda-x64" - VS_PATH: "C:\\Users\\appveyor\\AppData\\Local\\Programs\\Common\\Microsoft\\Visual C++ for Python\\9.0" - CONDA_LOC: "C:\\Miniconda35-x64" - VS_PATH: "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC" install: # This breaks conda-build because of git @@ -26,14 +24,15 @@ install: - cmd: call %CONDA_LOC%\Scripts\activate.bat - cmd: set PYTHONUNBUFFERED=1 - cmd: conda install -n root --yes conda conda-env conda-build anaconda-client + # We borrow a trick from conda-forge to fix the VS2008 compiler + - cmd: conda config --append channels conda-forge + - cmd: conda install --yes vs2008_express_vc_python_patch + - cmd: call setup_x64 build: off test_script: - cmd: for /f "tokens=*" %%i in ('python -c "import versioneer; print(versioneer.get_version())"') do set GPUARRAY_VERSION=%%i - - cd %VS_PATH% - - vcvarsall x64 - - cd C:\projects\libgpuarray - cmd: conda build conda deploy_script: From 728ea1db11535cb82259f744138ecb9768e56a65 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 1 Sep 2017 15:25:17 -0400 Subject: [PATCH 506/597] Remove unnecessary stuff.
--- conda/libgpuarray/bld.bat | 1 - conda/libgpuarray/meta.yaml | 1 - 2 files changed, 2 deletions(-) diff --git a/conda/libgpuarray/bld.bat b/conda/libgpuarray/bld.bat index 65b9509101..e4c7f38b35 100644 --- a/conda/libgpuarray/bld.bat +++ b/conda/libgpuarray/bld.bat @@ -1,5 +1,4 @@ cmake -G "%CMAKE_GENERATOR%" ^ - -DCMAKE_BUILD_TYPE=Release ^ -DCMAKE_PREFIX_PATH="%LIBRARY_PREFIX%" ^ -DCMAKE_INSTALL_PREFIX="%LIBRARY_PREFIX%" ^ -DCMAKE_C_FLAGS="-I%LIBRARY_PREFIX%\include" ^ diff --git a/conda/libgpuarray/meta.yaml b/conda/libgpuarray/meta.yaml index 9c651b8277..bf18c04109 100644 --- a/conda/libgpuarray/meta.yaml +++ b/conda/libgpuarray/meta.yaml @@ -15,7 +15,6 @@ build: requirements: build: - m2-git [win] - - m2-filesystem [win] - git [not win] - cmake - mako From 25d623f43e5067b0894b9a1c4884fedf416ea4dc Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 1 Sep 2017 15:59:43 -0400 Subject: [PATCH 507/597] Try to save the built packages. --- .appveyor.yml | 12 ++++++++++-- .ci-support/upload.bat | 3 --- 2 files changed, 10 insertions(+), 5 deletions(-) delete mode 100644 .ci-support/upload.bat diff --git a/.appveyor.yml b/.appveyor.yml index 997cc966e7..a6489f589b 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -34,6 +34,14 @@ build: off test_script: - cmd: for /f "tokens=*" %%i in ('python -c "import versioneer; print(versioneer.get_version())"') do set GPUARRAY_VERSION=%%i - cmd: conda build conda + - cmd: mkdir pkgs + - cmd: xcopy "%CONDA_LOC%"\conda-bld\win-64\pygpu* pkgs\ /Y + - cmd: xcopy "%CONDA_LOC%"\conda-bld\win-64\libgpuarray* pkgs\ /Y -deploy_script: - - cmd: .ci-support/upload.bat +artifacts: + - path: pkgs/* + name: "Conda Packages" + +#deploy: +# on: +# appveyor_repo_tag: true diff --git a/.ci-support/upload.bat b/.ci-support/upload.bat deleted file mode 100644 index a7197182da..0000000000 --- a/.ci-support/upload.bat +++ /dev/null @@ -1,3 +0,0 @@ -git describe --exact-match HEAD -if errorlevel 1 exit 0 -echo "anaconda upload" \ No newline at 
end of file From 78c7b1845527385475256fc10f4649337020abd1 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Tue, 5 Sep 2017 12:46:24 -0400 Subject: [PATCH 508/597] Re-add a `version` module to pygpu. --- .gitignore | 2 +- pygpu/version.py | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 pygpu/version.py diff --git a/.gitignore b/.gitignore index 72c02cdaef..8f181a3726 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ build Debug Release lib +.idea .*.sw[po] *~ *.pyc @@ -22,7 +23,6 @@ doc/_build doc/_doxybuild pygpu/*.c pygpu/*.h -pygpu/version.py src/gpuarray/abi_version.h src/private_config.h Makefile.conf diff --git a/pygpu/version.py b/pygpu/version.py new file mode 100644 index 0000000000..7312d89a09 --- /dev/null +++ b/pygpu/version.py @@ -0,0 +1,36 @@ +# Runtime code to infer version infos +# from ``_version`` module generated by versioneer. +from __future__ import absolute_import, print_function, division + +from ._version import get_versions + +major = None +minor = None +patch = None +suffix = None +fullversion = None + + +def _extract_version(): + global major, minor, patch, suffix, fullversion + version_info = get_versions() + fullversion = version_info['version'] + version_error = version_info['error'] + if version_error is not None: + raise ImportError('Unable to get pygpu version') + version_pieces = fullversion.split('.', 2) + if len(version_pieces) != 3: + raise ImportError('Unable to parse pygpu version string') + version_patch_pieces = version_pieces[2].split('+', 1) + if len(version_patch_pieces) not in (1, 2): + raise ImportError('Unable to parse pygpu version patch') + major = int(version_pieces[0]) + minor = int(version_pieces[1]) + patch = int(version_patch_pieces[0]) + suffix = '' + if len(version_patch_pieces) == 2: + suffix = '+' + version_patch_pieces[1] + +_extract_version() +del get_versions +del _extract_version From a599aa34c285719fd06b3dc7541bf23344d613f1 Mon Sep 17 
00:00:00 2001 From: notoraptor Date: Tue, 5 Sep 2017 16:36:01 -0400 Subject: [PATCH 509/597] Dummy commit to test appveyor --- .appveyor.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.appveyor.yml b/.appveyor.yml index a6489f589b..1428368057 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -33,6 +33,7 @@ build: off test_script: - cmd: for /f "tokens=*" %%i in ('python -c "import versioneer; print(versioneer.get_version())"') do set GPUARRAY_VERSION=%%i + - cmd: echo %GPUARRAY_VERSION% - cmd: conda build conda - cmd: mkdir pkgs - cmd: xcopy "%CONDA_LOC%"\conda-bld\win-64\pygpu* pkgs\ /Y From 30aedbd7fb36b2535a7bdc215e61744f60378be5 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Tue, 5 Sep 2017 16:36:01 -0400 Subject: [PATCH 510/597] Apply workaround to make versioneer work on Appveyor --- .gitignore | 1 + pygpu/version.py | 36 ------------------------------------ versioneer.py | 3 +-- 3 files changed, 2 insertions(+), 38 deletions(-) delete mode 100644 pygpu/version.py diff --git a/.gitignore b/.gitignore index 8f181a3726..143674ab00 100644 --- a/.gitignore +++ b/.gitignore @@ -23,6 +23,7 @@ doc/_build doc/_doxybuild pygpu/*.c pygpu/*.h +pygpu/version.py src/gpuarray/abi_version.h src/private_config.h Makefile.conf diff --git a/pygpu/version.py b/pygpu/version.py deleted file mode 100644 index 7312d89a09..0000000000 --- a/pygpu/version.py +++ /dev/null @@ -1,36 +0,0 @@ -# Runtime code to infer version infos -# from ``_version`` module generated by versioneer. 
-from __future__ import absolute_import, print_function, division - -from ._version import get_versions - -major = None -minor = None -patch = None -suffix = None -fullversion = None - - -def _extract_version(): - global major, minor, patch, suffix, fullversion - version_info = get_versions() - fullversion = version_info['version'] - version_error = version_info['error'] - if version_error is not None: - raise ImportError('Unable to get pygpu version') - version_pieces = fullversion.split('.', 2) - if len(version_pieces) != 3: - raise ImportError('Unable to parse pygpu version string') - version_patch_pieces = version_pieces[2].split('+', 1) - if len(version_patch_pieces) not in (1, 2): - raise ImportError('Unable to parse pygpu version patch') - major = int(version_pieces[0]) - minor = int(version_pieces[1]) - patch = int(version_patch_pieces[0]) - suffix = '' - if len(version_patch_pieces) == 2: - suffix = '+' + version_patch_pieces[1] - -_extract_version() -del get_versions -del _extract_version diff --git a/versioneer.py b/versioneer.py index 64fea1c892..38b0ba6c74 100644 --- a/versioneer.py +++ b/versioneer.py @@ -1047,8 +1047,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%s*" % tag_prefix], + "--always", "--long"], cwd=root) # --long was added in git-1.5.5 if describe_out is None: From 1097dc24bf85148f7016f8fea68d43c09144cc42 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Wed, 6 Sep 2017 13:16:48 -0400 Subject: [PATCH 511/597] Restrict workaround to Windows only. 
--- versioneer.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/versioneer.py b/versioneer.py index 38b0ba6c74..b303a70909 100644 --- a/versioneer.py +++ b/versioneer.py @@ -1046,9 +1046,15 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long"], - cwd=root) + if sys.platform == 'win32': + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long"], + cwd=root) + else: + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") From 9780df98e113fb4d462dfddd0207d840b4163ad4 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Fri, 8 Sep 2017 16:42:44 -0400 Subject: [PATCH 512/597] Try to fix theano error related to sger. 
--- src/gpuarray_blas_cuda_cublas.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 49a543d8eb..96e8830a80 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -1636,9 +1636,9 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha, for (i = 0; i < batchCount; i++) { - GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_ALL)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(x[i], CUDA_WAIT_READ)); - GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(y[i], CUDA_WAIT_ALL)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(y[i], CUDA_WAIT_READ)); } cuda_exit(ctx); From ca8613bafef27912d0e3e365a58161a72c5a04d7 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Fri, 8 Sep 2017 16:59:13 -0400 Subject: [PATCH 513/597] Apply fix for dgerBatch --- src/gpuarray_blas_cuda_cublas.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 96e8830a80..69d68fd71b 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -1766,9 +1766,9 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, } for (i = 0; i < batchCount; i++) { - GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_ALL)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(x[i], CUDA_WAIT_READ)); - GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(y[i], CUDA_WAIT_ALL)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(y[i], CUDA_WAIT_READ)); } cuda_exit(ctx); From 5b177ce10ea47ebc429ea25ba4d8f2dced8a0aa6 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Mon, 11 Sep 2017 12:37:30 -0400 Subject: [PATCH 514/597] Test another possible fix for versionning --- conda/pygpu/meta.yaml | 3 --- versioneer.py | 13 ++++--------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git 
a/conda/pygpu/meta.yaml b/conda/pygpu/meta.yaml index 2876889eab..716783b22d 100644 --- a/conda/pygpu/meta.yaml +++ b/conda/pygpu/meta.yaml @@ -13,9 +13,6 @@ build: requirements: build: - - m2-git [win] - - m2-filesystem [win] - - git [not win] - python - cython >=0.25 - numpy 1.11 diff --git a/versioneer.py b/versioneer.py index b303a70909..64fea1c892 100644 --- a/versioneer.py +++ b/versioneer.py @@ -1046,15 +1046,10 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - if sys.platform == 'win32': - describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long"], - cwd=root) - else: - describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%s*" % tag_prefix], - cwd=root) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") From d98f261304ba71a810c48076af54128efc27b187 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 11 Sep 2017 12:45:02 -0400 Subject: [PATCH 515/597] Fix parameter documentation. --- src/gpuarray/buffer.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index caab56fa05..e8b06cca97 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -170,6 +170,7 @@ GPUARRAY_PUBLIC int gpucontext_props_set_single_stream(gpucontext_props *p); * shared drives. * * \param p properties object + * \param path desired location of the kernel cache * * \returns GA_NO_ERROR or an error code if an error occurred. 
*/ @@ -214,8 +215,6 @@ GPUARRAY_PUBLIC void gpucontext_props_del(gpucontext_props *p); * * \param res a pointer to a location that will be allocated * \param name the backend name. - * \param dev the device number. The precise meaning of the device - * number is backend-dependent * \param props a properties object for the context. Can be NULL for * defaults. * From e713fe0e8ed98a84c909399beab8b5b6d1c71bef Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 11 Sep 2017 13:09:44 -0400 Subject: [PATCH 516/597] Fix doc building with version from versioneer. --- doc/conf.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/conf.py b/doc/conf.py index cbb0a7c135..b4b8573a07 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -57,10 +57,17 @@ # |version| and |release|, also used in various other places throughout the # built documents. # + +# We need this hokey-pokey because versioneer needs the current +# directory to be the root of the project to work. +_curpath = os.getcwd() +os.chdir(os.path.dirname(os.path.dirname(__file__))) # The full version, including alpha/beta/rc tags. release = versioneer.get_version() # The short X.Y version. version = '.'.join(release.split('.')[:2]) +os.chdir(_curpath) +del _curpath # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. From 9a57bbdd1dd7cc7d35750a860e3aec8eeca78d3b Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 11 Sep 2017 13:10:03 -0400 Subject: [PATCH 517/597] Fix warnings that have crept up. --- doc/c_api/group/group__context__flags.rst | 5 ----- doc/conf.py | 4 ---- 2 files changed, 9 deletions(-) delete mode 100644 doc/c_api/group/group__context__flags.rst diff --git a/doc/c_api/group/group__context__flags.rst b/doc/c_api/group/group__context__flags.rst deleted file mode 100644 index 7833253c3f..0000000000 --- a/doc/c_api/group/group__context__flags.rst +++ /dev/null @@ -1,5 +0,0 @@ -Group context_flags -=================== - -.. 
doxygengroup:: context_flags - :no-link: diff --git a/doc/conf.py b/doc/conf.py index b4b8573a07..0d661ba3ee 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -151,10 +151,6 @@ def setup(app): # using the given strftime format. html_last_updated_fmt = '%b %d, %Y' -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -html_use_smartypants = True - # Custom sidebar templates, maps document names to template names. #html_sidebars = {} From 9b0413eabcb99d031a22c227c4c22fbcfeb7ce4f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 11 Sep 2017 13:38:36 -0400 Subject: [PATCH 518/597] Update the version of sphinx. --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 8df5a13de5..9fd654483f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,7 +23,7 @@ before_install: - export PREFIX=$HOME/.local - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update && brew install doxygen; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export PYTHONUSERBASE=$PREFIX; fi - - pip install --user breathe sphinx==1.5.1 sphinx_rtd_theme cython numpy 'mako>=0.7' six + - pip install --user breathe sphinx==1.6.3 sphinx_rtd_theme cython numpy 'mako>=0.7' six - export PATH=$PATH:$PREFIX/bin - export CPATH=$CPATH:$PREFIX/include - export LIBRARY_PATH=$LIBRARY_PATH:$PREFIX/lib From d343bd02eb0f0a6d4f994a64d05eeb65b857126f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 11 Sep 2017 15:26:30 -0400 Subject: [PATCH 519/597] First attempt at upload script. 
--- .appveyor.yml | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 1428368057..e3868fc67e 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -14,6 +14,10 @@ configuration: - Release environment: + BINSTAR_TOKEN: + secure: pPGGv/pzNILDCClm4VNOv9lJ9Bah+iuJ22j4oLCaHzM8eNre8bfALOhMRIERhd55 + #secure: jK8icdT9ukloE7xSj7fqiCmtM/aTNLTUEeD2HLRP5+9GI5oGkK4jt/uYEcKDtfwO + matrix: - CONDA_LOC: "C:\\Miniconda-x64" - CONDA_LOC: "C:\\Miniconda35-x64" @@ -35,14 +39,8 @@ test_script: - cmd: for /f "tokens=*" %%i in ('python -c "import versioneer; print(versioneer.get_version())"') do set GPUARRAY_VERSION=%%i - cmd: echo %GPUARRAY_VERSION% - cmd: conda build conda - - cmd: mkdir pkgs - - cmd: xcopy "%CONDA_LOC%"\conda-bld\win-64\pygpu* pkgs\ /Y - - cmd: xcopy "%CONDA_LOC%"\conda-bld\win-64\libgpuarray* pkgs\ /Y - -artifacts: - - path: pkgs/* - name: "Conda Packages" - -#deploy: -# on: -# appveyor_repo_tag: true + - ps: | + if($env:appveyor_repo_tag -eq 'False') { + anaconda upload --user=abergeron "%CONDA_LOC%"\conda-bld\win-64\pygpu* + anaconda upload --user=abergeron "%CONDA_LOC%"\conda-bld\win-64\libgpuarray* + } From 872101459654c390dd4c18d01dbd9ccb0c7a34d0 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 11 Sep 2017 15:39:43 -0400 Subject: [PATCH 520/597] We don't need git to build. --- conda/libgpuarray/meta.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/conda/libgpuarray/meta.yaml b/conda/libgpuarray/meta.yaml index bf18c04109..a3914e1b17 100644 --- a/conda/libgpuarray/meta.yaml +++ b/conda/libgpuarray/meta.yaml @@ -14,8 +14,6 @@ build: requirements: build: - - m2-git [win] - - git [not win] - cmake - mako - python # version doesn't matter here From 22819c2156959587bea8637b3b64828253282a44 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 11 Sep 2017 15:46:37 -0400 Subject: [PATCH 521/597] Fix variable substitution for powershell. 
--- .appveyor.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index e3868fc67e..a717fcc4e4 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -41,6 +41,6 @@ test_script: - cmd: conda build conda - ps: | if($env:appveyor_repo_tag -eq 'False') { - anaconda upload --user=abergeron "%CONDA_LOC%"\conda-bld\win-64\pygpu* - anaconda upload --user=abergeron "%CONDA_LOC%"\conda-bld\win-64\libgpuarray* + anaconda upload --user=abergeron $CONDA_LOC\conda-bld\win-64\pygpu* + anaconda upload --user=abergeron $CONDA_LOC\conda-bld\win-64\libgpuarray* } From e012eccc66bf6468e1317d830d3b58c73b6c12d0 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 11 Sep 2017 16:09:50 -0400 Subject: [PATCH 522/597] Powershell doesn't like "shell" commands. --- .appveyor.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index a717fcc4e4..f9f6398860 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -41,6 +41,6 @@ test_script: - cmd: conda build conda - ps: | if($env:appveyor_repo_tag -eq 'False') { - anaconda upload --user=abergeron $CONDA_LOC\conda-bld\win-64\pygpu* - anaconda upload --user=abergeron $CONDA_LOC\conda-bld\win-64\libgpuarray* + cmd.exe /c "anaconda upload --user=abergeron $CONDA_LOC\conda-bld\win-64\pygpu*" + cmd.exe /c "anaconda upload --user=abergeron $CONDA_LOC\conda-bld\win-64\libgpuarray*" } From 17a5fc1c63ee1b6b02018df138ddd4a87b9d600c Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 11 Sep 2017 16:52:59 -0400 Subject: [PATCH 523/597] Redirect stderr to stdout so that powershell stops being annoying. 
--- .appveyor.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index f9f6398860..3aba5643fc 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -41,6 +41,6 @@ test_script: - cmd: conda build conda - ps: | if($env:appveyor_repo_tag -eq 'False') { - cmd.exe /c "anaconda upload --user=abergeron $CONDA_LOC\conda-bld\win-64\pygpu*" - cmd.exe /c "anaconda upload --user=abergeron $CONDA_LOC\conda-bld\win-64\libgpuarray*" + cmd /c "anaconda upload --user=abergeron $CONDA_LOC\conda-bld\win-64\pygpu* 2>&1" + cmd /c "anaconda upload --user=abergeron $CONDA_LOC\conda-bld\win-64\libgpuarray* 2>&1" } From 03104b23b03bdec57c9bd915c5596af0d2bca3ad Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 11 Sep 2017 17:53:32 -0400 Subject: [PATCH 524/597] New token with the right permission and acutally pass it in. --- .appveyor.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 3aba5643fc..0bf551e1f7 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -15,7 +15,7 @@ configuration: environment: BINSTAR_TOKEN: - secure: pPGGv/pzNILDCClm4VNOv9lJ9Bah+iuJ22j4oLCaHzM8eNre8bfALOhMRIERhd55 + secure: znS5Hm/opLtSmNYpJON5sIX0wa+1rM+UnhfonkTKVMC4Lr8aRIHsD4YAz9hKjWXr #secure: jK8icdT9ukloE7xSj7fqiCmtM/aTNLTUEeD2HLRP5+9GI5oGkK4jt/uYEcKDtfwO matrix: @@ -41,6 +41,6 @@ test_script: - cmd: conda build conda - ps: | if($env:appveyor_repo_tag -eq 'False') { - cmd /c "anaconda upload --user=abergeron $CONDA_LOC\conda-bld\win-64\pygpu* 2>&1" - cmd /c "anaconda upload --user=abergeron $CONDA_LOC\conda-bld\win-64\libgpuarray* 2>&1" + cmd /c "anaconda -t $BINSTAR_TOKEN upload --user=abergeron $CONDA_LOC\conda-bld\win-64\pygpu* 2>&1" + cmd /c "anaconda -t $BINSTAR_TOKEN upload --user=abergeron $CONDA_LOC\conda-bld\win-64\libgpuarray* 2>&1" } From 590b18eae9661dade01aff3e9ebff5cffbe6e0d3 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 11 Sep 2017 18:00:04 -0400 Subject: [PATCH 525/597] 
Add a build for python 3.6 and restore the artifacts so that we can do a manual upload if something goes wrong. --- .appveyor.yml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 0bf551e1f7..4b203cf1c9 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -21,6 +21,7 @@ environment: matrix: - CONDA_LOC: "C:\\Miniconda-x64" - CONDA_LOC: "C:\\Miniconda35-x64" + - CONDA_LOC: "C:\\Miniconda36-x64" install: # This breaks conda-build because of git @@ -39,8 +40,14 @@ test_script: - cmd: for /f "tokens=*" %%i in ('python -c "import versioneer; print(versioneer.get_version())"') do set GPUARRAY_VERSION=%%i - cmd: echo %GPUARRAY_VERSION% - cmd: conda build conda + - cmd: mkdir pkgs + - cmd: xcopy "%CONDA_LOC%"\conda-bld\win-64\pygpu* pkgs\ /Y + - cmd: xcopy "%CONDA_LOC%"\conda-bld\win-64\libgpuarray* pkgs\ /Y - ps: | if($env:appveyor_repo_tag -eq 'False') { - cmd /c "anaconda -t $BINSTAR_TOKEN upload --user=abergeron $CONDA_LOC\conda-bld\win-64\pygpu* 2>&1" - cmd /c "anaconda -t $BINSTAR_TOKEN upload --user=abergeron $CONDA_LOC\conda-bld\win-64\libgpuarray* 2>&1" + cmd /c "anaconda -t $BINSTAR_TOKEN upload --user=abergeron pkgs/* 2>&1" } + +artifacts: + - path: pkgs/* + name: "Conda Packages" From 8b9638653ffbd0a58b9a7bacb43664585b2db667 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 11 Sep 2017 18:02:18 -0400 Subject: [PATCH 526/597] Get rid of the cmd wrapper since we need substitution from powershell --- .appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.appveyor.yml b/.appveyor.yml index 4b203cf1c9..88d23abb6c 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -45,7 +45,7 @@ test_script: - cmd: xcopy "%CONDA_LOC%"\conda-bld\win-64\libgpuarray* pkgs\ /Y - ps: | if($env:appveyor_repo_tag -eq 'False') { - cmd /c "anaconda -t $BINSTAR_TOKEN upload --user=abergeron pkgs/* 2>&1" + anaconda -t $BINSTAR_TOKEN upload --user=abergeron pkgs/* 2>&1 } artifacts: From 
56a4089621d18b60e7ed9390bbd758a912dbf63a Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 11 Sep 2017 18:10:35 -0400 Subject: [PATCH 527/597] Fix variable expansion for PowerShell. --- .appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.appveyor.yml b/.appveyor.yml index 88d23abb6c..129e5c1847 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -45,7 +45,7 @@ test_script: - cmd: xcopy "%CONDA_LOC%"\conda-bld\win-64\libgpuarray* pkgs\ /Y - ps: | if($env:appveyor_repo_tag -eq 'False') { - anaconda -t $BINSTAR_TOKEN upload --user=abergeron pkgs/* 2>&1 + anaconda -t $env:BINSTAR_TOKEN upload --user=abergeron pkgs/* 2>&1 } artifacts: From be7a3dca913f63ae6fb51ac9f278d5943e8459b4 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 11 Sep 2017 18:15:23 -0400 Subject: [PATCH 528/597] Switch the token to mila-udem and make it the target user. --- .appveyor.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 129e5c1847..c3d999f986 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -15,8 +15,7 @@ configuration: environment: BINSTAR_TOKEN: - secure: znS5Hm/opLtSmNYpJON5sIX0wa+1rM+UnhfonkTKVMC4Lr8aRIHsD4YAz9hKjWXr - #secure: jK8icdT9ukloE7xSj7fqiCmtM/aTNLTUEeD2HLRP5+9GI5oGkK4jt/uYEcKDtfwO + secure: 4KDgW9K3omzJ/ILWm1ApjsvTjefHJuECRy7nmnvuUul+0gbAYTUm5JanY+X1pccu matrix: - CONDA_LOC: "C:\\Miniconda-x64" @@ -44,8 +43,8 @@ test_script: - cmd: xcopy "%CONDA_LOC%"\conda-bld\win-64\pygpu* pkgs\ /Y - cmd: xcopy "%CONDA_LOC%"\conda-bld\win-64\libgpuarray* pkgs\ /Y - ps: | - if($env:appveyor_repo_tag -eq 'False') { - anaconda -t $env:BINSTAR_TOKEN upload --user=abergeron pkgs/* 2>&1 + if($env:appveyor_repo_tag -eq 'True') { + anaconda -t $env:BINSTAR_TOKEN upload --user=mila-udem pkgs/* 2>&1 } artifacts: From 91d91776320005d2df71a57be2f50bee1ad9b7ef Mon Sep 17 00:00:00 2001 From: Dendi Suhubdy Date: Mon, 11 Sep 2017 23:59:02 -0400 Subject: [PATCH 529/597] fixing gpuarray_array_blas.c, 
C_OFFSET value comparison with NULL, fixing GpuArrayException: ('malloc: Resource temporarily unavailable', 6) @shawntan --- src/gpuarray_array_blas.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index 3eb1d0bd38..36b5e66d2b 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -663,7 +663,7 @@ int GpuArray_rgemmBatch_3d(cb_transpose transA, cb_transpose transB, double alph C_offsets = (size_t*)malloc(batchCount * sizeof(size_t)); if (A_datas == NULL || B_datas == NULL || C_datas == NULL || - A_offsets == NULL || B_offsets == NULL || C_offsets) { + A_offsets == NULL || B_offsets == NULL || C_offsets == NULL) { err = error_sys(ctx->err, "malloc"); goto old_cleanup; } From 414fb54c22f59fa0daa23149f645c93e00ea26db Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 12 Sep 2017 13:56:26 -0400 Subject: [PATCH 530/597] Add a CircleCI build config. --- .circleci/config.yml | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 .circleci/config.yml diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000000..f9452edbe5 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,34 @@ +version: 2 + +jobs: + build: + docker: + - image: milaudem/libgpuarray:0 + + steps: + - checkout + - run: + name: "Checkout Merge Commit" + command: | + if [[ -n "${CIRCLE_PR_NUMBER}" ]] + then + git fetch -u origin "+refs/pull/${CIRCLE_PR_NUMBER}/merge:pr/${CIRCLE_PR_NUMBER}/merge" + git checkout -qf "pr/${CIRCLE_PR_NUMBER}/merge" + fi + - run: + name: "Build Recipe" + command: | + export GPUARRAY_VERSION=`python -c 'import versioneer; print(versioneer.get_version())'` + conda build --python 2.7 conda + conda build --python 3.5 conda/pygpu + conda build --python 3.6 conda/pygpu + - run: + name: "Upload Tagged Versions" + command: | + if [[ -n "${CIRCLE_TAG}" ]] + then + anaconda -t $BINSTAR_TOKEN upload --user=mila-udem 
/miniconda/conda-bld/linux-64/libgpuarray* + anaconda -t $BINSTAR_TOKEN upload --user=mila-udem /miniconda/conda-bld/linux-64/pygpu* + fi + - store_artifacts: + path: /miniconda/conda-bld/linux-64 \ No newline at end of file From 8a97c1a821486637e6ae6780b612974fc4bfbafd Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 12 Sep 2017 14:19:07 -0400 Subject: [PATCH 531/597] Fix typo. --- src/loaders/dyn_load.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/loaders/dyn_load.c b/src/loaders/dyn_load.c index 2ea2f331d7..08442ed1bf 100644 --- a/src/loaders/dyn_load.c +++ b/src/loaders/dyn_load.c @@ -19,7 +19,7 @@ void *ga_load_library(const char *name, error *e) { void *ga_func_ptr(void *h, const char *name, error *e) { void *res = dlsym(h, name); if (res == NULL) - error_fmt(e, GA_LOAD_ERROR, "Could not find synbol \"%s\": %s", name, dlerror()); + error_fmt(e, GA_LOAD_ERROR, "Could not find symbol \"%s\": %s", name, dlerror()); return res; } From e51f2a65edc8622be8b0db656d521b8f9108d2bd Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 12 Sep 2017 14:25:26 -0400 Subject: [PATCH 532/597] Add missing part of the config. --- .circleci/config.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index f9452edbe5..c43cd1b415 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -31,4 +31,10 @@ jobs: anaconda -t $BINSTAR_TOKEN upload --user=mila-udem /miniconda/conda-bld/linux-64/pygpu* fi - store_artifacts: - path: /miniconda/conda-bld/linux-64 \ No newline at end of file + path: /miniconda/conda-bld/linux-64 + +workflows: + version: 2 + build_and_test: + jobs: + - build From fb0fab2cae810f43e2adbbe77082872dc263763c Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 12 Sep 2017 14:56:49 -0400 Subject: [PATCH 533/597] CircleCI seems confused about the 'build' name. 
--- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index c43cd1b415..40a08cd274 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,7 +1,7 @@ version: 2 jobs: - build: + build_pkgs: docker: - image: milaudem/libgpuarray:0 @@ -37,4 +37,4 @@ workflows: version: 2 build_and_test: jobs: - - build + - build_pkgs From 1a453bb2806ebf513a283ae919259aafb9ae289d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 12 Sep 2017 15:03:33 -0400 Subject: [PATCH 534/597] Fix typo. --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 40a08cd274..a8e0157723 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -30,8 +30,8 @@ jobs: anaconda -t $BINSTAR_TOKEN upload --user=mila-udem /miniconda/conda-bld/linux-64/libgpuarray* anaconda -t $BINSTAR_TOKEN upload --user=mila-udem /miniconda/conda-bld/linux-64/pygpu* fi - - store_artifacts: - path: /miniconda/conda-bld/linux-64 + - store_artifacts: + path: /miniconda/conda-bld/linux-64 workflows: version: 2 From 38d0dc72b1c09df4d62156ec3f6cc6ffe234058f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 12 Sep 2017 17:06:51 -0400 Subject: [PATCH 535/597] Fix indentation. 
--- .circleci/config.yml | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a8e0157723..c5cf06c10e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -8,30 +8,30 @@ jobs: steps: - checkout - run: - name: "Checkout Merge Commit" - command: | - if [[ -n "${CIRCLE_PR_NUMBER}" ]] - then - git fetch -u origin "+refs/pull/${CIRCLE_PR_NUMBER}/merge:pr/${CIRCLE_PR_NUMBER}/merge" - git checkout -qf "pr/${CIRCLE_PR_NUMBER}/merge" - fi + name: "Checkout Merge Commit" + command: | + if [[ -n "${CIRCLE_PR_NUMBER}" ]] + then + git fetch -u origin "+refs/pull/${CIRCLE_PR_NUMBER}/merge:pr/${CIRCLE_PR_NUMBER}/merge" + git checkout -qf "pr/${CIRCLE_PR_NUMBER}/merge" + fi - run: - name: "Build Recipe" - command: | - export GPUARRAY_VERSION=`python -c 'import versioneer; print(versioneer.get_version())'` - conda build --python 2.7 conda - conda build --python 3.5 conda/pygpu - conda build --python 3.6 conda/pygpu + name: "Build Recipe" + command: | + export GPUARRAY_VERSION=`python -c 'import versioneer; print(versioneer.get_version())'` + conda build --python 2.7 conda + conda build --python 3.5 conda/pygpu + conda build --python 3.6 conda/pygpu - run: - name: "Upload Tagged Versions" - command: | - if [[ -n "${CIRCLE_TAG}" ]] - then - anaconda -t $BINSTAR_TOKEN upload --user=mila-udem /miniconda/conda-bld/linux-64/libgpuarray* - anaconda -t $BINSTAR_TOKEN upload --user=mila-udem /miniconda/conda-bld/linux-64/pygpu* - fi + name: "Upload Tagged Versions" + command: | + if [[ -n "${CIRCLE_TAG}" ]] + then + anaconda -t $BINSTAR_TOKEN upload --user=mila-udem /miniconda/conda-bld/linux-64/libgpuarray* + anaconda -t $BINSTAR_TOKEN upload --user=mila-udem /miniconda/conda-bld/linux-64/pygpu* + fi - store_artifacts: - path: /miniconda/conda-bld/linux-64 + path: /miniconda/conda-bld/linux-64 workflows: version: 2 From 2bbbcbe1d45bfa16bf750626b1f8151c02b62d8e Mon Sep 17 
00:00:00 2001 From: Arnaud Bergeron Date: Tue, 12 Sep 2017 17:14:48 -0400 Subject: [PATCH 536/597] Remove linux testing on travis. --- .travis.yml | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9fd654483f..b241639a03 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,27 +2,13 @@ language: c matrix: include: - - os: linux - compiler: gcc - - os: linux - compiler: clang - os: osx compiler: clang -addons: - apt: - sources: - - kalakris-cmake - - george-edison55-precise-backports - packages: - - cmake - - cmake-data - - doxygen - before_install: - export PREFIX=$HOME/.local - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update && brew install doxygen; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export PYTHONUSERBASE=$PREFIX; fi + - brew update && brew install doxygen; fi + - export PYTHONUSERBASE=$PREFIX; fi - pip install --user breathe sphinx==1.6.3 sphinx_rtd_theme cython numpy 'mako>=0.7' six - export PATH=$PATH:$PREFIX/bin - export CPATH=$CPATH:$PREFIX/include From edf31c8aaed054c4a5fc278e8efe70b6484b417f Mon Sep 17 00:00:00 2001 From: notoraptor Date: Tue, 12 Sep 2017 15:42:27 -0400 Subject: [PATCH 537/597] Try to fix some memory leaks. --- pygpu/gpuarray.pyx | 3 +++ src/cache/disk.c | 11 +++++++++-- src/gpuarray_elemwise.c | 18 ++++++++++++------ 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 44608e1e4f..f582e05c4a 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -1428,6 +1428,7 @@ cdef GpuArray pygpu_reshape(GpuArray a, unsigned int nd, const size_t *newdims, tot *= newdims[i] cdims = calloc(nd, sizeof(size_t)) if cdims == NULL: + free(cdims) raise MemoryError, "could not allocate cdims" cdef size_t d @@ -1437,10 +1438,12 @@ cdef GpuArray pygpu_reshape(GpuArray a, unsigned int nd, const size_t *newdims, d = a.size // tot if d * tot != a.size: + free(cdims) raise GpuArrayException, "..." 
cdims[i] = d array_reshape(res, a, nd, cdims, ord, nocopy) + free(cdims) return res diff --git a/src/cache/disk.c b/src/cache/disk.c index 6fda751b30..0f9de82e0a 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -216,8 +216,15 @@ static int key_path(disk_cache *c, const cache_key_t key, char *out) { unsigned char hash[64]; int i; - if (c->kwrite(&kb, key)) return -1; - if (Skein_512((unsigned char *)kb.s, kb.l, hash)) return -1; + if (c->kwrite(&kb, key)) { + strb_clear(&kb); + return -1; + } + if (Skein_512((unsigned char *)kb.s, kb.l, hash)) { + strb_clear(&kb); + return -1; + } + strb_clear(&kb); if (snprintf(out, 10, "%02x%02x/%02x%02x", hash[0], hash[1], hash[2], hash[3]) != 9) return -1; diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index 3ba31d30fe..fa5b0efa2c 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -738,18 +738,24 @@ GpuElemwise *GpuElemwise_new(gpucontext *ctx, void GpuElemwise_free(GpuElemwise *ge) { unsigned int i; - for (i = 0; i < ge->nd; i++) { - if (k_initialized(&ge->k_basic_32[i])) - GpuKernel_clear(&ge->k_basic_32[i]); - if (k_initialized(&ge->k_basic[i])) - GpuKernel_clear(&ge->k_basic[i]); - } + if (ge->k_basic_32 != NULL) + for (i = 0; i < ge->nd; i++) { + if (k_initialized(&ge->k_basic_32[i])) + GpuKernel_clear(&ge->k_basic_32[i]); + } + if (ge->k_basic != NULL) + for (i = 0; i < ge->nd; i++) { + if (k_initialized(&ge->k_basic[i])) + GpuKernel_clear(&ge->k_basic[i]); + } if (ge->strides != NULL) for (i = 0; i < ge->narray; i++) { free(ge->strides[i]); } if (k_initialized(&ge->k_contig)) GpuKernel_clear(&ge->k_contig); + free(ge->k_basic_32); + free(ge->k_basic); free_args(ge->n, ge->args); free((void *)ge->preamble); free((void *)ge->expr); From 613ab5997c7b25895ed38e27c9bc7293fa05d318 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Wed, 13 Sep 2017 08:40:14 -0400 Subject: [PATCH 538/597] Try to fix another memory leak. 
--- src/gpuarray_buffer_cuda.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 9a99d749c9..99b9bbee31 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1223,6 +1223,8 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { GA_CHECK(make_bin(ctx, &ptx, bin, log)); + strb_clear(&ptx); + if (ctx->disk_cache) { pk = calloc(sizeof(disk_key), 1); if (pk == NULL) { @@ -1234,7 +1236,7 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { memcpy(pk, &k, DISK_KEY_MM); strb_appendb(&pk->src, src); if (strb_error(&pk->src)) { - error_sys(ctx->err, "strb_appendb"); + error_sys(ctx->err, "strb_appendb"); fprintf(stderr, "Error adding kernel to disk cache %s\n", ctx->err->msg); disk_free((cache_key_t)pk); @@ -1242,7 +1244,7 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { } cbin = strb_alloc(bin->l); if (cbin == NULL) { - error_sys(ctx->err, "strb_alloc"); + error_sys(ctx->err, "strb_alloc"); fprintf(stderr, "Error adding kernel to disk cache: %s\n", ctx->err->msg); disk_free((cache_key_t)pk); @@ -1250,7 +1252,7 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { } strb_appendb(cbin, bin); if (strb_error(cbin)) { - error_sys(ctx->err, "strb_appendb"); + error_sys(ctx->err, "strb_appendb"); fprintf(stderr, "Error adding kernel to disk cache %s\n", ctx->err->msg); disk_free((cache_key_t)pk); From 24b923c6cd73d6488c5bd8394ea297250bfc5d90 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 13 Sep 2017 14:21:05 -0400 Subject: [PATCH 539/597] Fix syntax on travis. 
--- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index b241639a03..e8236b2f4e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,8 +7,8 @@ matrix: before_install: - export PREFIX=$HOME/.local - - brew update && brew install doxygen; fi - - export PYTHONUSERBASE=$PREFIX; fi + - brew update && brew install doxygen + - export PYTHONUSERBASE=$PREFIX - pip install --user breathe sphinx==1.6.3 sphinx_rtd_theme cython numpy 'mako>=0.7' six - export PATH=$PATH:$PREFIX/bin - export CPATH=$CPATH:$PREFIX/include From 8aa05368d0d1d4996cad5958082091441584330d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 13 Sep 2017 14:40:29 -0400 Subject: [PATCH 540/597] Enable build on tags for CircleCI. --- .circleci/config.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index c5cf06c10e..46fd53b800 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -37,4 +37,7 @@ workflows: version: 2 build_and_test: jobs: - - build_pkgs + - build_pkgs: + filters: + tags: + only: /.*/ From 5fec128fbceb410578305a1576a869f4c768313f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 13 Sep 2017 14:50:12 -0400 Subject: [PATCH 541/597] Use the wrapper command since it avoids weirdness in powershell with the error stream. 
--- .appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.appveyor.yml b/.appveyor.yml index c3d999f986..895cbb4a40 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -44,7 +44,7 @@ test_script: - cmd: xcopy "%CONDA_LOC%"\conda-bld\win-64\libgpuarray* pkgs\ /Y - ps: | if($env:appveyor_repo_tag -eq 'True') { - anaconda -t $env:BINSTAR_TOKEN upload --user=mila-udem pkgs/* 2>&1 + cmd /c "anaconda -t $env:BINSTAR_TOKEN upload --user=mila-udem pkgs/* 2>&1" } artifacts: From e3d4aa0b31dacfa62e0531d69782b439f7c68640 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Wed, 13 Sep 2017 14:59:10 -0400 Subject: [PATCH 542/597] Remove useless call to free. --- pygpu/gpuarray.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index f582e05c4a..293803771d 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -1428,7 +1428,6 @@ cdef GpuArray pygpu_reshape(GpuArray a, unsigned int nd, const size_t *newdims, tot *= newdims[i] cdims = calloc(nd, sizeof(size_t)) if cdims == NULL: - free(cdims) raise MemoryError, "could not allocate cdims" cdef size_t d From 714f3a54e69e86ad7e71e22439bf0b06a328a2e9 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Wed, 13 Sep 2017 15:13:14 -0400 Subject: [PATCH 543/597] Manage free() with a try-catch block. --- pygpu/gpuarray.pyx | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 293803771d..94f9b36181 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -1430,20 +1430,21 @@ cdef GpuArray pygpu_reshape(GpuArray a, unsigned int nd, const size_t *newdims, if cdims == NULL: raise MemoryError, "could not allocate cdims" - cdef size_t d - for i in range(nd): - d = newdims[i] - if i == caxis: - d = a.size // tot - - if d * tot != a.size: - free(cdims) - raise GpuArrayException, "..." 
- cdims[i] = d + try: + cdef size_t d + for i in range(nd): + d = newdims[i] + if i == caxis: + d = a.size // tot + + if d * tot != a.size: + raise GpuArrayException, "..." + cdims[i] = d - array_reshape(res, a, nd, cdims, ord, nocopy) - free(cdims) - return res + array_reshape(res, a, nd, cdims, ord, nocopy) + return res + finally: + free(cdims) cdef GpuArray pygpu_transpose(GpuArray a, const unsigned int *newaxes): From 06b2d4e1f2dcaf7438713284f64b4dee546efc6e Mon Sep 17 00:00:00 2001 From: notoraptor Date: Wed, 13 Sep 2017 15:30:12 -0400 Subject: [PATCH 544/597] Fix cython syntax --- pygpu/gpuarray.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 94f9b36181..a1e38bc4cf 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -1430,8 +1430,8 @@ cdef GpuArray pygpu_reshape(GpuArray a, unsigned int nd, const size_t *newdims, if cdims == NULL: raise MemoryError, "could not allocate cdims" + cdef size_t d try: - cdef size_t d for i in range(nd): d = newdims[i] if i == caxis: From 4f164c551bf7d5f31d107d089451f40fac07f231 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 13 Sep 2017 16:28:22 -0400 Subject: [PATCH 545/597] Fix the release notes. --- release.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/release.txt b/release.txt index 913e40151f..65ba316dea 100644 --- a/release.txt +++ b/release.txt @@ -1,7 +1,5 @@ Release process: - Make sure you are on the proper release branch -- Commit the changes with message "Changes for release X.Y.Z" - git commit -m "Changes for release X.Y.Z" - Make a git tag git tag vX.Y.Z - Push to master the commit and the tag From 245afd1045999894f3be0a6acc480dea27a208ca Mon Sep 17 00:00:00 2001 From: Dendi Suhubdy Date: Wed, 13 Sep 2017 22:00:14 -0400 Subject: [PATCH 546/597] fixing returns 700 and then takes an exit path that causes to return 3. 
@obilaniu --- src/gpuarray_blas_cuda_cublas.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 69d68fd71b..08dc33c6d1 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -835,6 +835,9 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, return ctx->err->code; } + // added cuda_wait + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Ta, CUDA_WAIT_ALL)); + err = cublasSgemmBatched(h->h, convT(transA), convT(transB), M, N, K, &alpha, @@ -962,6 +965,9 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, return ctx->err->code; } + // added cuda_wait + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Ta, CUDA_WAIT_ALL)); + err = cublasDgemmBatched(h->h, convT(transA), convT(transB), M, N, K, &alpha, From 93493d83de9969fb75e9829f46b18ac9dd8c7318 Mon Sep 17 00:00:00 2001 From: Dendi Suhubdy Date: Thu, 14 Sep 2017 00:06:30 -0400 Subject: [PATCH 547/597] adding feature of python setup.py clean where it will delete the designated build files stated in .clean which is Build/ build/ and others --- .clean | 21 +++++++++++++++++++++ versioneer.py | 19 +++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 .clean diff --git a/.clean b/.clean new file mode 100644 index 0000000000..1c90c15b70 --- /dev/null +++ b/.clean @@ -0,0 +1,21 @@ +Build +build +Debug +Release +lib +__pycache__ +.idea +.*.sw[po] +*~ +*.pyc +*.pyd +*.pyo +*.egg-info +dist +setuptools*egg +setuptools.pth +distribute*egg +distribute*tar.gz +*.so +*.o +*.log diff --git a/versioneer.py b/versioneer.py index 64fea1c892..3d529153d7 100644 --- a/versioneer.py +++ b/versioneer.py @@ -1647,6 +1647,25 @@ def make_release_tree(self, base_dir, files): self._versioneer_generated_versions) cmds["sdist"] = cmd_sdist + import distutils.command.clean + import shutil + + class cmd_clean(distutils.command.clean.clean): + def run(self): + import glob + with 
open('.clean', 'r') as f: + ignores = f.read() + for wildcard in filter(bool, ignores.split('\n')): + for filename in glob.glob(wildcard): + try: + os.remove(filename) + except OSError: + shutil.rmtree(filename, ignore_errors=True) + + # It's an old-style class in Python 2.7... + distutils.command.clean.clean.run(self) + cmds["clean"] = cmd_clean + return cmds From 027019477e45cfdc656c92e4e08e9e71cc8fefe7 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 14 Sep 2017 13:25:18 -0400 Subject: [PATCH 548/597] Blurb. --- release.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/release.txt b/release.txt index 65ba316dea..6e52ebfd9d 100644 --- a/release.txt +++ b/release.txt @@ -3,7 +3,9 @@ Release process: - Make a git tag git tag vX.Y.Z - Push to master the commit and the tag - git push --tags central master + git push --tags central master + This push will trigger package builds for windows and linux that will + be uploaded to the mila-udem conda channel. - Add a release on github with a tag in the form of 'vX.Y.Z' https://github.com/Theano/libgpuarray/releases/new - Make note of the major changes since the last release From bd9f5fa3b8c3a5ed5ed2be8ed341e635fb4e949d Mon Sep 17 00:00:00 2001 From: Dendi Suhubdy Date: Thu, 14 Sep 2017 15:42:47 -0400 Subject: [PATCH 549/597] Updating CUDA_WAIT_ALL to CUDA_WAIT_READ --- src/gpuarray_blas_cuda_cublas.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 08dc33c6d1..fb46d009ab 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -835,8 +835,7 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, return ctx->err->code; } - // added cuda_wait - GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Ta, CUDA_WAIT_ALL)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Ta, CUDA_WAIT_READ)); err = cublasSgemmBatched(h->h, convT(transA), convT(transB), @@ -965,8 +964,7 @@ 
static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, return ctx->err->code; } - // added cuda_wait - GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Ta, CUDA_WAIT_ALL)); + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Ta, CUDA_WAIT_READ)); err = cublasDgemmBatched(h->h, convT(transA), convT(transB), From 7c639b738f7132d891ad03d825ef2534f166d164 Mon Sep 17 00:00:00 2001 From: Dendi Suhubdy Date: Thu, 14 Sep 2017 16:05:53 -0400 Subject: [PATCH 550/597] getting the cmdclass from versioneer, adding the custom clean command and then passing it to setup() --- setup.py | 23 ++++++++++++++++++++++- versioneer.py | 20 -------------------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/setup.py b/setup.py index 911a1540e9..b185e0aef5 100755 --- a/setup.py +++ b/setup.py @@ -1,6 +1,8 @@ import sys import os import versioneer +import distutils.command.clean +import shutil have_cython = False @@ -82,6 +84,22 @@ def __init__(self, *args, **kwargs): raise RuntimeError('default binary dir {} does not exist, you may need to build the C library in release mode'.format(default_bin_dir)) library_dirs += [default_bin_dir] +class cmd_clean(distutils.command.clean.clean): + def run(self): + import glob + with open('.clean', 'r') as f: + ignores = f.read() + for wildcard in filter(bool, ignores.split('\n')): + for filename in glob.glob(wildcard): + try: + os.remove(filename) + except OSError: + shutil.rmtree(filename, ignore_errors=True) + + # It's an old-style class in Python 2.7... 
+ distutils.command.clean.clean.run(self) + + ea = [] if sys.platform in ('darwin', 'linux'): # Silence unused stuff warnings @@ -120,9 +138,12 @@ def __init__(self, *args, **kwargs): define_macros=[('GPUARRAY_SHARED', None)] )] +cmds=versioneer.get_cmdclass() +cmds["clean"] = cmd_clean + setup(name='pygpu', version=versioneer.get_version(), - cmdclass=versioneer.get_cmdclass(), + cmdclass=cmds, description='numpy-like wrapper on libgpuarray for GPU computations', packages=['pygpu', 'pygpu/tests'], include_package_data=True, diff --git a/versioneer.py b/versioneer.py index 3d529153d7..e36c724a1d 100644 --- a/versioneer.py +++ b/versioneer.py @@ -1646,26 +1646,6 @@ def make_release_tree(self, base_dir, files): write_to_version_file(target_versionfile, self._versioneer_generated_versions) cmds["sdist"] = cmd_sdist - - import distutils.command.clean - import shutil - - class cmd_clean(distutils.command.clean.clean): - def run(self): - import glob - with open('.clean', 'r') as f: - ignores = f.read() - for wildcard in filter(bool, ignores.split('\n')): - for filename in glob.glob(wildcard): - try: - os.remove(filename) - except OSError: - shutil.rmtree(filename, ignore_errors=True) - - # It's an old-style class in Python 2.7... - distutils.command.clean.clean.run(self) - cmds["clean"] = cmd_clean - return cmds From 921b9311ebe61d1755d91d6ec4ba2628460ee73e Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 14 Sep 2017 18:34:48 -0400 Subject: [PATCH 551/597] Make sure to record at the end and free the buffer on error. 
--- src/gpuarray_blas_cuda_cublas.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index fb46d009ab..a8a95dd1fc 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -835,7 +835,11 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, return ctx->err->code; } - GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Ta, CUDA_WAIT_READ)); + if (cuda_wait(Ta, CUDA_WAIT_READ) != GA_NO_ERROR) { + gpudata_release(Ta); + cuda_exit(ctx); + return ctx->err->code; + } err = cublasSgemmBatched(h->h, convT(transA), convT(transB), @@ -843,6 +847,11 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, (const float **)Aa, lda, (const float **)Ba, ldb, &beta, (float **)Ca, ldc, batchCount); + if (cuda_record(Ta, CUDA_WAIT_READ) != GA_NO_ERROR) { + gpudata_release(Ta); + cuda_exit(ctx); + return ctx->err->code; + } gpudata_release(Ta); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); @@ -964,7 +973,11 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, return ctx->err->code; } - GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Ta, CUDA_WAIT_READ)); + if (cuda_wait(Ta, CUDA_WAIT_READ) != GA_NO_ERROR) { + gpudata_release(Ta); + cuda_exit(ctx); + return ctx->err->code; + } err = cublasDgemmBatched(h->h, convT(transA), convT(transB), @@ -972,7 +985,14 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, (const double **)Aa, lda, (const double **)Ba, ldb, &beta, (double **)Ca, ldc, batchCount); + + if (cuda_record(Ta, CUDA_WAIT_READ) != GA_NO_ERROR) { + gpudata_release(Ta); + cuda_exit(ctx); + return ctx->err->code; + } gpudata_release(Ta); + if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); return error_cublas(ctx->err, "cublasDgemmBatched", err); From 957301f723972752c29665a7a28f4054bbdae71d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 14 
Sep 2017 18:38:58 -0400 Subject: [PATCH 552/597] Fix the wait/record dance when merging with the next buffer. --- src/gpuarray_buffer_cuda.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 99b9bbee31..37db905d23 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -851,8 +851,8 @@ static void cuda_free(gpudata *d) { d->ptr + d->sz == next->ptr) { d->sz = d->sz + next->sz; d->next = next->next; - cuda_wait(next, CUDA_WAIT_ALL); - cuda_record(d, CUDA_WAIT_ALL); + cuda_waits(next, CUDA_WAIT_ALL, d->ls); + cuda_records(d, CUDA_WAIT_ALL, d->ls); deallocate(next); } else { d->next = next; From 2e5ff035073ab73f3d8e33b1f1ce511b43c0e0a1 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Sun, 17 Sep 2017 12:55:29 -0400 Subject: [PATCH 553/597] opencl: Query device version string length before querying device version If the provided size is < the actual string size clGetDeviceInfo() returns CL_INVALID_VALUE (see OpenCL 1.2 ch. 
4.2) Signed-off-by: Jan Vesely --- src/gpuarray_buffer_opencl.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 6278e39f40..8f128115ee 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -113,7 +113,8 @@ cl_ctx *cl_make_ctx(cl_context ctx, gpucontext_props *p) { cl_command_queue_properties qprop; char vendor[32]; char driver_version[64]; - char device_version[32]; + char *device_version = NULL; + size_t device_version_size = 0; cl_uint vendor_id; cl_int err; size_t len; @@ -132,9 +133,19 @@ cl_ctx *cl_make_ctx(cl_context ctx, gpucontext_props *p) { id = get_dev(ctx, global_err); if (id == NULL) return NULL; + /* Query device version string size */ CL_CHECKN(global_err, clGetDeviceInfo(id, CL_DEVICE_VERSION, - sizeof(device_version), - &device_version, NULL)); + 0, NULL, &device_version_size)); + if (device_version_size > 1024) { + error_set(global_err, GA_UNSUPPORTED_ERROR, + "device version buffer too large"); + return NULL; + } + + device_version = alloca(device_version_size); + CL_CHECKN(global_err, clGetDeviceInfo(id, CL_DEVICE_VERSION, + device_version_size, + device_version, NULL)); if (device_version[7] == '1' && device_version[9] < '2') { error_set(global_err, GA_UNSUPPORTED_ERROR, "We only support OpenCL 1.2 and up"); From a5c260bf5aa3f226db0da361fad1910d75e9e683 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Sun, 17 Sep 2017 14:49:58 -0400 Subject: [PATCH 554/597] check_elemwise: Fix array size. 
Fixes failure when trying to write 12B to 8B buffer Signed-off-by: Jan Vesely --- tests/check_elemwise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/check_elemwise.c b/tests/check_elemwise.c index 9656bf870a..9695f7b30f 100644 --- a/tests/check_elemwise.c +++ b/tests/check_elemwise.c @@ -371,7 +371,7 @@ START_TEST(test_basic_scalar) { GpuElemwise *ge; static const uint32_t data1[3] = {1, 2, 3}; - static const uint32_t data2[3] = {4, 5}; + static const uint32_t data2[2] = {4, 5}; uint32_t data3[6] = {0}; size_t dims[2]; From 9776e8e68d01b75dd5e7623cdfbe90b2675a502c Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 18 Sep 2017 09:27:21 -0400 Subject: [PATCH 555/597] Force a complete clear of old installation. --- doc/installation.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/installation.rst b/doc/installation.rst index 9fc81c63da..17e189987d 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -123,6 +123,7 @@ computer, you can install them in your home directory like this: :: cd + rm -rf ~/.local/lib/libgpuarray* ~/.local/include/gpuarray rm -rf build Build mkdir Build cd Build From df0fb2bc23640bd904802e0b0279dd60e728a58c Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Sun, 17 Sep 2017 23:08:36 -0400 Subject: [PATCH 556/597] cluda_opencl: cl_khr_fp64 is optional extension/feature Signed-off-by: Jan Vesely --- src/cluda_opencl.h | 5 + src/cluda_opencl.h.c | 877 ++++++++++++++++++++++--------------------- 2 files changed, 448 insertions(+), 434 deletions(-) diff --git a/src/cluda_opencl.h b/src/cluda_opencl.h index 18d65528d7..f48bbc7bf4 100644 --- a/src/cluda_opencl.h +++ b/src/cluda_opencl.h @@ -33,7 +33,10 @@ #define ga_long long #define ga_ulong ulong #define ga_float float +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64: enable #define ga_double double +#endif #define ga_size ulong #define ga_ssize long #define GA_DECL_SHARED_PARAM(type, name) , __local type *name @@ -125,10 +128,12 @@ 
gen_atom32_add(atom_add_fl, ga_float, local) #define atom_xchg_fg(a, b) atomic_xchg(a, b) #define atom_xchg_fl(a, b) atomic_xchg(a, b) /* ga_double */ +#ifdef cl_khr_fp64 gen_atom64_add(atom_add_dg, ga_double, global) gen_atom64_add(atom_add_dl, ga_double, local) gen_atom64_xchg(atom_xchg_dg, ga_double, global) gen_atom64_xchg(atom_xchg_dl, ga_double, local) +#endif /* ga_half */ #define gen_atomh_add(name, aspace) \ ga_half name(volatile aspace ga_half *addr, ga_half val); \ diff --git a/src/cluda_opencl.h.c b/src/cluda_opencl.h.c index 54d2b4eff4..02d3c87485 100644 --- a/src/cluda_opencl.h.c +++ b/src/cluda_opencl.h.c @@ -80,671 +80,680 @@ static const char cluda_opencl_h[] = { 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, -0x66, 0x6c, 0x6f, 0x61, 0x74, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, -0x65, 0x20, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x69, -0x7a, 0x65, 0x20, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x73, 0x73, -0x69, 0x7a, 0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, -0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x50, -0x41, 0x52, 0x41, 0x4d, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, -0x6e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x2c, 0x20, 0x5f, 0x5f, 0x6c, -0x6f, 0x63, 0x61, 0x6c, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, -0x6e, 0x61, 0x6d, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, -0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, 0x42, 0x4f, 0x44, 0x59, 0x28, -0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 
0x6e, 0x65, 0x20, 0x47, 0x41, -0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x20, -0x5f, 0x5f, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, -0x49, 0x5a, 0x45, 0x0a, 0x0a, 0x74, 0x79, 0x70, 0x65, 0x64, 0x65, -0x66, 0x20, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x20, 0x5f, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, -0x68, 0x61, 0x6c, 0x66, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, -0x7d, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x3b, 0x0a, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, -0x28, 0x70, 0x29, 0x20, 0x76, 0x6c, 0x6f, 0x61, 0x64, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x28, 0x30, 0x2c, 0x20, 0x26, 0x28, 0x28, 0x70, -0x29, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x0a, 0x73, 0x74, -0x61, 0x74, 0x69, 0x63, 0x20, 0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x67, 0x61, -0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, -0x28, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, -0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x76, 0x73, 0x74, -0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x5f, 0x72, 0x74, -0x6e, 0x28, 0x66, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x26, 0x72, 0x2e, -0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, -0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, -0x23, 0x70, 0x72, 0x61, 0x67, 0x6d, 0x61, 0x20, 0x4f, 0x50, 0x45, -0x4e, 0x43, 0x4c, 0x5f, 0x45, 0x58, 0x54, 0x45, 0x4e, 0x53, 0x49, -0x4f, 0x4e, 0x20, 0x63, 0x6c, 0x5f, 0x6b, 0x68, 0x72, 0x5f, 0x69, -0x6e, 0x74, 0x36, 0x34, 0x5f, 0x62, 0x61, 0x73, 0x65, 0x5f, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x73, 0x3a, 0x20, 0x65, 0x6e, 0x61, -0x62, 0x6c, 0x65, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, -0x32, 0x5f, 0x61, 0x64, 
0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, -0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x61, -0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, -0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, -0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, -0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, -0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, -0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, -0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, -0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, -0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, -0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, -0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, +0x66, 0x6c, 0x6f, 0x61, 0x74, 0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, +0x66, 0x20, 0x63, 0x6c, 0x5f, 0x6b, 0x68, 0x72, 0x5f, 0x66, 0x70, +0x36, 0x34, 0x0a, 0x23, 0x70, 0x72, 0x61, 0x67, 0x6d, 0x61, 0x20, +0x4f, 0x50, 0x45, 0x4e, 0x43, 0x4c, 0x20, 0x45, 0x58, 0x54, 0x45, +0x4e, 0x53, 0x49, 0x4f, 0x4e, 0x20, 0x63, 0x6c, 0x5f, 0x6b, 0x68, +0x72, 0x5f, 0x66, 0x70, 0x36, 0x34, 0x3a, 0x20, 0x65, 0x6e, 0x61, +0x62, 0x6c, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, +0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x0a, 0x23, 0x65, 0x6e, 0x64, +0x69, 0x66, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x75, 0x6c, 0x6f, +0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x67, 0x61, 
0x5f, 0x73, 0x73, 0x69, 0x7a, 0x65, 0x20, 0x6c, 0x6f, +0x6e, 0x67, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x47, 0x41, 0x5f, 0x44, 0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, +0x52, 0x45, 0x44, 0x5f, 0x50, 0x41, 0x52, 0x41, 0x4d, 0x28, 0x74, +0x79, 0x70, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x20, +0x2c, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x20, 0x74, +0x79, 0x70, 0x65, 0x20, 0x2a, 0x6e, 0x61, 0x6d, 0x65, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x44, +0x45, 0x43, 0x4c, 0x5f, 0x53, 0x48, 0x41, 0x52, 0x45, 0x44, 0x5f, +0x42, 0x4f, 0x44, 0x59, 0x28, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, +0x6e, 0x61, 0x6d, 0x65, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, +0x53, 0x49, 0x5a, 0x45, 0x20, 0x5f, 0x5f, 0x47, 0x41, 0x5f, 0x57, +0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x0a, 0x0a, 0x74, +0x79, 0x70, 0x65, 0x64, 0x65, 0x66, 0x20, 0x73, 0x74, 0x72, 0x75, +0x63, 0x74, 0x20, 0x5f, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x7b, 0x0a, 0x20, 0x20, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x64, +0x61, 0x74, 0x61, 0x3b, 0x0a, 0x7d, 0x20, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x3b, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, +0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x70, 0x29, 0x20, 0x76, 0x6c, +0x6f, 0x61, 0x64, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x30, 0x2c, +0x20, 0x26, 0x28, 0x28, 0x70, 0x29, 0x2e, 0x64, 0x61, 0x74, 0x61, +0x29, 0x29, 0x0a, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20, 0x69, +0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, +0x32, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x67, 0x61, 0x5f, 0x66, 0x6c, +0x6f, 0x61, 0x74, 0x20, 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x72, 0x3b, 0x0a, +0x20, 0x20, 0x76, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61, 
+0x6c, 0x66, 0x5f, 0x72, 0x74, 0x6e, 0x28, 0x66, 0x2c, 0x20, 0x30, +0x2c, 0x20, 0x26, 0x72, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, +0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x23, 0x70, 0x72, 0x61, 0x67, 0x6d, +0x61, 0x20, 0x4f, 0x50, 0x45, 0x4e, 0x43, 0x4c, 0x5f, 0x45, 0x58, +0x54, 0x45, 0x4e, 0x53, 0x49, 0x4f, 0x4e, 0x20, 0x63, 0x6c, 0x5f, +0x6b, 0x68, 0x72, 0x5f, 0x69, 0x6e, 0x74, 0x36, 0x34, 0x5f, 0x62, +0x61, 0x73, 0x65, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x73, +0x3a, 0x20, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x0a, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, +0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, +0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, +0x70, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, +0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, +0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, +0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61, +0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, +0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, +0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, +0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, -0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, -0x6e, 0x74, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, -0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x69, 0x6e, 0x74, 0x20, 0x61, 0x3b, 0x20, 0x20, 
0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x61, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x2a, 0x61, -0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, +0x20, 0x3d, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, +0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x20, -0x3d, 0x20, 0x70, 0x2e, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x2b, -0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, +0x70, 0x2e, 0x61, 0x20, 0x2b, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, -0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, -0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, -0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x2c, 0x20, -0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, -0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x29, 0x3b, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, +0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, +0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, +0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, +0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, +0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, +0x69, 0x6c, 0x65, 0x20, 0x28, 0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, +0x20, 0x61, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, -0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, +0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, -0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, -0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, -0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, +0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, +0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x72, 0x67, +0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, +0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, -0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, -0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, -0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, -0x79, 0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, -0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, -0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, -0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, -0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, -0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, -0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, +0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, +0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, +0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, +0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, +0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, +0x79, 0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, -0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 
0x67, 0x74, 0x79, 0x70, 0x65, +0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, +0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, +0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x3b, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x3d, 0x20, -0x2a, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, +0x2e, 0x61, 0x20, 0x3d, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, +0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, +0x3d, 0x20, 0x70, 0x2e, 0x61, 0x20, 0x2b, 0x20, 0x76, 0x61, 0x6c, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 
0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x70, 0x2e, 0x61, -0x20, 0x2b, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, +0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x63, 0x6d, +0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, +0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, +0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, +0x72, 0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, +0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x70, 0x2e, 0x77, 0x20, +0x21, 0x3d, 0x20, 0x61, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, -0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, -0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, -0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x2c, -0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, -0x20, 0x28, 0x70, 0x2e, 0x77, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x29, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, -0x6e, 0x20, 0x6e, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, +0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x20, 0x61, 0x73, +0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, +0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, +0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, +0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, +0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, +0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, +0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, -0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, -0x6e, 0x61, 0x6d, 0x65, 
0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, -0x70, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, -0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, -0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, -0x79, 0x70, 0x65, 0x20, 0x2a, 0x2c, 0x20, 0x61, 0x72, 0x67, 0x74, -0x79, 0x70, 0x65, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, -0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, -0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x61, 0x72, 0x67, 0x74, -0x79, 0x70, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, -0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x76, 0x61, 0x6c, -0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x72, +0x67, 0x74, 0x79, 0x70, 0x65, 0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x61, 0x72, 0x67, 0x74, 0x79, 0x70, 0x65, -0x20, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, +0x6e, 0x67, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x77, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, +0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x7d, 0x20, 0x70, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, +0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x6e, 0x2e, 0x61, 0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, +0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x28, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, +0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x6e, 0x2e, +0x77, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, +0x72, 0x6e, 0x20, 0x70, 0x2e, 0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x70, 0x2e, 0x77, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x28, 0x76, 0x6f, 0x6c, -0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, -0x65, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, -0x64, 0x72, 0x2c, 0x20, 0x6e, 0x2e, 0x77, 0x29, 0x3b, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x70, 0x2e, -0x61, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x7d, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, -0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, +0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, +0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, +0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, +0x65, 
0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, +0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, +0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x6c, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x6c, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, -0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, -0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 
0x20, -0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x69, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, -0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, -0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, -0x64, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x61, 0x64, 0x64, 0x28, +0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 
0x78, 0x63, 0x68, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, +0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, -0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, -0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, +0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 
0x78, 0x63, 0x68, 0x67, -0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, -0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, -0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, -0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, -0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x66, +0x6c, 0x6f, 0x61, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x67, 0x65, 0x6e, +0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, +0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, +0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, +0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, +0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, +0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x5f, 0x66, 0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, +0x61, 0x74, 0x2c, 0x20, 0x6c, 
0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, -0x2a, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, -0x2a, 0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, -0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x2c, 0x20, 0x67, 0x61, -0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2c, 0x20, 0x67, 0x6c, 0x6f, -0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, -0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x2c, 0x20, -0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2c, 0x20, 0x6c, -0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, -0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, -0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, -0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, -0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x67, 0x6c, 0x6f, -0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, -0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 
0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x2c, 0x20, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, +0x62, 0x6c, 0x65, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x64, +0x65, 0x66, 0x20, 0x63, 0x6c, 0x5f, 0x6b, 0x68, 0x72, 0x5f, 0x66, +0x70, 0x36, 0x34, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, +0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, +0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x67, +0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, +0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, +0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, +0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, +0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, +0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, +0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, +0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, +0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, -0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, -0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x28, 
0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, -0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, -0x6c, 0x65, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, -0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, 0x2c, 0x20, 0x67, 0x61, -0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x6c, 0x6f, -0x63, 0x61, 0x6c, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, -0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, -0x6f, 0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, -0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, +0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, +0x69, 0x66, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, +0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, +0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, -0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, -0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, -0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, -0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 
-0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, -0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, -0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, -0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, -0x69, 0x64, 0x78, 0x20, 0x3d, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, -0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, -0x20, 0x32, 0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, 0x3b, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, -0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, -0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, 0x6c, 0x61, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, -0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, -0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, -0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, +0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, +0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, +0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 
0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, +0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, +0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x64, +0x78, 0x20, 0x3d, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, +0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, +0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x76, 0x6f, +0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, +0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, +0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, +0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, +0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, +0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, +0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, +0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, 0x32, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, 0x32, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, 0x20, 0x61, 0x2c, -0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6f, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, +0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6f, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, -0x6f, 0x61, 0x74, 0x20, 0x66, 0x76, 0x61, 0x6c, 
0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, +0x74, 0x20, 0x66, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x76, 0x61, 0x6c, 0x20, -0x3d, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, -0x6c, 0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x3d, 0x20, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, +0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x2a, 0x62, -0x61, 0x73, 0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, +0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, +0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, -0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x2e, 0x69, +0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x20, 0x3d, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, -0x6f, 0x61, 0x74, 0x28, 0x6f, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, -0x5d, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, -0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x20, 0x3d, 0x20, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, +0x74, 0x28, 0x6f, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x29, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x6e, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x20, 0x3d, -0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, -0x61, 0x6c, 0x66, 0x28, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, -0x66, 0x6f, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, -0x69, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, -0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x62, 0x61, 0x73, -0x65, 0x2c, 0x20, 0x61, 0x2e, 0x69, 0x2c, 0x20, 0x6e, 0x2e, 0x69, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, +0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x20, 0x3d, 0x20, 0x67, +0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, +0x66, 0x28, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x66, 0x6f, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, -0x20, 0x28, 0x6f, 0x2e, 0x69, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x2e, -0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, +0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 
0x63, 0x5f, 0x63, 0x6d, +0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, +0x20, 0x61, 0x2e, 0x69, 0x2c, 0x20, 0x6e, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, +0x6f, 0x2e, 0x69, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x2e, 0x69, 0x29, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x68, -0x5b, 0x69, 0x64, 0x78, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, +0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, +0x64, 0x78, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, -0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, -0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, -0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, -0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, -0x70, 0x61, 0x63, 0x65, 0x20, 0x67, 
0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, -0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, -0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, -0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, -0x74, 0x20, 0x69, 0x64, 0x78, 0x20, 0x3d, 0x20, 0x28, 0x28, 0x67, -0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, -0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, 0x3b, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, +0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, +0x6d, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, -0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, -0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, -0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, -0x67, 0x61, 0x5f, 0x73, 
0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, -0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, +0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, +0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, +0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, +0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, +0x69, 0x64, 0x78, 0x20, 0x3d, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, +0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, +0x20, 0x32, 0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, +0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, +0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, 0x6c, 0x61, +0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, +0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, +0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, +0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, 0x32, -0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, 0x32, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, 0x20, -0x61, 0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, 0x20, 0x61, 0x2c, +0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x2a, 0x62, -0x61, 0x73, 0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, +0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x64, 0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, +0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, -0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x2e, 0x69, +0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x69, 0x20, -0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x69, 0x20, 0x3d, 0x20, +0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, -0x5d, 0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x20, +0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x69, 0x63, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, -0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x2e, 0x69, 0x2c, 0x20, -0x6e, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, +0x2e, 0x69, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x62, 0x61, +0x73, 0x65, 0x2c, 0x20, 0x61, 0x2e, 0x69, 0x2c, 0x20, 0x6e, 0x2e, +0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, -0x69, 
0x6c, 0x65, 0x20, 0x28, 0x6f, 0x2e, 0x69, 0x20, 0x21, 0x3d, -0x20, 0x61, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, +0x65, 0x20, 0x28, 0x6f, 0x2e, 0x69, 0x20, 0x21, 0x3d, 0x20, 0x61, +0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, -0x6f, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6f, 0x2e, +0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, -0x0a, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, +0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x61, +0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x5f, 0x65, 0x67, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, +0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x65, 0x67, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, -0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, -0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 
-0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x6c, 0x2c, 0x20, 0x6c, 0x6f, -0x63, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, -0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x2c, -0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, -0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x65, 0x6c, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, -0x0a, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x00}; +0x64, 0x64, 0x5f, 0x65, 0x6c, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, +0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, +0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x2c, 0x20, 0x67, +0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, +0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, +0x6c, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x0a, +0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x00}; From 4285e201a9a04222db6f3b5d72cffa3fd9f9a029 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Mon, 18 Sep 2017 16:12:25 -0400 Subject: [PATCH 557/597] cluda_opencl: cl_khr_int64_base_atomics is optional extension Signed-off-by: Jan Vesely --- src/cluda_opencl.h | 14 +- src/cluda_opencl.h.c | 445 ++++++++++++++++++++++--------------------- 2 files changed, 236 insertions(+), 223 deletions(-) diff --git a/src/cluda_opencl.h b/src/cluda_opencl.h index f48bbc7bf4..6e0095c87f 100644 --- a/src/cluda_opencl.h +++ b/src/cluda_opencl.h @@ -112,6 +112,14 @@ static inline ga_half ga_float2half(ga_float f) { #define atom_add_Il(a, b) atomic_add(a, b) #define atom_xchg_Ig(a, b) atomic_xchg(a, b) #define atom_xchg_Il(a, b) atomic_xchg(a, b) +/* ga_float */ +gen_atom32_add(atom_add_fg, ga_float, global) 
+gen_atom32_add(atom_add_fl, ga_float, local) +#define atom_xchg_fg(a, b) atomic_xchg(a, b) +#define atom_xchg_fl(a, b) atomic_xchg(a, b) + +#ifdef cl_khr_int64_base_atomics +#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable /* ga_long */ #define atom_add_lg(a, b) atom_add(a, b) #define atom_add_ll(a, b) atom_add(a, b) @@ -122,11 +130,6 @@ static inline ga_half ga_float2half(ga_float f) { #define atom_add_Ll(a, b) atom_add(a, b) #define atom_xchg_Lg(a, b) atom_xchg(a, b) #define atom_xchg_Ll(a, b) atom_xchg(a, b) -/* ga_float */ -gen_atom32_add(atom_add_fg, ga_float, global) -gen_atom32_add(atom_add_fl, ga_float, local) -#define atom_xchg_fg(a, b) atomic_xchg(a, b) -#define atom_xchg_fl(a, b) atomic_xchg(a, b) /* ga_double */ #ifdef cl_khr_fp64 gen_atom64_add(atom_add_dg, ga_double, global) @@ -134,6 +137,7 @@ gen_atom64_add(atom_add_dl, ga_double, local) gen_atom64_xchg(atom_xchg_dg, ga_double, global) gen_atom64_xchg(atom_xchg_dl, ga_double, local) #endif +#endif /* ga_half */ #define gen_atomh_add(name, aspace) \ ga_half name(volatile aspace ga_half *addr, ga_half val); \ diff --git a/src/cluda_opencl.h.c b/src/cluda_opencl.h.c index 02d3c87485..ced789f979 100644 --- a/src/cluda_opencl.h.c +++ b/src/cluda_opencl.h.c @@ -412,348 +412,357 @@ static const char cluda_opencl_h[] = { 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, -0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, +0x61, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, +0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, +0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x2c, +0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2c, 0x20, +0x67, 0x6c, 0x6f, 
0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, +0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, +0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, +0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, +0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, +0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, 0x66, 0x20, 0x63, 0x6c, 0x5f, +0x6b, 0x68, 0x72, 0x5f, 0x69, 0x6e, 0x74, 0x36, 0x34, 0x5f, 0x62, +0x61, 0x73, 0x65, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x73, +0x0a, 0x23, 0x70, 0x72, 0x61, 0x67, 0x6d, 0x61, 0x20, 0x4f, 0x50, +0x45, 0x4e, 0x43, 0x4c, 0x20, 0x45, 0x58, 0x54, 0x45, 0x4e, 0x53, +0x49, 0x4f, 0x4e, 0x20, 0x63, 0x6c, 0x5f, 0x6b, 0x68, 0x72, 0x5f, +0x69, 0x6e, 0x74, 0x36, 0x34, 0x5f, 0x62, 0x61, 0x73, 0x65, 0x5f, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x73, 0x3a, 0x20, 0x65, 0x6e, +0x61, 0x62, 0x6c, 0x65, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, -0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 
0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, +0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, -0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, +0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, +0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 
0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, -0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, -0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x66, -0x6c, 0x6f, 0x61, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x67, 0x65, 0x6e, -0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, 0x64, 0x64, -0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, -0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, -0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, -0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x33, 0x32, 0x5f, 0x61, -0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x66, 0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, -0x61, 0x74, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, -0x63, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x64, 
0x6f, 0x75, -0x62, 0x6c, 0x65, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x64, -0x65, 0x66, 0x20, 0x63, 0x6c, 0x5f, 0x6b, 0x68, 0x72, 0x5f, 0x66, -0x70, 0x36, 0x34, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, -0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, -0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x67, -0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, -0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, 0x64, 0x28, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, -0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, -0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, -0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, -0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, +0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, +0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x2f, +0x0a, 0x23, 0x69, 0x66, 0x64, 0x65, 0x66, 0x20, 0x63, 0x6c, 0x5f, +0x6b, 0x68, 0x72, 0x5f, 0x66, 0x70, 0x36, 0x34, 0x0a, 0x67, 0x65, +0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x61, 0x64, +0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, +0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, +0x6c, 0x65, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, +0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 
0x6f, 0x6d, 0x36, 0x34, +0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x5f, 0x64, 0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, +0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x6c, 0x2c, 0x20, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x2c, 0x20, -0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, -0x69, 0x66, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, -0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, -0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, -0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, -0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, -0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, -0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x67, 0x61, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, -0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, -0x61, 0x6c, 0x29, 0x20, 0x7b, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x64, -0x78, 0x20, 0x3d, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, -0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, -0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, 0x3b, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x76, 0x6f, -0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, -0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, -0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, -0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, -0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, +0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, +0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x36, 0x34, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x64, 0x6c, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, +0x62, 0x6c, 0x65, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, +0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x23, 0x65, 0x6e, +0x64, 0x69, 0x66, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, +0x6d, 0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x6e, 0x61, 0x6d, 0x65, +0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, +0x61, 0x74, 0x69, 
0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, +0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, +0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, +0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, +0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, +0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, +0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x69, +0x64, 0x78, 0x20, 0x3d, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, -0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, +0x32, 0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x76, +0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, +0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, +0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, +0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, +0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, +0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, +0x20, 0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, -0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, 0x32, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, 0x32, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, 0x20, 0x61, 0x2c, 0x20, 0x6e, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, 0x20, 0x61, 0x2c, 0x20, +0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, -0x6c, 0x6f, 
0x61, 0x74, 0x20, 0x66, 0x6f, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x6f, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, -0x74, 0x20, 0x66, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, +0x61, 0x74, 0x20, 0x66, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x3d, 0x20, -0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, -0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, -0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x3d, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, +0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, +0x73, 0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, -0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, +0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x2e, 0x69, -0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x2e, +0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x20, 0x3d, 0x20, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, -0x74, 0x28, 0x6f, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x29, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x6e, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x20, 0x3d, 0x20, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, +0x61, 0x74, 0x28, 0x6f, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, +0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x6e, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, +0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, -0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x20, 0x3d, 0x20, 0x67, -0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, 0x6c, -0x66, 0x28, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x66, 0x6f, -0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, -0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x63, 0x6d, -0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, -0x20, 0x61, 0x2e, 0x69, 0x2c, 0x20, 0x6e, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 
0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, -0x6f, 0x2e, 0x69, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x2e, 0x69, 0x29, -0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x20, 0x3d, 0x20, +0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, +0x6c, 0x66, 0x28, 0x66, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x66, +0x6f, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x2e, 0x69, +0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x5f, 0x63, +0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x62, 0x61, 0x73, 0x65, +0x2c, 0x20, 0x61, 0x2e, 0x69, 0x2c, 0x20, 0x6e, 0x2e, 0x69, 0x29, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, -0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, -0x64, 0x78, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, +0x28, 0x6f, 0x2e, 0x69, 0x20, 0x21, 0x3d, 0x20, 0x61, 0x2e, 0x69, +0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x68, 0x5b, +0x69, 0x64, 0x78, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
-0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, 0x65, -0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, -0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, 0x61, -0x6d, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, -0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, -0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, -0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x65, 0x6e, 0x5f, 0x61, +0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x6e, +0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, +0x29, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, -0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, -0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, -0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, -0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, -0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, -0x69, 0x64, 0x78, 0x20, 0x3d, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, -0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, -0x20, 0x32, 0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, 0x3b, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, +0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, +0x61, 0x63, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, -0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, -0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, 0x6c, 0x61, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x28, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, 0x65, -0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, +0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, +0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, +0x66, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, +0x20, 0x69, 0x64, 0x78, 0x20, 0x3d, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, -0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, +0x26, 0x20, 0x32, 0x29, 0x20, 0x3e, 0x3e, 0x20, 0x31, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x76, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6c, 0x65, 
0x20, 0x61, +0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, +0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x76, 0x6f, 0x6c, +0x61, 0x74, 0x69, 0x6c, 0x65, 0x20, 0x61, 0x73, 0x70, 0x61, 0x63, +0x65, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, +0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, +0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x75, 0x6e, 0x69, 0x6f, 0x6e, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, 0x32, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x68, 0x5b, 0x32, 0x5d, +0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, -0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, 0x20, 
0x61, 0x2c, -0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x6f, 0x2c, 0x20, 0x61, +0x2c, 0x20, 0x6e, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, -0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, -0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, +0x73, 0x65, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, -0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x64, +0x6f, 0x20, 0x7b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x2e, 0x69, -0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, +0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x2e, +0x69, 0x20, 0x3d, 0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x69, 0x20, 0x3d, 0x20, -0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x2e, 0x69, 0x20, 0x3d, +0x20, 0x6f, 0x2e, 0x69, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x20, -0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x6e, 0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, +0x20, 0x3d, 0x20, 0x76, 0x61, 0x6c, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, -0x2e, 0x69, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x62, 0x61, -0x73, 0x65, 0x2c, 0x20, 0x61, 0x2e, 0x69, 0x2c, 0x20, 0x6e, 0x2e, -0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, -0x65, 0x20, 0x28, 0x6f, 0x2e, 0x69, 0x20, 0x21, 0x3d, 0x20, 0x61, +0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x6f, 0x2e, 0x69, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, +0x63, 0x5f, 0x63, 0x6d, 0x70, 0x78, 0x63, 0x68, 0x67, 0x28, 0x62, +0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x2e, 0x69, 0x2c, 0x20, 0x6e, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x5c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, +0x6c, 0x65, 0x20, 0x28, 0x6f, 0x2e, 0x69, 0x20, 0x21, 0x3d, 0x20, +0x61, 0x2e, 0x69, 0x29, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6f, 0x2e, -0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6f, +0x2e, 0x68, 0x5b, 0x69, 0x64, 0x78, 0x5d, 0x3b, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, -0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, -0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x61, -0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x65, 0x67, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, -0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 
0x6f, 0x6d, 0x68, -0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x65, 0x6c, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, +0x20, 0x20, 0x20, 0x20, 0x20, 0x5c, 0x0a, 0x20, 0x20, 0x7d, 0x0a, +0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, +0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x65, 0x67, 0x2c, 0x20, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, 0x6d, -0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x2c, 0x20, 0x67, -0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, -0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, -0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, -0x6c, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, 0x0a, -0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x00}; +0x68, 0x5f, 0x61, 0x64, 0x64, 0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x65, 0x6c, 0x2c, 0x20, 0x6c, 0x6f, 0x63, +0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, 0x5f, 0x61, 0x74, 0x6f, +0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x28, 0x61, 0x74, 0x6f, +0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x2c, 0x20, +0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x29, 0x0a, 0x67, 0x65, 0x6e, +0x5f, 0x61, 0x74, 0x6f, 0x6d, 0x68, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x28, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, +0x65, 0x6c, 0x2c, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x29, 0x0a, +0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x00}; From 99545026f00e8697bfa5b8fc326f1a808bbe659c Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Mon, 18 Sep 2017 18:35:15 -0400 Subject: [PATCH 558/597] pygpu: Don't change addres space when casting pointer. 
Fixes pygpu.tests.test_basic.test_triu and pygpu.tests.test_basic.test_tril on amdgpu/clover Signed-off-by: Jan Vesely --- pygpu/basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygpu/basic.py b/pygpu/basic.py index 51b92c5cc4..e054daade0 100644 --- a/pygpu/basic.py +++ b/pygpu/basic.py @@ -6,7 +6,7 @@ def _generate_kernel(ctx, cols, upper=True): tmpl = Template(""" #include "cluda.h" KERNEL void extract_tri(GLOBAL_MEM ga_float *a, ga_size a_off, ga_uint N) { - a = (GLOBAL_MEM ga_float *)(((char *)a) + a_off); + a = (GLOBAL_MEM ga_float *)(((GLOBAL_MEM char *)a) + a_off); unsigned int idx = GID_1 * LDIM_0 * GDIM_0 + GID_0 * LDIM_0 + LID_0; unsigned int ix = idx/${cols}; From ff234cf9f34ac9768e7bd81fa31c76e8a5f6488c Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 20 Sep 2017 14:07:11 -0400 Subject: [PATCH 559/597] Use the right encrypted token for appveyor. --- .appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.appveyor.yml b/.appveyor.yml index 895cbb4a40..839ae1a1ff 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -15,7 +15,7 @@ configuration: environment: BINSTAR_TOKEN: - secure: 4KDgW9K3omzJ/ILWm1ApjsvTjefHJuECRy7nmnvuUul+0gbAYTUm5JanY+X1pccu + secure: 58KqJcKtfCBVCuIzpnkLm4XZLQqKq95Hs8Ly20HWaMSla67nusrp3y4sy6XzZOBQ matrix: - CONDA_LOC: "C:\\Miniconda-x64" From 70a2d1245da6b62927d5b1aa46579977bc5898e2 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 27 Sep 2017 12:14:00 -0400 Subject: [PATCH 560/597] Complain when the version could not be determined instead of building with 0+unknown. 
--- setup.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index b185e0aef5..3de2bed819 100755 --- a/setup.py +++ b/setup.py @@ -138,11 +138,16 @@ def run(self): define_macros=[('GPUARRAY_SHARED', None)] )] -cmds=versioneer.get_cmdclass() +cmds = versioneer.get_cmdclass() cmds["clean"] = cmd_clean +version_data = versioneer.get_versions() + +if version_data['error'] is not None: + raise ValueError("Can't determine version for build: %s\n Please make sure that your git checkout includes tags." % (version_data['error'],)) + setup(name='pygpu', - version=versioneer.get_version(), + version=version_data['version'], cmdclass=cmds, description='numpy-like wrapper on libgpuarray for GPU computations', packages=['pygpu', 'pygpu/tests'], From 725255160d41eb5555b933839ca9510498e402be Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 29 Sep 2017 11:46:00 -0400 Subject: [PATCH 561/597] Add support for tensor cores in float16. --- src/gpuarray_blas_cuda_cublas.c | 36 +++++++++++++++++++++++---------- src/loaders/libcublas.fn | 2 ++ src/loaders/libcublas.h | 28 +++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 11 deletions(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index a8a95dd1fc..fd9d1f99a6 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -443,7 +443,7 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, ASSERT_BUF(B); ASSERT_BUF(C); - if (cublasSgemmEx == NULL) + if (cublasGemmEx == NULL && cublasSgemmEx == NULL) return error_set(ctx->err, GA_DEVSUP_ERROR, "cublasSgemmEx unavailable"); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || @@ -476,16 +476,30 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL)); - CUBLAS_EXIT_ON_ERROR(ctx, cublasSgemmEx(h->h, convT(transA), 
convT(transB), - M, N, K, - &alpha, ((uint16_t *)A->ptr) + offA, - CUDA_R_16F, - lda, ((uint16_t *)B->ptr) + offB, - CUDA_R_16F, - ldb, &beta, ((uint16_t *)C->ptr) + offC, - CUDA_R_16F, - ldc)); - + if (cublasGemmEx) { + CUBLAS_EXIT_ON_ERROR(ctx, cublasGemmEx(h->h, convT(transA), convT(transB), + M, N, K, + &alpha, ((uint16_t *)A->ptr) + offA, + CUDA_R_16F, + lda, ((uint16_t *)B->ptr) + offB, + CUDA_R_16F, + ldb, &beta, ((uint16_t *)C->ptr) + offC, + CUDA_R_16F, + ldc, + CUDA_R_32F, + CUBLAS_GEMM_DFALT_TENSOR_OP)); + } else { + CUBLAS_EXIT_ON_ERROR(ctx, cublasSgemmEx(h->h, convT(transA), convT(transB), + M, N, K, + &alpha, ((uint16_t *)A->ptr) + offA, + CUDA_R_16F, + lda, ((uint16_t *)B->ptr) + offB, + CUDA_R_16F, + ldb, &beta, ((uint16_t *)C->ptr) + offC, + CUDA_R_16F, + ldc)); + } + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C, CUDA_WAIT_ALL)); diff --git a/src/loaders/libcublas.fn b/src/loaders/libcublas.fn index c181c9f9b1..c97bc52386 100644 --- a/src/loaders/libcublas.fn +++ b/src/loaders/libcublas.fn @@ -21,6 +21,8 @@ DEF_PROC_V2(cublasDger, (cublasHandle_t handle, int m, int n, const double *alph DEF_PROC_OPT(cublasSgemmEx, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const void *A, cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb, const float *beta, void *C, cudaDataType Ctype, int ldc)); +DEF_PROC_OPT(cublasGemmEx, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const void *alpha, const void *A, cudaDataType_t Atype, int lda, const void *B, cudaDataType_t Btype, int ldb, const void *beta, void *C, cudaDataType_t Ctype, int ldc, cudaDataType_t computeType, cublasGemmAlgo_t algo)); + DEF_PROC(cublasSgemmBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, 
int k, const float *alpha, const float *Aarray[], int lda, const float *Barray[], int ldb, const float *beta, float *Carray[], int ldc, int batchCount)); DEF_PROC(cublasDgemmBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double *alpha, const double *Aarray[], int lda, const double *Barray[], int ldb, const double *beta, double *Carray[], int ldc, int batchCount)); diff --git a/src/loaders/libcublas.h b/src/loaders/libcublas.h index f6f46963dc..4ebf4e44f8 100644 --- a/src/loaders/libcublas.h +++ b/src/loaders/libcublas.h @@ -34,6 +34,34 @@ typedef enum cudaDataType_t CUDA_C_32U= 13 // complex as a pair of unsigned int numbers } cudaDataType; +typedef cudaDataType cudaDataType_t; + +typedef enum { + CUBLAS_GEMM_DFALT = -1, + CUBLAS_GEMM_ALGO0 = 0, + CUBLAS_GEMM_ALGO1 = 1, + CUBLAS_GEMM_ALGO2 = 2, + CUBLAS_GEMM_ALGO3 = 3, + CUBLAS_GEMM_ALGO4 = 4, + CUBLAS_GEMM_ALGO5 = 5, + CUBLAS_GEMM_ALGO6 = 6, + CUBLAS_GEMM_ALGO7 = 7, + CUBLAS_GEMM_ALGO8 = 8, + CUBLAS_GEMM_ALGO9 = 9, + CUBLAS_GEMM_ALGO10 = 10, + CUBLAS_GEMM_ALGO11 = 11, + CUBLAS_GEMM_ALGO12 = 12, + CUBLAS_GEMM_ALGO13 = 13, + CUBLAS_GEMM_ALGO14 = 14, + CUBLAS_GEMM_ALGO15 = 15, + CUBLAS_GEMM_ALGO16 = 16, + CUBLAS_GEMM_ALGO17 = 17, + CUBLAS_GEMM_DFALT_TENSOR_OP = 99, + CUBLAS_GEMM_ALGO0_TENSOR_OP = 100, + CUBLAS_GEMM_ALGO1_TENSOR_OP = 101, + CUBLAS_GEMM_ALGO2_TENSOR_OP = 102 +} cublasGemmAlgo_t; + typedef struct CUstream_st *cudaStream_t; typedef enum { From c193b874e09636903f801e0fe4b1ff0c18ea3480 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 26 Sep 2017 15:32:45 -0400 Subject: [PATCH 562/597] Don't attempt to use tensor core on CUDA < 9.0. 
--- src/gpuarray_blas_cuda_cublas.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index fd9d1f99a6..673cf0eb75 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -72,6 +72,7 @@ typedef struct _blas_handle { GpuKernel dgemvBH_T_a1_b1_small; GpuKernel sgerBH_gen_small; GpuKernel dgerBH_gen_small; + uint8_t tensorCore; } blas_handle; #define LARGE_VAL(v) (v >= INT_MAX) @@ -210,6 +211,13 @@ static int setup(gpucontext *c) { if (handle == NULL) return error_sys(ctx->err, "calloc"); + /* Only try to use tensor core on cuda 9 and up */ + if (ctx->major >= 9) { + handle->tensorCore = 1; + } else { + handle->tensorCore = 0; + } + cuda_enter(ctx); err = cublasCreate(&handle->h); if (err != CUBLAS_STATUS_SUCCESS) { @@ -443,8 +451,8 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, ASSERT_BUF(B); ASSERT_BUF(C); - if (cublasGemmEx == NULL && cublasSgemmEx == NULL) - return error_set(ctx->err, GA_DEVSUP_ERROR, "cublasSgemmEx unavailable"); + if (cublasSgemmEx == NULL && (cublasGemmEx == NULL || h->tensorCore == 0)) + return error_set(ctx->err, GA_DEVSUP_ERROR, "cublasSgemmEx|cublasGemmEx unavailable"); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || @@ -476,7 +484,7 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL)); - if (cublasGemmEx) { + if (cublasGemmEx != NULL && h->tensorCore) { CUBLAS_EXIT_ON_ERROR(ctx, cublasGemmEx(h->h, convT(transA), convT(transB), M, N, K, &alpha, ((uint16_t *)A->ptr) + offA, From cb1219abb19d25f0c9a0e90b988133cd119f6e5d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 29 Sep 2017 17:48:43 -0400 Subject: [PATCH 563/597] Actually check for the compute capability of the current device 
and only ask for tensor core when it is available. --- src/gpuarray_blas_cuda_cublas.c | 18 +++++++++++++++--- src/gpuarray_buffer_cuda.c | 2 +- src/private_cuda.h | 2 ++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 673cf0eb75..ba4ac2b37a 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -200,8 +200,10 @@ static const char *code_dgerBH_gen_small = \ static int setup(gpucontext *c) { cuda_context *ctx = (cuda_context *)c; blas_handle *handle; + CUdevice dev; cublasStatus_t err; int types[10]; + int major, minor; int e; if (ctx->blas_handle != NULL) @@ -211,14 +213,24 @@ static int setup(gpucontext *c) { if (handle == NULL) return error_sys(ctx->err, "calloc"); + cuda_enter(ctx); + { + CUresult err; + err = cuCtxGetDevice(&dev); + if (err != CUDA_SUCCESS) { + cuda_exit(ctx); + return error_cuda(ctx->err, "cuCtxGetDevice", err); + } + } + GA_CUDA_EXIT_ON_ERROR(ctx, get_cc(dev, &major, &minor, ctx->err)); + /* Only try to use tensor core on cuda 9 and up */ - if (ctx->major >= 9) { + if (ctx->major >= 9 && major >= 7 && minor >= 0) { handle->tensorCore = 1; } else { handle->tensorCore = 0; } - cuda_enter(ctx); err = cublasCreate(&handle->h); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); @@ -507,7 +519,7 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, CUDA_R_16F, ldc)); } - + GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C, CUDA_WAIT_ALL)); diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 37db905d23..26aea6de25 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1048,7 +1048,7 @@ static int cuda_memset(gpudata *dst, size_t dstoff, int data) { return GA_NO_ERROR; } -static int get_cc(CUdevice dev, int *maj, int *min, error *e) { +int get_cc(CUdevice 
dev, int *maj, int *min, error *e) { CUresult err; err = cuDeviceGetAttribute(maj, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, diff --git a/src/private_cuda.h b/src/private_cuda.h index f60b961d37..3e428e79f2 100644 --- a/src/private_cuda.h +++ b/src/private_cuda.h @@ -157,4 +157,6 @@ struct _gpukernel { #endif }; +int get_cc(CUdevice dev, int *maj, int *min, error *e); + #endif From 90f7f60c07e751a3da06c29f50da47cae74837be Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 2 Oct 2017 17:25:28 -0400 Subject: [PATCH 564/597] Raise an error when the compute capability of the GPU is too low for the current cuda version. --- src/gpuarray_buffer_cuda.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 26aea6de25..34aeb512aa 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -219,13 +219,31 @@ cuda_context *cuda_make_ctx(CUcontext ctx, gpucontext_props *p) { cache *mem_cache; const char *cache_path; void *pp; + CUdevice dev; CUresult err; + int cc_major, cc_minor; int e; e = setup_lib(); if (e != GA_NO_ERROR) return NULL; + err = cuCtxGetDevice(&dev); + if (err != CUDA_SUCCESS) { + error_cuda(global_err, "cuCtxGetDevice", err); + return NULL; + } + + e = get_cc(dev, &cc_major, &cc_minor, global_err); + if (e != GA_NO_ERROR) + return NULL; + + if ((major >= 9 && cc_major <= 2) || (major >= 7 && cc_major <= 1)) { + error_set(global_err, GA_UNSUPPORTED_ERROR, + "GPU is too old for CUDA version"); + return NULL; + } + res = calloc(1, sizeof(*res)); if (res == NULL) { error_sys(global_err, "calloc"); From 7648e6a4396d723d6070d344b93c479e26bfb99b Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 4 Oct 2017 16:15:03 -0400 Subject: [PATCH 565/597] Fix wrong index in the generation code for scalars. 
--- src/gpuarray_elemwise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index fa5b0efa2c..776b386fcd 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -177,7 +177,7 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx, ktypes[p++] = GA_SSIZE; } } else { - strb_appendf(&sb, "%s %s", ctype(args[i].typecode), args[j].name); + strb_appendf(&sb, "%s %s", ctype(args[j].typecode), args[j].name); ktypes[p++] = args[j].typecode; } if (j != (n - 1)) strb_appends(&sb, ", "); From a2d0dd9f4942f7f8fb4da84e6a6821c1594b52d2 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 5 Oct 2017 12:35:44 -0400 Subject: [PATCH 566/597] Add test for the scalar dtype error. --- tests/check_elemwise.c | 55 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/tests/check_elemwise.c b/tests/check_elemwise.c index 9695f7b30f..df6fe36af7 100644 --- a/tests/check_elemwise.c +++ b/tests/check_elemwise.c @@ -434,6 +434,60 @@ START_TEST(test_basic_scalar) { } END_TEST +START_TEST(test_basic_scalar_dtype) { + GpuArray x; + GpuArray y; + float a = 1.1f; + + GpuElemwise *ge; + + static const int32_t data1[4] = {0, 1, 2, 3}; + static const float data2[4] = {2.0, 2.0, 2.0, 2.0}; + float data3[4]; + + size_t dims[2] = {2, 2}; + + gpuelemwise_arg args[3] = {{0}}; + void *rargs[3]; + + ga_assert_ok(GpuArray_empty(&x, ctx, GA_UINT, 2, dims, GA_C_ORDER)); + ga_assert_ok(GpuArray_write(&x, data1, sizeof(data1))); + + ga_assert_ok(GpuArray_empty(&y, ctx, GA_FLOAT, 2, dims, GA_F_ORDER)); + ga_assert_ok(GpuArray_write(&y, data2, sizeof(data2))); + + args[0].name = "a"; + args[0].typecode = GA_FLOAT; + args[0].flags = GE_SCALAR; + + args[1].name = "x"; + args[1].typecode = GA_INT; + args[1].flags = GE_READ; + + args[2].name = "y"; + args[2].typecode = GA_FLOAT; + args[2].flags = GE_READ|GE_WRITE; + + ge = GpuElemwise_new(ctx, "", "y = a * x + y", 3, args, 2, 0); + + 
ck_assert_ptr_ne(ge, NULL); + + rargs[0] = &a; + rargs[1] = &x; + rargs[2] = &y; + + ga_assert_ok(GpuElemwise_call(ge, rargs, 0)); + + ga_assert_ok(GpuArray_read(data3, sizeof(data3), &y)); + + ck_assert_float_eq(data3[0], 2.0f); + ck_assert_float_eq(data3[1], 4.2f); + + ck_assert_float_eq(data3[2], 3.1f); + ck_assert_float_eq(data3[3], 5.3f); +} +END_TEST + START_TEST(test_basic_remove1) { GpuArray a; GpuArray b; @@ -820,6 +874,7 @@ Suite *get_suite(void) { tcase_add_test(tc, test_basic_simple); tcase_add_test(tc, test_basic_f16); tcase_add_test(tc, test_basic_scalar); + tcase_add_test(tc, test_basic_scalar_dtype); tcase_add_test(tc, test_basic_offset); tcase_add_test(tc, test_basic_remove1); tcase_add_test(tc, test_basic_broadcast); From d3611f3b66352915fc01c81c235f3da8b81cff41 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 5 Oct 2017 12:59:24 -0400 Subject: [PATCH 567/597] Add compat for libcheck < 0.11. --- tests/check_elemwise.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/check_elemwise.c b/tests/check_elemwise.c index df6fe36af7..d615df43e0 100644 --- a/tests/check_elemwise.c +++ b/tests/check_elemwise.c @@ -6,6 +6,26 @@ #include "gpuarray/error.h" #include "gpuarray/types.h" +#if CHECK_MINOR_VERSION < 11 + +#ifndef CK_FLOATING_DIG +# define CK_FLOATING_DIG 6 +#endif /* CK_FLOATING_DIG */ + +#define _ck_assert_floating(X, OP, Y, TP, TM) do { \ + TP _ck_x = (X); \ + TP _ck_y = (Y); \ + ck_assert_msg(_ck_x OP _ck_y, \ + "Assertion '%s' failed: %s == %.*"TM"g, %s == %.*"TM"g", \ + #X" "#OP" "#Y, \ + #X, (int)CK_FLOATING_DIG, _ck_x, \ + #Y, (int)CK_FLOATING_DIG, _ck_y); \ + } while (0) + +#define ck_assert_float_eq(X, Y) _ck_assert_floating(X, ==, Y, float, "") +#endif + + extern void *ctx; void setup(void); From c08f33c1e7eb1ef745a86c7ea123c8cc8a5516d0 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 5 Oct 2017 13:39:45 -0400 Subject: [PATCH 568/597] Fix x dtype. 
--- tests/check_elemwise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/check_elemwise.c b/tests/check_elemwise.c index d615df43e0..c65142514e 100644 --- a/tests/check_elemwise.c +++ b/tests/check_elemwise.c @@ -470,7 +470,7 @@ START_TEST(test_basic_scalar_dtype) { gpuelemwise_arg args[3] = {{0}}; void *rargs[3]; - ga_assert_ok(GpuArray_empty(&x, ctx, GA_UINT, 2, dims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&x, ctx, GA_INT, 2, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&x, data1, sizeof(data1))); ga_assert_ok(GpuArray_empty(&y, ctx, GA_FLOAT, 2, dims, GA_F_ORDER)); From f6672467008b159190dc83399e0be3a3b222a28a Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 10 Oct 2017 14:59:04 -0400 Subject: [PATCH 569/597] Change base build image to accomodate for old libc. --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 46fd53b800..0e9ae53451 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -3,7 +3,7 @@ version: 2 jobs: build_pkgs: docker: - - image: milaudem/libgpuarray:0 + - image: joaander/conda-build:20170905 steps: - checkout From 39d37bdfda792324ce09feb090f6c80fdd443369 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 10 Oct 2017 15:42:24 -0400 Subject: [PATCH 570/597] Adjust path of artifacts. 
--- .circleci/config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 0e9ae53451..09f05ff1ca 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -27,11 +27,11 @@ jobs: command: | if [[ -n "${CIRCLE_TAG}" ]] then - anaconda -t $BINSTAR_TOKEN upload --user=mila-udem /miniconda/conda-bld/linux-64/libgpuarray* - anaconda -t $BINSTAR_TOKEN upload --user=mila-udem /miniconda/conda-bld/linux-64/pygpu* + anaconda -t $BINSTAR_TOKEN upload --user=mila-udem /opt/conda/conda-bld/linux-64/libgpuarray* + anaconda -t $BINSTAR_TOKEN upload --user=mila-udem /opt/conda/conda-bld/linux-64/pygpu* fi - store_artifacts: - path: /miniconda/conda-bld/linux-64 + path: /opt/conda/conda-bld/linux-64 workflows: version: 2 From ed3200fdd048ccae7bd011da10aa0fe037dbd81f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 10 Oct 2017 16:12:32 -0400 Subject: [PATCH 571/597] Try to only patch VS2008 when required. --- .appveyor.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 839ae1a1ff..6f87c22b13 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -19,8 +19,11 @@ environment: matrix: - CONDA_LOC: "C:\\Miniconda-x64" + PATCH_VS2008: "1" - CONDA_LOC: "C:\\Miniconda35-x64" + PATCH_VS2008: "0" - CONDA_LOC: "C:\\Miniconda36-x64" + PATCH_VS2008: "0" install: # This breaks conda-build because of git @@ -29,9 +32,12 @@ install: - cmd: set PYTHONUNBUFFERED=1 - cmd: conda install -n root --yes conda conda-env conda-build anaconda-client # We borrow a trick from conda-forge to fix the VS2008 compiler - - cmd: conda config --append channels conda-forge - - cmd: conda install --yes vs2008_express_vc_python_patch - - cmd: call setup_x64 + - ps: | + if($env:PATCH_VS2008 -eq '1') { + cmd /c "conda config --append channels conda-forge 2>&1" + cmd /c "conda install --yes vs2008_express_vc_python_patch 2>&1" + cmd /c "call setup_x64 2>&1" + } build: off 
From 6d8c5db9bbed02fa86c6feeec19bc16cac03dcb6 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Fri, 13 Oct 2017 14:51:49 -0400 Subject: [PATCH 572/597] Fix device number formatting for OpenCL. --- src/gpuarray_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index e38c19795f..9eb9ca0ce4 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -56,7 +56,7 @@ int gpucontext_props_cuda_dev(gpucontext_props *p, int devno) { } int gpucontext_props_opencl_dev(gpucontext_props *p, int platno, int devno) { - p->dev = platno << 16 || devno; + p->dev = (platno << 16) | devno; return GA_NO_ERROR; } From cc6c4fc8da3b07b42df0a5d4fa8e07176127a9e1 Mon Sep 17 00:00:00 2001 From: Pierre Ambrosini Date: Thu, 26 Oct 2017 20:02:14 +0200 Subject: [PATCH 573/597] add CUDA 9 version when trying to load libnvrtc --- src/gpuarray_buffer_cuda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 34aeb512aa..7dfe800b29 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -176,7 +176,7 @@ static int setup_lib(void) { res = load_libnvrtc(major, minor, global_err); if (res != GA_NO_ERROR) { /* Else, let's try to find a nvrtc corresponding to supported CUDA versions. */ - int versions[][2] = {{8, 0}, {7, 5}, {7, 0}}; + int versions[][2] = {{9, 0}, {8, 0}, {7, 5}, {7, 0}}; int versions_length = sizeof(versions) / sizeof(versions[0]); int i = 0; /* Skip versions that are higher or equal to the driver version */ From db7206150166a529fbd74cbab910ba4b0f76bbf5 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 14 Dec 2017 16:49:29 -0500 Subject: [PATCH 574/597] Add 9.1 to the list of versions that windows looks for. 
--- src/gpuarray_buffer_cuda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 7dfe800b29..f518ce901b 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -176,7 +176,7 @@ static int setup_lib(void) { res = load_libnvrtc(major, minor, global_err); if (res != GA_NO_ERROR) { /* Else, let's try to find a nvrtc corresponding to supported CUDA versions. */ - int versions[][2] = {{9, 0}, {8, 0}, {7, 5}, {7, 0}}; + int versions[][2] = {{9, 1}, {9, 0}, {8, 0}, {7, 5}, {7, 0}}; int versions_length = sizeof(versions) / sizeof(versions[0]); int i = 0; /* Skip versions that are higher or equal to the driver version */ From 52074ef493b1f346a683e7006787c8b0bfa7f44f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 18 Dec 2017 14:05:20 -0500 Subject: [PATCH 575/597] Apparently travis needs an explicit version of pip. --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index e8236b2f4e..635c1ba107 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ before_install: - export PREFIX=$HOME/.local - brew update && brew install doxygen - export PYTHONUSERBASE=$PREFIX - - pip install --user breathe sphinx==1.6.3 sphinx_rtd_theme cython numpy 'mako>=0.7' six + - pip2 install --user breathe sphinx==1.6.3 sphinx_rtd_theme cython numpy 'mako>=0.7' six - export PATH=$PATH:$PREFIX/bin - export CPATH=$CPATH:$PREFIX/include - export LIBRARY_PATH=$LIBRARY_PATH:$PREFIX/lib From 1f4457fc85fd37a6b8a46cb5eada9c7bc46f537d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 8 Jan 2018 14:01:45 -0500 Subject: [PATCH 576/597] Add possible cause for the Library not initialized error message from cublas. 
--- src/gpuarray_blas_cuda_cublas.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index ba4ac2b37a..890ecc5ee9 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -53,7 +53,8 @@ static const char *estr(cublasStatus_t err) { static inline int error_cublas(error *e, const char *msg, cublasStatus_t err) { return error_fmt(e, (err == CUBLAS_STATUS_ARCH_MISMATCH) ? GA_DEVSUP_ERROR : GA_BLAS_ERROR, - "%s: %s", msg, estr(err)); + "%s: %s %s", msg, estr(err), + err == CUBLAS_STATUS_NOT_INITIALIZED ? "(Possibly because the driver version is too old for the cuda version)" : ""); } #define CUBLAS_EXIT_ON_ERROR(ctx, cmd) do { \ From cda95ed738e4fc08b22826fc9e96b72a00f08a4d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 8 Jan 2018 14:26:07 -0500 Subject: [PATCH 577/597] Move the space around. --- src/gpuarray_blas_cuda_cublas.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 890ecc5ee9..33abb44ea2 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -53,8 +53,8 @@ static const char *estr(cublasStatus_t err) { static inline int error_cublas(error *e, const char *msg, cublasStatus_t err) { return error_fmt(e, (err == CUBLAS_STATUS_ARCH_MISMATCH) ? GA_DEVSUP_ERROR : GA_BLAS_ERROR, - "%s: %s %s", msg, estr(err), - err == CUBLAS_STATUS_NOT_INITIALIZED ? "(Possibly because the driver version is too old for the cuda version)" : ""); + "%s: %s%s", msg, estr(err), + err == CUBLAS_STATUS_NOT_INITIALIZED ? 
" (Possibly because the driver version is too old for the cuda version)" : ""); } #define CUBLAS_EXIT_ON_ERROR(ctx, cmd) do { \ From 26075ec6f6d848fe34b026a67b1214865a3a70ba Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 10 Jan 2018 09:46:21 -0500 Subject: [PATCH 578/597] Small doc update --- doc/installation.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/installation.rst b/doc/installation.rst index 17e189987d..dfd96f5ace 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -123,7 +123,7 @@ computer, you can install them in your home directory like this: :: cd - rm -rf ~/.local/lib/libgpuarray* ~/.local/include/gpuarray + rm -rf ~/.local/lib/libgpuarray* ~/.local/include/gpuarray ~/.local/lib/python*/site-packages/pygpu* rm -rf build Build mkdir Build cd Build @@ -217,7 +217,7 @@ If you get an error message similar to this one: This means either you don't have check installed or it wasn't found by the cmake detection script. -To run the python tests, install pygpu, then move outside its +To run the python tests, install pygpu, then **move outside** its directory and run this command: :: From 5dfbff30ddf27947e78c517aec09e91633b9cd09 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 10 Jan 2018 09:52:10 -0500 Subject: [PATCH 579/597] Re-raise the error when needed. 
--- pygpu/gpuarray.pyx | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index a1e38bc4cf..5b9dd89fd4 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -634,15 +634,22 @@ def init(dev, sched='default', single_stream=False, kernel_cache_path=None, raise MemoryError try: if sched == 'single': - gpucontext_props_sched(p, GA_CTX_SCHED_SINGLE) + err = gpucontext_props_sched(p, GA_CTX_SCHED_SINGLE) elif sched == 'multi': - gpucontext_props_sched(p, GA_CTX_SCHED_MULTI) + err = gpucontext_props_sched(p, GA_CTX_SCHED_MULTI) elif sched != 'default': raise TypeError('unexpected value for parameter sched: %s' % (sched,)) + if err != GA_NO_ERROR: + raise get_exc(err), gpucontext_error(NULL, err) + if kernel_cache_path: kernel_cache_path_b = _s(kernel_cache_path) gpucontext_props_kernel_cache(p, kernel_cache_path_b) - gpucontext_props_alloc_cache(p, initial_cache_size, max_cache_size) + + err = gpucontext_props_alloc_cache(p, initial_cache_size, + max_cache_size) + if err != GA_NO_ERROR: + raise get_exc(err), gpucontext_error(NULL, err) if single_stream: gpucontext_props_set_single_stream(p); except: From a2d404d09d1759b29e5e5ee3bcf9d19148a53dee Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 30 Jan 2018 11:50:03 -0500 Subject: [PATCH 580/597] Make the life of people regularly running debug build easier. --- src/util/error.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/util/error.c b/src/util/error.c index 19ce184363..b523eccf6d 100644 --- a/src/util/error.c +++ b/src/util/error.c @@ -24,7 +24,7 @@ int error_set(error *e, int code, const char *msg) { e->code = code; strlcpy(e->msg, msg, ERROR_MSGBUF_LEN); #ifdef DEBUG - fprintf(stderr, "ERROR %d: %s\n", e->code, e->msg); + fprintf(stderr, "(Debug) ERROR %d: %s\n", e->code, e->msg); #endif return code; } @@ -37,7 +37,7 @@ int error_fmt(error *e, int code, const char *fmt, ...) 
{ vsnprintf(e->msg, ERROR_MSGBUF_LEN, fmt, ap); va_end(ap); #ifdef DEBUG - fprintf(stderr, "ERROR %d: %s\n", e->code, e->msg); + fprintf(stderr, "(Debug) ERROR %d: %s\n", e->code, e->msg); #endif return code; } From ac58a79071473fbb31f8b5b23a751e8756f2fe14 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Wed, 4 Apr 2018 21:41:24 -0400 Subject: [PATCH 581/597] cluda_opencl: Use round to nearest even to match cpu version Fixes test_elemwise_f16 add/iadd on amdgcn and possibly other OpenCL implementations. Fixes #462 Signed-off-by: Jan Vesely --- src/cluda_opencl.h | 2 +- src/cluda_opencl.h.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cluda_opencl.h b/src/cluda_opencl.h index 6e0095c87f..8997fcb465 100644 --- a/src/cluda_opencl.h +++ b/src/cluda_opencl.h @@ -50,7 +50,7 @@ typedef struct _ga_half { #define ga_half2float(p) vload_half(0, &((p).data)) static inline ga_half ga_float2half(ga_float f) { ga_half r; - vstore_half_rtn(f, 0, &r.data); + vstore_half_rte(f, 0, &r.data); return r; } diff --git a/src/cluda_opencl.h.c b/src/cluda_opencl.h.c index ced789f979..4d6785c40c 100644 --- a/src/cluda_opencl.h.c +++ b/src/cluda_opencl.h.c @@ -122,7 +122,7 @@ static const char cluda_opencl_h[] = { 0x6f, 0x61, 0x74, 0x20, 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x76, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x5f, 0x72, 0x74, 0x6e, 0x28, 0x66, 0x2c, 0x20, 0x30, +0x6c, 0x66, 0x5f, 0x72, 0x74, 0x65, 0x28, 0x66, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x26, 0x72, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x23, 0x70, 0x72, 0x61, 0x67, 0x6d, From 0c25f2cf2e233cccba1be84654253a28be2cd474 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Thu, 5 Apr 2018 15:12:31 -0400 Subject: [PATCH 582/597] pygpu/tests: Do not compare gpudata if the context is not CUDA Fixes #491 v2: Do not 
skip the entire test Signed-off-by: Jan Vesely --- pygpu/tests/test_gpu_ndarray.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pygpu/tests/test_gpu_ndarray.py b/pygpu/tests/test_gpu_ndarray.py index a7fc03a611..ef44cbcd41 100644 --- a/pygpu/tests/test_gpu_ndarray.py +++ b/pygpu/tests/test_gpu_ndarray.py @@ -195,7 +195,7 @@ def asfortranarray(shp, dtype, offseted_outer, offseted_inner, sliced, order): # numpy upcast with a view to 1d scalar. if gpu.flags['F_CONTIGUOUS']: - assert b.gpudata == gpu.gpudata + assert ctx.kind != b'cuda' or b.gpudata == gpu.gpudata elif (sliced != 1 or shp == () or (offseted_outer and len(shp) > 1) or (order != 'f' and len(shp) > 1)): assert b is not gpu @@ -286,7 +286,8 @@ def test_mapping_getitem_ellipsis(): def mapping_getitem_ellipsis(shp, dtype, offseted): a, a_gpu = gen_gpuarray(shp, dtype, offseted, ctx=ctx) b = a_gpu[...] - assert b.gpudata == a_gpu.gpudata + if ctx.kind == b'cuda': + assert b.gpudata == a_gpu.gpudata assert b.strides == a.strides assert b.shape == a.shape b_cpu = numpy.asarray(b) From 07cd4ad56054c279442ee28413b26939f4c03632 Mon Sep 17 00:00:00 2001 From: Wong Hang Date: Sat, 20 Oct 2018 22:02:26 +0800 Subject: [PATCH 583/597] add float64 support to tril triu --- pygpu/basic.py | 17 ++++++++------- pygpu/tests/test_basic.py | 46 ++++++++++++++++++++------------------- 2 files changed, 33 insertions(+), 30 deletions(-) diff --git a/pygpu/basic.py b/pygpu/basic.py index e054daade0..473e9dc85f 100644 --- a/pygpu/basic.py +++ b/pygpu/basic.py @@ -1,12 +1,11 @@ from string import Template -from .gpuarray import GpuArray, GpuKernel, SIZE +from .gpuarray import GpuArray, GpuKernel, SIZE, dtype_to_ctype - -def _generate_kernel(ctx, cols, upper=True): +def _generate_kernel(ctx, cols, ctype, upper=True): tmpl = Template(""" #include "cluda.h" - KERNEL void extract_tri(GLOBAL_MEM ga_float *a, ga_size a_off, ga_uint N) { - a = (GLOBAL_MEM ga_float *)(((GLOBAL_MEM char *)a) + a_off); + KERNEL void 
extract_tri(GLOBAL_MEM ${ctype} *a, ga_size a_off, ga_uint N) { + a = (GLOBAL_MEM ${ctype} *)(((GLOBAL_MEM char *)a) + a_off); unsigned int idx = GID_1 * LDIM_0 * GDIM_0 + GID_0 * LDIM_0 + LID_0; unsigned int ix = idx/${cols}; @@ -21,7 +20,7 @@ def _generate_kernel(ctx, cols, upper=True): le = '>' else: le = '<' - src = tmpl.substitute(cols=cols, le=le) + src = tmpl.substitute(cols=cols, ctype=ctype, le=le) spec = [GpuArray, SIZE, 'uint32'] k = GpuKernel(src, "extract_tri", spec, context=ctx) return k @@ -41,7 +40,8 @@ def triu(A, inplace=True): else: upper = True cols = A.shape[1] - k = _generate_kernel(A.context, cols, upper) + ctype = dtype_to_ctype(A.dtype) + k = _generate_kernel(A.context, cols, ctype, upper) k(A, A.offset, A.shape[0] * A.shape[1], n=A.shape[0] * A.shape[1]) return A @@ -60,6 +60,7 @@ def tril(A, inplace=True): else: upper = False cols = A.shape[1] - k = _generate_kernel(A.context, cols, upper) + ctype = dtype_to_ctype(A.dtype) + k = _generate_kernel(A.context, cols, ctype, upper) k(A, A.offset, A.shape[0] * A.shape[1], n=A.shape[0] * A.shape[1]) return A diff --git a/pygpu/tests/test_basic.py b/pygpu/tests/test_basic.py index 040ea228fa..39064b0963 100644 --- a/pygpu/tests/test_basic.py +++ b/pygpu/tests/test_basic.py @@ -7,31 +7,33 @@ def test_tril(): - for shape in [(10, 5), (5, 10), (10, 10)]: - for order in ['c', 'f']: - for inplace in [True, False]: - ac, ag = gen_gpuarray(shape, 'float32', - order=order, ctx=context) - result = tril(ag, inplace=inplace) - assert numpy.all(numpy.tril(ac) == result) - if inplace: - assert numpy.all(numpy.tril(ac) == ag) - else: - assert numpy.all(ac == ag) + for dtype in ['float32','float64']: + for shape in [(10, 5), (5, 10), (10, 10)]: + for order in ['c', 'f']: + for inplace in [True, False]: + ac, ag = gen_gpuarray(shape, dtype, + order=order, ctx=context) + result = tril(ag, inplace=inplace) + assert numpy.all(numpy.tril(ac) == result) + if inplace: + assert numpy.all(numpy.tril(ac) == ag) + else: + 
assert numpy.all(ac == ag) def test_triu(): - for shape in [(10, 5), (5, 10), (10, 10)]: - for order in ['c', 'f']: - for inplace in [True, False]: - ac, ag = gen_gpuarray(shape, 'float32', - order=order, ctx=context) - result = triu(ag, inplace=inplace) - assert numpy.all(numpy.triu(ac) == result) - if inplace: - assert numpy.all(numpy.triu(ac) == ag) - else: - assert numpy.all(ac == ag) + for dtype in ['float32','float64']: + for shape in [(10, 5), (5, 10), (10, 10)]: + for order in ['c', 'f']: + for inplace in [True, False]: + ac, ag = gen_gpuarray(shape, dtype, + order=order, ctx=context) + result = triu(ag, inplace=inplace) + assert numpy.all(numpy.triu(ac) == result) + if inplace: + assert numpy.all(numpy.triu(ac) == ag) + else: + assert numpy.all(ac == ag) class test_errors(TestCase): From 450e6c00580ec322d8408b081b08f082de73038f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 5 Nov 2018 13:52:08 -0500 Subject: [PATCH 584/597] Support more recent versions and make sure to not always report errors for nvrtc64_70 --- src/gpuarray_buffer_cuda.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index f518ce901b..bed012d6d5 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -176,7 +176,7 @@ static int setup_lib(void) { res = load_libnvrtc(major, minor, global_err); if (res != GA_NO_ERROR) { /* Else, let's try to find a nvrtc corresponding to supported CUDA versions. 
*/ - int versions[][2] = {{9, 1}, {9, 0}, {8, 0}, {7, 5}, {7, 0}}; + int versions[][2] = {{10, 0}, {9, 2}, {9, 1}, {9, 0}, {8, 0}, {7, 5}, {7, 0}}; int versions_length = sizeof(versions) / sizeof(versions[0]); int i = 0; /* Skip versions that are higher or equal to the driver version */ @@ -190,7 +190,8 @@ static int setup_lib(void) { } while (res != GA_NO_ERROR && i < versions_length); } if (res != GA_NO_ERROR) - return res; + // Return the error from the original attempt + return load_libnvrtc(major, minor, global_err); setup_done = 1; } return GA_NO_ERROR; From c3dae992336deb9ab306fdd35016e145b0945434 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 14 Nov 2018 14:22:22 -0500 Subject: [PATCH 585/597] Actually use the origin versions for the reported error --- src/gpuarray_buffer_cuda.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index bed012d6d5..772e9757f0 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -159,6 +159,7 @@ static int minor = -1; static int setup_lib(void) { CUresult err; int res, tmp; + int orig_major, orig_minor; if (!setup_done) { res = load_libcuda(global_err); @@ -174,6 +175,8 @@ static int setup_lib(void) { minor = (tmp / 10) % 10; /* Let's try to load a nvrtc corresponding to detected CUDA version. */ res = load_libnvrtc(major, minor, global_err); + orig_major = major; + orig_minor = minor; if (res != GA_NO_ERROR) { /* Else, let's try to find a nvrtc corresponding to supported CUDA versions. */ int versions[][2] = {{10, 0}, {9, 2}, {9, 1}, {9, 0}, {8, 0}, {7, 5}, {7, 0}}; @@ -191,7 +194,7 @@ static int setup_lib(void) { } if (res != GA_NO_ERROR) // Return the error from the original attempt - return load_libnvrtc(major, minor, global_err); + return load_libnvrtc(orig_major, orig_minor, global_err); setup_done = 1; } return GA_NO_ERROR; From bf1e694b1e4b6825216cc75c061551eb939c4162 Mon Sep 17 00:00:00 2001 From: "Rebecca N. 
Palmer" Date: Sun, 13 Jan 2019 19:46:23 +0000 Subject: [PATCH 586/597] Don't assume size_t == cl_ulong Fixes 'Invalid value' error on 32 bit systems (issue 581) --- src/gpuarray_buffer_opencl.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 8f128115ee..e6701242fd 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -1104,6 +1104,8 @@ static int cl_transfer(gpudata *dst, size_t dstoff, return error_set(dst->ctx->err, GA_UNSUPPORTED_ERROR, "Operation not supported"); } +#define clipto_sizet(x) (((x) < SIZE_MAX) ? (x) : SIZE_MAX) + static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, void *res) { cl_ctx *ctx = NULL; @@ -1132,6 +1134,7 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, switch (prop_id) { size_t sz; size_t *psz; + cl_ulong ul; cl_device_id id; cl_uint ui; @@ -1149,8 +1152,8 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id, NULL)); CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_LOCAL_MEM_SIZE, - sizeof(sz), &sz, NULL)); - *((size_t *)res) = sz; + sizeof(ul), &ul, NULL)); + *((size_t *)res) = clipto_sizet(ul); return GA_NO_ERROR; case GA_CTX_PROP_NUMPROCS: @@ -1173,8 +1176,8 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id, NULL)); CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_GLOBAL_MEM_SIZE, - sizeof(sz), &sz, NULL)); - *((size_t *)res) = sz; + sizeof(ul), &ul, NULL)); + *((size_t *)res) = clipto_sizet(ul); return GA_NO_ERROR; case GA_CTX_PROP_FREE_GMEM: @@ -1184,8 +1187,8 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id, CL_CHECK(ctx->err, clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id, 
NULL)); CL_CHECK(ctx->err, clGetDeviceInfo(id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, - sizeof(sz), &sz, NULL)); - *((size_t *)res) = sz; + sizeof(ul), &ul, NULL)); + *((size_t *)res) = clipto_sizet(ul); return GA_NO_ERROR; case GA_CTX_PROP_NATIVE_FLOAT16: From 1b3ce42804da1750dbfd660bfcee5a28a51fe400 Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Sun, 13 Jan 2019 19:51:08 +0000 Subject: [PATCH 587/597] Fix spelling --- pygpu/gpuarray.pyx | 14 +++++++------- src/gpuarray_blas_cuda_cublas.c | 2 +- src/gpuarray_blas_opencl_clblas.c | 2 +- src/gpuarray_blas_opencl_clblast.c | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pygpu/gpuarray.pyx b/pygpu/gpuarray.pyx index 5b9dd89fd4..46c5d3ea01 100644 --- a/pygpu/gpuarray.pyx +++ b/pygpu/gpuarray.pyx @@ -76,10 +76,10 @@ def cuda_wrap_ctx(size_t ptr, bint own): Wrap an existing CUDA driver context (CUcontext) into a GpuContext class. - If `own` is true, libgpuarray is now reponsible for the context and + If `own` is true, libgpuarray is now responsible for the context and it will be destroyed once there are no references to it. Otherwise, the context will not be destroyed and it is the calling - code's reponsability. + code's responsibility. """ cdef gpucontext *(*cuda_make_ctx)(void *, int) cdef int flags @@ -876,7 +876,7 @@ def from_gpudata(size_t data, offset, dtype, shape, GpuContext context=None, Notes ----- - This function might be deprecated in a later relase since the only + This function might be deprecated in a later release since the only way to create gpudata pointers is through libgpuarray functions that aren't exposed at the python level. It can be used with the value of the `gpudata` attribute of an existing GpuArray. @@ -940,7 +940,7 @@ def array(proto, dtype=None, copy=True, order=None, unsigned int ndmin=0, match its properties and `copy` is False. The properties of the resulting array depend on the input data - except if overriden by other parameters. 
+ except if overridden by other parameters. This function is similar to :meth:`numpy.array` except that it returns GpuArrays. @@ -1561,7 +1561,7 @@ cdef class GpuArray: To create instances of this class use :meth:`~pygpu.gpuarray.zeros`, :meth:`~pygpu.gpuarray.empty` or - :meth:`~pygpu.gpuarray.array`. It cannot be instanciated + :meth:`~pygpu.gpuarray.array`. It cannot be instantiated directly. You can also subclass this class and make the module create your @@ -2278,7 +2278,7 @@ cdef class GpuKernel: The `have_*` parameter are there to tell libgpuarray that we need the particular type or feature to work for this kernel. If the - request can't be satified a :class:`.UnsupportedException` will be + request can't be satisfied a :class:`.UnsupportedException` will be raised in the constructor. Once you have the kernel object you can simply call it like so:: @@ -2335,7 +2335,7 @@ cdef class GpuKernel: If you do not set the `have_` flags properly, you will either get a device-specific error (the good case) or silent - completly bogus data (the bad case). + completely bogus data (the bad case). 
""" diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 33abb44ea2..935f106d2d 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -31,7 +31,7 @@ static const char *estr(cublasStatus_t err) { case CUBLAS_STATUS_NOT_INITIALIZED: return "(cublas) Library not initialized."; case CUBLAS_STATUS_ALLOC_FAILED: - return "(cublas) GPU ressource allocation failed."; + return "(cublas) GPU resource allocation failed."; case CUBLAS_STATUS_INVALID_VALUE: return "(cublas) Invalid value."; case CUBLAS_STATUS_ARCH_MISMATCH: diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index e1a7128b13..33e5291a2c 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -75,7 +75,7 @@ static const char *estr(clblasStatus err) { case clblasInsufficientMemVecY: return "memory object for vector Y is too small"; default: - return "Unknow error"; + return "Unknown error"; } } diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c index 8bd056e61e..a320edb4c2 100644 --- a/src/gpuarray_blas_opencl_clblast.c +++ b/src/gpuarray_blas_opencl_clblast.c @@ -85,7 +85,7 @@ static const char *estr(CLBlastStatusCode err) { case CLBlastUnexpectedError: return "Unexpected error"; default: - return "Unknow error"; + return "Unknown error"; } } From 2629992d91e62761549e1f7971205daf1af2016a Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Sat, 26 Jan 2019 18:35:07 +0000 Subject: [PATCH 588/597] Fix stack smashing crash in *gemmBatch Calling ARRAY_INIT more than 3 times without resetting num_ev runs off the end of evl. 
--- src/gpuarray_blas_opencl_clblas.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 33e5291a2c..004e7adb86 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -132,6 +132,7 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, cl_uint num_ev = 0; for (i = 0; i < batchCount; i++) { + num_ev = 0; ARRAY_INIT(A[i]); ARRAY_INIT(B[i]); ARRAY_INIT(C[i]); @@ -163,6 +164,7 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, cl_uint num_ev = 0; for (i = 0; i < batchCount; i++) { + num_ev = 0; ARRAY_INIT(A[i]); ARRAY_INIT(B[i]); ARRAY_INIT(C[i]); From b55ba6ad179823b2463d236ed9c483f4e18305e3 Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Sat, 26 Jan 2019 18:35:52 +0000 Subject: [PATCH 589/597] Don't corrupt lru_cache on exceptions If user_function raises an exception, avoid adding a key to last_use and not cache, as such keys trigger KeyError when purged. --- pygpu/tools.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pygpu/tools.py b/pygpu/tools.py index b0c43cb028..5e877e7893 100644 --- a/pygpu/tools.py +++ b/pygpu/tools.py @@ -177,7 +177,6 @@ def decorating_function(user_function): @functools.wraps(user_function) def wrapper(*key): time[0] += 1 - last_use[key] = time[0] try: result = cache[key] @@ -189,11 +188,12 @@ def wrapper(*key): # purge least recently used cache entries if len(cache) > wrapper.maxsize: - for key, _ in nsmallest(wrapper.maxsize // 10, + for key0, _ in nsmallest(wrapper.maxsize // 10, six.iteritems(last_use), key=itemgetter(1)): - del cache[key], last_use[key] + del cache[key0], last_use[key0] + last_use[key] = time[0] return result def clear(): From 4c4ff973c861e261d97488149fe1df97c259dd36 Mon Sep 17 00:00:00 2001 From: "Rebecca N. 
Palmer" Date: Sat, 26 Jan 2019 18:39:09 +0000 Subject: [PATCH 590/597] Fix typo --- src/gpuarray_buffer_opencl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index e6701242fd..b1367cd6df 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -275,7 +275,7 @@ gpudata *cl_make_buf(gpucontext *c, cl_mem buf) { CL_CHECKN(ctx->err, clGetMemObjectInfo(buf, CL_MEM_CONTEXT, sizeof(buf_ctx), &buf_ctx, NULL)); if (buf_ctx != ctx->ctx) { - error_set(ctx->err, GA_VALUE_ERROR, "Requested context doesn't macth object context"); + error_set(ctx->err, GA_VALUE_ERROR, "Requested context doesn't match object context"); return NULL; } From bb9509cc57e1d48922abe473d33fd4e67e2f339f Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Sat, 26 Jan 2019 18:42:17 +0000 Subject: [PATCH 591/597] Actually use all the clblasStatus-to-error-string table Error code -1024 is the lowest (most negative), not the highest --- src/gpuarray_blas_opencl_clblas.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 004e7adb86..7fd0c74145 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -35,7 +35,7 @@ static inline clblasTranspose convT(cb_transpose trans) { static unsigned int refcnt = 0; static const char *estr(clblasStatus err) { - if (err > -1024) + if (err > -900) return cl_error_string((cl_int)err); switch (err) { case clblasNotImplemented: From 79c808275d8ab958aeaa8ac0ff0f1cfb09c0bfb0 Mon Sep 17 00:00:00 2001 From: "Rebecca N. 
Palmer" Date: Sat, 26 Jan 2019 21:23:02 +0000 Subject: [PATCH 592/597] Give a non-cryptic error when double precision is not available --- src/gpuarray_blas_opencl_clblas.c | 6 +++++- src/gpuarray_buffer_opencl.c | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 7fd0c74145..d85526df1a 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -35,8 +35,12 @@ static inline clblasTranspose convT(cb_transpose trans) { static unsigned int refcnt = 0; static const char *estr(clblasStatus err) { - if (err > -900) + if (err > -900) { + if (err == CL_INVALID_DEVICE) { + return "Invalid device, or double precision requested on a device that does not support double precision"; + } return cl_error_string((cl_int)err); + } switch (err) { case clblasNotImplemented: return "Unimplemented feature"; diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index b1367cd6df..75da423c3e 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -781,7 +781,9 @@ static int cl_check_extensions(const char **preamble, unsigned int *count, (*count)++; } if (flags & GA_USE_DOUBLE) { - GA_CHECK(check_ext(ctx, CL_DOUBLE)); + if (check_ext(ctx, CL_DOUBLE) != GA_NO_ERROR) { + return error_set(ctx->err, GA_DEVSUP_ERROR, "This device does not support double precision (pygpu int/int, int32+float32, and floating point literals default to double precision)"); + } preamble[*count] = PRAGMA CL_DOUBLE ENABLE; (*count)++; } From fed0af34382bb789fb2aa5e8ae83e9fa26c3cdac Mon Sep 17 00:00:00 2001 From: "Rebecca N. 
Palmer" Date: Sat, 26 Jan 2019 21:31:33 +0000 Subject: [PATCH 593/597] elemwise: pass through error messages --- pygpu/_elemwise.pyx | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pygpu/_elemwise.pyx b/pygpu/_elemwise.pyx index 713e241e8a..d22a489f5d 100644 --- a/pygpu/_elemwise.pyx +++ b/pygpu/_elemwise.pyx @@ -15,6 +15,11 @@ cdef bytes to_bytes(s): return (s).encode('ascii') raise TypeError("Can't convert to bytes") +cdef extern from "gpuarray/buffer.h": + ctypedef struct gpucontext: + pass + char *gpucontext_error(gpucontext *ctx, int err) + cdef extern from "gpuarray/elemwise.h": ctypedef struct _GpuElemwise "GpuElemwise": pass @@ -141,7 +146,8 @@ cdef class GpuElemwise: finally: free(_args) if self.ge is NULL: - raise GpuArrayException("Could not initialize C GpuElemwise instance") + error_message = gpucontext_error(ctx.ctx, 0).decode(encoding='latin-1') + raise GpuArrayException("Could not initialize C GpuElemwise instance: " + error_message) def __dealloc__(self): cdef unsigned int i From fd0a210a95a3c929ca1c3667cb796ae4c7d6d923 Mon Sep 17 00:00:00 2001 From: "Rebecca N. 
Palmer" Date: Sat, 26 Jan 2019 21:31:51 +0000 Subject: [PATCH 594/597] elemwise: use UnsupportedException for no double support (matches elsewhere, and makes tests skip rather than fail for this) --- pygpu/_elemwise.pyx | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pygpu/_elemwise.pyx b/pygpu/_elemwise.pyx index d22a489f5d..3e71da7e1f 100644 --- a/pygpu/_elemwise.pyx +++ b/pygpu/_elemwise.pyx @@ -1,4 +1,4 @@ -from pygpu.gpuarray import GpuArrayException +from pygpu.gpuarray import GpuArrayException, UnsupportedException from pygpu.gpuarray cimport (gpucontext, GA_NO_ERROR, get_typecode, typecode_to_dtype, GpuContext, GpuArray, get_exc, gpuarray_get_elsize) @@ -147,7 +147,10 @@ cdef class GpuElemwise: free(_args) if self.ge is NULL: error_message = gpucontext_error(ctx.ctx, 0).decode(encoding='latin-1') - raise GpuArrayException("Could not initialize C GpuElemwise instance: " + error_message) + # getting the error type this way is fragile, but the alternative is breaking ABI + raise (UnsupportedException if + "This device does not support double precision" in error_message else + GpuArrayException)("Could not initialize C GpuElemwise instance: " + error_message) def __dealloc__(self): cdef unsigned int i From 818a588dcc51e483e98b95b62474a2c0aa07fa54 Mon Sep 17 00:00:00 2001 From: "Rebecca N. 
Palmer" Date: Sun, 27 Jan 2019 13:24:47 +0000 Subject: [PATCH 595/597] add "no double support = skip, not fail" to more tests --- pygpu/tests/test_basic.py | 40 ++++++++++++++++------------- pygpu/tests/test_blas.py | 23 +++++++++++++---- pygpu/tests/test_elemwise.py | 47 +++++++++++++++++++---------------- pygpu/tests/test_reduction.py | 9 +++++-- 4 files changed, 74 insertions(+), 45 deletions(-) diff --git a/pygpu/tests/test_basic.py b/pygpu/tests/test_basic.py index 39064b0963..95869a8a50 100644 --- a/pygpu/tests/test_basic.py +++ b/pygpu/tests/test_basic.py @@ -2,7 +2,7 @@ from pygpu.basic import (tril, triu) from unittest import TestCase -from .support import (gen_gpuarray, context) +from .support import (guard_devsup, gen_gpuarray, context) import numpy @@ -11,14 +11,17 @@ def test_tril(): for shape in [(10, 5), (5, 10), (10, 10)]: for order in ['c', 'f']: for inplace in [True, False]: - ac, ag = gen_gpuarray(shape, dtype, - order=order, ctx=context) - result = tril(ag, inplace=inplace) - assert numpy.all(numpy.tril(ac) == result) - if inplace: - assert numpy.all(numpy.tril(ac) == ag) - else: - assert numpy.all(ac == ag) + yield run_tril, dtype, shape, order, inplace + +@guard_devsup +def run_tril(dtype, shape, order, inplace): + ac, ag = gen_gpuarray(shape, dtype, order=order, ctx=context) + result = tril(ag, inplace=inplace) + assert numpy.all(numpy.tril(ac) == result) + if inplace: + assert numpy.all(numpy.tril(ac) == ag) + else: + assert numpy.all(ac == ag) def test_triu(): @@ -26,14 +29,17 @@ def test_triu(): for shape in [(10, 5), (5, 10), (10, 10)]: for order in ['c', 'f']: for inplace in [True, False]: - ac, ag = gen_gpuarray(shape, dtype, - order=order, ctx=context) - result = triu(ag, inplace=inplace) - assert numpy.all(numpy.triu(ac) == result) - if inplace: - assert numpy.all(numpy.triu(ac) == ag) - else: - assert numpy.all(ac == ag) + yield run_triu, dtype, shape, order, inplace + +@guard_devsup +def run_triu(dtype, shape, order, inplace): + ac, 
ag = gen_gpuarray(shape, dtype, order=order, ctx=context) + result = triu(ag, inplace=inplace) + assert numpy.all(numpy.triu(ac) == result) + if inplace: + assert numpy.all(numpy.triu(ac) == ag) + else: + assert numpy.all(ac == ag) class test_errors(TestCase): diff --git a/pygpu/tests/test_blas.py b/pygpu/tests/test_blas.py index c945fdf2de..ac34efcc8b 100644 --- a/pygpu/tests/test_blas.py +++ b/pygpu/tests/test_blas.py @@ -14,6 +14,19 @@ raise SkipTest("no scipy blas to compare against") import pygpu.blas as gblas +from pygpu.gpuarray import (GpuArrayException, UnsupportedException) + +def guard_devsup_blasdouble(func): + def f(*args, **kwargs): + try: + func(*args, **kwargs) + except UnsupportedException as e: + raise SkipTest("operation not supported") + except GpuArrayException as e: + if 'float64' in args and "does not support double precision" in str(e): + raise SkipTest("double precision not supported") + raise + return f def test_dot(): @@ -25,7 +38,7 @@ def test_dot(): yield dot, 666, 'float32', False, False, overwrite, init_z -@guard_devsup +@guard_devsup_blasdouble def dot(N, dtype, offseted_i, sliced, overwrite, init_z): cX, gX = gen_gpuarray((N,), dtype, offseted_inner=offseted_i, sliced=sliced, ctx=context) @@ -61,7 +74,7 @@ def test_gemv(): overwrite, True, alpha, beta) -@guard_devsup +@guard_devsup_blasdouble def gemv(shp, dtype, order, trans, offseted_i, sliced, overwrite, init_y, alpha=1.0, beta=0.0): cA, gA = gen_gpuarray(shp, dtype, order=order, offseted_inner=offseted_i, @@ -109,7 +122,7 @@ def test_gemm(): (False, False), False, 1, overwrite, True, alpha, beta) -@guard_devsup +@guard_devsup_blasdouble def gemm(m, n, k, dtype, order, trans, offseted_o, sliced, overwrite, init_res, alpha=1.0, beta=0.0): if trans[0]: @@ -153,7 +166,7 @@ def test_ger(): for init_res, overwrite in product(bools, bools): yield ger, 4, 5, 'float32', 'f', 1, 1, init_res, overwrite - +@guard_devsup_blasdouble def ger(m, n, dtype, order, sliced_x, sliced_y, init_res, 
overwrite=False): cX, gX = gen_gpuarray((m,), dtype, order, sliced=sliced_x, ctx=context) cY, gY = gen_gpuarray((n,), dtype, order, sliced=sliced_y, ctx=context) @@ -192,7 +205,7 @@ def test_rgemmBatch_3d(): (False, False), False, 1, overwrite, True, alpha, beta) -@guard_devsup +@guard_devsup_blasdouble def rgemmBatch_3d(b, m, n, k, dtype, order, trans, offseted_o, sliced, overwrite, init_res, alpha=1.0, beta=0.0): if trans[0]: diff --git a/pygpu/tests/test_elemwise.py b/pygpu/tests/test_elemwise.py index 6bc6f9995b..cd0d8006be 100644 --- a/pygpu/tests/test_elemwise.py +++ b/pygpu/tests/test_elemwise.py @@ -318,24 +318,29 @@ def broadcast(shapea, shapeb): def test_infinity(): for dtype in ['float32', 'float64']: - ac, ag = gen_gpuarray((2,), dtype, ctx=context, cls=elemary) - out_g = ag._empty_like_me() - flt = 'ga_float' if dtype == 'float32' else 'ga_double' - out_arg = arg('out', out_g.dtype, scalar=False, read=False, write=True) - preamble = _inf_preamb_tpl.render(flt=flt) - - # +infinity - ac[:] = numpy.inf - expr_inf = 'out = infinity()' - kernel = GpuElemwise(context, expr_inf, [out_arg], - preamble=preamble) - kernel(out_g) - assert numpy.array_equal(ac, numpy.asarray(out_g)) - - # -infinity - ac[:] = -numpy.inf - expr_neginf = 'out = neg_infinity()' - kernel = GpuElemwise(context, expr_neginf, [out_arg], - preamble=preamble) - kernel(out_g) - assert numpy.array_equal(ac, numpy.asarray(out_g)) + yield infinity, dtype + + +@guard_devsup +def infinity(dtype): + ac, ag = gen_gpuarray((2,), dtype, ctx=context, cls=elemary) + out_g = ag._empty_like_me() + flt = 'ga_float' if dtype == 'float32' else 'ga_double' + out_arg = arg('out', out_g.dtype, scalar=False, read=False, write=True) + preamble = _inf_preamb_tpl.render(flt=flt) + + # +infinity + ac[:] = numpy.inf + expr_inf = 'out = infinity()' + kernel = GpuElemwise(context, expr_inf, [out_arg], + preamble=preamble) + kernel(out_g) + assert numpy.array_equal(ac, numpy.asarray(out_g)) + + # -infinity + ac[:] = 
-numpy.inf + expr_neginf = 'out = neg_infinity()' + kernel = GpuElemwise(context, expr_neginf, [out_arg], + preamble=preamble) + kernel(out_g) + assert numpy.array_equal(ac, numpy.asarray(out_g)) diff --git a/pygpu/tests/test_reduction.py b/pygpu/tests/test_reduction.py index 4a4b5f6c2a..fbb6882d94 100644 --- a/pygpu/tests/test_reduction.py +++ b/pygpu/tests/test_reduction.py @@ -52,8 +52,13 @@ def test_red_big_array(): [False, True, False]]: yield red_array_sum, 'float32', (2000, 30, 100), redux - +# this test needs a guard_devsup because Python 'float' is double, +# and placing one directly on a test_* makes nose not know that it's a test def test_red_broadcast(): + red_broadcast() + +@guard_devsup +def red_broadcast(): from pygpu.tools import as_argument dtype = float @@ -78,7 +83,6 @@ def test_red_broadcast(): assert numpy.allclose(nz, numpy.asarray(gz)) - def test_reduction_ops(): for axis in [None, 0, 1]: for op in ['all', 'any']: @@ -88,6 +92,7 @@ def test_reduction_ops(): yield reduction_op, op, dtype, axis +@guard_devsup def reduction_op(op, dtype, axis): c, g = gen_gpuarray((2, 3), dtype=dtype, ctx=context, cls=elemary) From 89e1cad4fdcf3f951213fb81b66657fd660364df Mon Sep 17 00:00:00 2001 From: "Rebecca N. 
Palmer" Date: Sun, 27 Jan 2019 13:57:25 +0000 Subject: [PATCH 596/597] tril/u: pass have_double/etc flags, to get a user-friendly error if double precision isn't supported --- pygpu/basic.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/pygpu/basic.py b/pygpu/basic.py index 473e9dc85f..90ffd93b49 100644 --- a/pygpu/basic.py +++ b/pygpu/basic.py @@ -1,7 +1,8 @@ from string import Template from .gpuarray import GpuArray, GpuKernel, SIZE, dtype_to_ctype +import numpy -def _generate_kernel(ctx, cols, ctype, upper=True): +def _generate_kernel(ctx, cols, dtype, upper=True): tmpl = Template(""" #include "cluda.h" KERNEL void extract_tri(GLOBAL_MEM ${ctype} *a, ga_size a_off, ga_uint N) { @@ -20,9 +21,21 @@ def _generate_kernel(ctx, cols, ctype, upper=True): le = '>' else: le = '<' + ctype = dtype_to_ctype(dtype) src = tmpl.substitute(cols=cols, ctype=ctype, le=le) spec = [GpuArray, SIZE, 'uint32'] - k = GpuKernel(src, "extract_tri", spec, context=ctx) + have_small = False + have_double = False + have_complex = False + if dtype.itemsize < 4: + have_small = True + if dtype in [numpy.float64, numpy.complex128]: + have_double = True + if dtype in [numpy.complex64, numpy.complex128]: + have_complex = True + k = GpuKernel(src, "extract_tri", spec, context=ctx, + have_double=have_double, have_small=have_small, + have_complex=have_complex) return k @@ -40,8 +53,7 @@ def triu(A, inplace=True): else: upper = True cols = A.shape[1] - ctype = dtype_to_ctype(A.dtype) - k = _generate_kernel(A.context, cols, ctype, upper) + k = _generate_kernel(A.context, cols, A.dtype, upper) k(A, A.offset, A.shape[0] * A.shape[1], n=A.shape[0] * A.shape[1]) return A @@ -60,7 +72,6 @@ def tril(A, inplace=True): else: upper = False cols = A.shape[1] - ctype = dtype_to_ctype(A.dtype) - k = _generate_kernel(A.context, cols, ctype, upper) + k = _generate_kernel(A.context, cols, A.dtype, upper) k(A, A.offset, A.shape[0] * A.shape[1], n=A.shape[0] * A.shape[1]) 
return A From 0a3354998b7899caea16e5f16dde9205004dc345 Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Sun, 27 Jan 2019 14:11:27 +0000 Subject: [PATCH 597/597] test_dot: always check against CPU double precision Fixes intermittent failure due to rounding error in the reference --- pygpu/tests/test_blas.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pygpu/tests/test_blas.py b/pygpu/tests/test_blas.py index c945fdf2de..42630023d2 100644 --- a/pygpu/tests/test_blas.py +++ b/pygpu/tests/test_blas.py @@ -37,10 +37,9 @@ def dot(N, dtype, offseted_i, sliced, overwrite, init_z): else: gZ = None - if dtype == 'float32': - cr = fblas.sdot(cX, cY) - else: - cr = fblas.ddot(cX, cY) + # Always check against double precision: scipy's single precision + # has enough error that this sometimes fails when we're closer + cr = fblas.ddot(cX, cY) gr = gblas.dot(gX, gY, gZ, overwrite_z=overwrite) numpy.testing.assert_allclose(cr, numpy.asarray(gr), rtol=1e-6)