// This file is compiled by nvcc and provides the GPU specializations of
// ndarray::Copy.
//
// NOTE(review): the reviewed text had every angle-bracketed token stripped
// (the first header name and all template argument lists). They are
// reconstructed below from the surrounding code -- confirm against the
// declarations in ndarray_function.h before merging.
#include <dmlc/logging.h>
#include "./ndarray_function.h"
#include "./ndarray_function-inl.h"

namespace mxnet {
namespace ndarray {

/*!
 * \brief Copy a host blob into a device blob.
 *
 * Both blobs are viewed as 2-D real_t tensors and handed to mshadow::Copy
 * together with the stream carried by the run context.
 *
 * NOTE(review): the <cpu, gpu> direction of this specialization is
 * reconstructed; the stripped text does not distinguish it from the next one.
 *
 * \param from     source blob
 * \param to       destination blob
 * \param from_ctx context of the source (not used by this specialization)
 * \param to_ctx   context of the destination (not used by this specialization)
 * \param ctx      run context; ctx.stream is cast to mshadow::Stream<gpu>*
 */
template<>
void Copy<cpu, gpu>(const TBlob &from, TBlob *to,
                    Context from_ctx, Context to_ctx,
                    RunContext ctx) {
  mshadow::Stream<gpu> *stream =
      static_cast<mshadow::Stream<gpu>*>(ctx.stream);
  mshadow::Copy(to->FlatTo2D<gpu, real_t>(),
                from.FlatTo2D<cpu, real_t>(),
                stream);
}

/*!
 * \brief Copy a device blob into a host blob.
 *
 * Both blobs are viewed as 2-D real_t tensors and handed to mshadow::Copy
 * together with the stream carried by the run context.
 *
 * NOTE(review): the <gpu, cpu> direction of this specialization is
 * reconstructed; the stripped text does not distinguish it from the previous
 * one.
 *
 * \param from     source blob
 * \param to       destination blob
 * \param from_ctx context of the source (not used by this specialization)
 * \param to_ctx   context of the destination (not used by this specialization)
 * \param ctx      run context; ctx.stream is cast to mshadow::Stream<gpu>*
 */
template<>
void Copy<gpu, cpu>(const TBlob &from, TBlob *to,
                    Context from_ctx, Context to_ctx,
                    RunContext ctx) {
  mshadow::Stream<gpu> *stream =
      static_cast<mshadow::Stream<gpu>*>(ctx.stream);
  mshadow::Copy(to->FlatTo2D<cpu, real_t>(),
                from.FlatTo2D<gpu, real_t>(),
                stream);
}

/*!
 * \brief Copy between two device blobs, on the same or on different devices.
 *
 * When both contexts report the same dev_id the copy goes through
 * mshadow::Copy. Otherwise both blobs must be contiguous and a non-null
 * stream must be supplied; the data is then moved with cudaMemcpyPeerAsync
 * issued on that stream. This function does not synchronize the stream.
 *
 * Any error returned by cudaMemcpyPeerAsync is reported through CHECK
 * instead of being silently dropped.
 *
 * \param from     source blob
 * \param to       destination blob
 * \param from_ctx context of the source; dev_id selects the source device
 * \param to_ctx   context of the destination; dev_id selects the target device
 * \param ctx      run context; ctx.stream is cast to mshadow::Stream<gpu>*
 */
template<>
void Copy<gpu, gpu>(const TBlob &from, TBlob *to,
                    Context from_ctx, Context to_ctx,
                    RunContext ctx) {
  mshadow::Stream<gpu> *s = static_cast<mshadow::Stream<gpu>*>(ctx.stream);
  if (from_ctx.dev_id == to_ctx.dev_id) {
    mshadow::Copy(to->FlatTo2D<gpu, real_t>(),
                  from.FlatTo2D<gpu, real_t>(),
                  s);
    return;
  }

  // Peer copy works on raw pointers, so neither blob may be strided.
  CHECK(from.CheckContiguous() && to->CheckContiguous())
      << "copy across only support continugous memory";
  // s->stream_ is dereferenced below; a null stream cannot be used here.
  CHECK(s != NULL) << "need stream in GPU context";

  const size_t nbytes = from.shape_.Size() * sizeof(real_t);
  const cudaError_t err = cudaMemcpyPeerAsync(to->dptr_,
                                              to_ctx.dev_id,
                                              from.dptr_,
                                              from_ctx.dev_id,
                                              nbytes,
                                              s->stream_);
  // Surface launch-time failures (e.g. invalid device or pointer) right away.
  CHECK(err == cudaSuccess)
      << "cudaMemcpyPeerAsync failed: " << cudaGetErrorString(err);
}

}  // namespace ndarray
}  // namespace mxnet