Skip to content

Commit dcb0fa5

Browse files
committed
Add comm_error impl and remove gpucomm_get_device
1 parent 9838eca commit dcb0fa5

4 files changed

Lines changed: 5 additions & 21 deletions

File tree

src/gpuarray/buffer_collectives.h

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -93,14 +93,6 @@ GPUARRAY_PUBLIC int gpucomm_gen_clique_id(gpucontext* ctx,
9393
*/
9494
GPUARRAY_PUBLIC int gpucomm_get_count(gpucomm* comm, int* count);
9595

96-
/**
97-
* \brief TODO
98-
* \param comm [gpucomm*] TODO
99-
* \param device [int*] TODO
100-
* \return int TODO
101-
*/
102-
GPUARRAY_PUBLIC int gpucomm_get_device(gpucomm* comm, int* device);
103-
10496
/**
10597
* \brief TODO
10698
* \param comm [gpucomm*] TODO

src/gpuarray_buffer_collectives.c

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,6 @@ int gpucomm_get_count(gpucomm* comm, int* count) {
4242
return ctx->comm_ops->get_count(comm, count);
4343
}
4444

45-
int gpucomm_get_device(gpucomm* comm, int* device) {
46-
gpucontext* ctx = gpucomm_context(comm);
47-
if (ctx->comm_ops == NULL)
48-
return GA_COMM_ERROR;
49-
return ctx->comm_ops->get_device(comm, device);
50-
}
51-
5245
int gpucomm_get_rank(gpucomm* comm, int* rank) {
5346
gpucontext* ctx = gpucomm_context(comm);
5447
if (ctx->comm_ops == NULL)

src/gpuarray_collectives_cuda_nccl.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,17 @@ static int comm_new(gpucomm** comm_ptr, gpucontext* ctx, gpucommCliqueId comm_id
4444
}
4545

4646
static void comm_free(gpucomm* comm) {
47+
ASSERT_COMM(comm);
4748
cuda_enter(comm->ctx);
4849
ncclCommDestroy(comm->c);
4950
cuda_exit(comm->ctx);
5051
comm_clear(comm);
5152
}
5253

53-
static const char* comm_error(gpucontext* ctx) {
54+
static const char* comm_error(gpucontext* c) {
55+
cuda_context* ctx = (cuda_context*) c;
56+
// find a way to concatenate a constant "(nccl) " in front
57+
return ncclGetErrorString(ctx->nccl_err);
5458
}
5559

5660
static int generate_clique_id(gpucontext* ctx, gpucommCliqueId* cliqueId) {
@@ -59,9 +63,6 @@ static int generate_clique_id(gpucontext* ctx, gpucommCliqueId* cliqueId) {
5963
static int get_count(const gpucomm* comm, int* count) {
6064
}
6165

62-
static int get_device(const gpucomm* comm, int* device) {
63-
}
64-
6566
static int get_rank(const gpucomm* comm, int* rank) {
6667
}
6768

@@ -99,7 +100,6 @@ GPUARRAY_LOCAL gpuarray_comm_ops nccl_ops = {
99100
comm_error,
100101
generate_clique_id,
101102
get_count,
102-
get_device,
103103
get_rank,
104104
reduce,
105105
all_reduce,

src/private.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,6 @@ struct _gpuarray_comm_ops {
209209
const char* (*comm_error)(gpucontext* ctx);
210210
int (*generate_clique_id)(gpucontext* ctx, gpucommCliqueId* cliqueId);
211211
int (*get_count)(const gpucomm* comm, int* count);
212-
int (*get_device)(const gpucomm* comm, int* device);
213212
int (*get_rank)(const gpucomm* comm, int* rank);
214213
// collective ops
215214
int (*reduce)(const gpudata* src, size_t offsrc,

0 commit comments

Comments
 (0)