Skip to content

Commit 51b90c7

Browse files
committed
PERF: Do not make copies if the number of references is only 1
1 parent 0672f56 commit 51b90c7

2 files changed

Lines changed: 155 additions & 12 deletions

File tree

src/api/c/assign.cpp

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -116,9 +116,19 @@ af_err af_assign_seq(af_array *out,
116116
ARG_ASSERT(2, (index[i].step>=0));
117117
}
118118

119-
af_array res;
120-
if (*out != lhs) AF_CHECK(af_copy_array(&res, lhs));
121-
else res = lhs;
119+
af_array res = 0;
120+
121+
if (*out != lhs) {
122+
int count = 0;
123+
AF_CHECK(af_get_data_ref_count(&count, lhs));
124+
if (count > 1) {
125+
AF_CHECK(af_copy_array(&res, lhs));
126+
} else {
127+
AF_CHECK(af_retain_array(&res, lhs));
128+
}
129+
} else {
130+
res = lhs;
131+
}
122132

123133
try {
124134

@@ -190,8 +200,17 @@ af_err af_assign_gen(af_array *out,
190200
ARG_ASSERT(1, (lhs!=0));
191201
ARG_ASSERT(4, (rhs!=0));
192202

193-
if (*out != lhs) AF_CHECK(af_copy_array(&output, lhs));
194-
else output = lhs;
203+
if (*out != lhs) {
204+
int count = 0;
205+
AF_CHECK(af_get_data_ref_count(&count, lhs));
206+
if (count > 1) {
207+
AF_CHECK(af_copy_array(&output, lhs));
208+
} else {
209+
AF_CHECK(af_retain_array(&output, lhs));
210+
}
211+
} else {
212+
output = lhs;
213+
}
195214

196215
ArrayInfo lInfo = getInfo(lhs);
197216
ArrayInfo rInfo = getInfo(rhs);

test/memory.cpp

Lines changed: 131 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -263,18 +263,15 @@ TEST(Memory, Assign)
263263
ASSERT_EQ(lock_bytes, 1 * step_bytes);
264264

265265
{
266-
// Should just a copy
267266
af::array b = af::randu(num / 2);
268267
a(af::seq(num / 2)) = b;
269268

270269
af::deviceMemInfo(&alloc_bytes, &alloc_buffers,
271270
&lock_bytes, &lock_buffers);
272271

273-
// FIXME: An extra buffer is used because of copy on write
274-
// Fix to not perform a copy when the buffer does not have children
275-
ASSERT_EQ(alloc_buffers, 3u);
272+
ASSERT_EQ(alloc_buffers, 2u);
276273
ASSERT_EQ(lock_buffers, 2u);
277-
ASSERT_EQ(alloc_bytes, 3 * step_bytes);
274+
ASSERT_EQ(alloc_bytes, 2 * step_bytes);
278275
ASSERT_EQ(lock_bytes, 2 * step_bytes);
279276
}
280277

@@ -283,13 +280,140 @@ TEST(Memory, Assign)
283280
af::deviceMemInfo(&alloc_bytes, &alloc_buffers,
284281
&lock_bytes, &lock_buffers);
285282

286-
ASSERT_EQ(alloc_buffers, 3u);
283+
ASSERT_EQ(alloc_buffers, 2u);
287284
ASSERT_EQ(lock_buffers, 1u);
288-
ASSERT_EQ(alloc_bytes, 3 * step_bytes);
285+
ASSERT_EQ(alloc_bytes, 2 * step_bytes);
289286
ASSERT_EQ(lock_bytes, 1 * step_bytes);
290287

291288
}
292289

290+
TEST(Memory, AssignLoop) // Column-wise assignment into an array with no other references; expects no extra buffer for a
291+
{
292+
size_t alloc_bytes, alloc_buffers;
293+
size_t lock_bytes, lock_buffers;
294+
295+
cleanSlate(); // Clean up everything done so far
296+
297+
const int num = step_bytes / sizeof(float); // number of floats that fit in step_bytes
298+
const int cols = 100;
299+
300+
af::array a = af::randu(num, cols);
301+
302+
af::deviceMemInfo(&alloc_bytes, &alloc_buffers,
303+
&lock_bytes, &lock_buffers);
304+
305+
ASSERT_EQ(alloc_buffers, 1u); // only a exists so far
306+
ASSERT_EQ(lock_buffers, 1u);
307+
ASSERT_EQ(alloc_bytes, cols * step_bytes);
308+
ASSERT_EQ(lock_bytes, cols * step_bytes);
309+
310+
for (int i = 0; i < cols; i++) {
311+
312+
af::array b = af::randu(num);
313+
a(af::span, i) = b; // overwrite column i of a with b
314+
315+
af::deviceMemInfo(&alloc_bytes, &alloc_buffers,
316+
&lock_bytes, &lock_buffers);
317+
318+
ASSERT_EQ(alloc_buffers, 2u); // 2, not 3: one buffer for a and one for b, no copy of a expected
319+
ASSERT_EQ(lock_buffers, 2u);
320+
ASSERT_EQ(alloc_bytes, (cols + 1) * step_bytes); // cols steps for a plus 1 step for b
321+
ASSERT_EQ(lock_bytes, (cols + 1) * step_bytes);
322+
}
323+
}
324+
325+
TEST(Memory, AssignRef) // Assignment into an array that has a second reference (a_ref); expects one extra buffer
326+
{
327+
size_t alloc_bytes, alloc_buffers;
328+
size_t lock_bytes, lock_buffers;
329+
330+
cleanSlate(); // Clean up everything done so far
331+
332+
const int num = step_bytes / sizeof(float); // number of floats that fit in step_bytes
333+
334+
af::array a = af::randu(num);
335+
af::array a_ref = a; // second reference to a; the asserts below expect no new buffer for it
336+
337+
af::deviceMemInfo(&alloc_bytes, &alloc_buffers,
338+
&lock_bytes, &lock_buffers);
339+
340+
ASSERT_EQ(alloc_buffers, 1u);
341+
ASSERT_EQ(lock_buffers, 1u);
342+
ASSERT_EQ(alloc_bytes, 1 * step_bytes);
343+
ASSERT_EQ(lock_bytes, 1 * step_bytes);
344+
345+
{
346+
af::array b = af::randu(num / 2);
347+
// a_ref still refers to a's data, so this assignment should do a full copy of a
348+
a(af::seq(num / 2)) = b;
349+
350+
af::deviceMemInfo(&alloc_bytes, &alloc_buffers,
351+
&lock_bytes, &lock_buffers);
352+
353+
ASSERT_EQ(alloc_buffers, 3u); // presumably a_ref's original data, a's copy, and b
354+
ASSERT_EQ(lock_buffers, 3u);
355+
ASSERT_EQ(alloc_bytes, 3 * step_bytes);
356+
ASSERT_EQ(lock_bytes, 3 * step_bytes);
357+
}
358+
359+
360+
// b going out of scope should not have deleted a
361+
af::deviceMemInfo(&alloc_bytes, &alloc_buffers,
362+
&lock_bytes, &lock_buffers);
363+
364+
ASSERT_EQ(alloc_buffers, 3u);
365+
ASSERT_EQ(lock_buffers, 2u); // buffers still held by a and a_ref
366+
ASSERT_EQ(alloc_bytes, 3 * step_bytes);
367+
ASSERT_EQ(lock_bytes, 2 * step_bytes); // buffers still held by a and a_ref
368+
369+
}
370+
371+
TEST(Memory, AssignRefLoop) // Column-wise assignment in a loop into an array that has a second reference (a_ref)
372+
{
373+
size_t alloc_bytes, alloc_buffers;
374+
size_t lock_bytes, lock_buffers;
375+
376+
cleanSlate(); // Clean up everything done so far
377+
378+
const int num = step_bytes / sizeof(float); // number of floats that fit in step_bytes
379+
const int cols = 100;
380+
381+
af::array a = af::randu(num, cols);
382+
af::array a_ref = a; // second reference to a; the asserts below expect no new buffer for it
383+
384+
af::deviceMemInfo(&alloc_bytes, &alloc_buffers,
385+
&lock_bytes, &lock_buffers);
386+
387+
ASSERT_EQ(alloc_buffers, 1u);
388+
ASSERT_EQ(lock_buffers, 1u);
389+
ASSERT_EQ(alloc_bytes, cols * step_bytes);
390+
ASSERT_EQ(lock_bytes, cols * step_bytes);
391+
392+
for (int i = 0; i < cols; i++) {
393+
394+
af::array b = af::randu(num);
395+
a(af::span, i) = b; // overwrite column i of a with b
396+
397+
af::deviceMemInfo(&alloc_bytes, &alloc_buffers,
398+
&lock_bytes, &lock_buffers);
399+
400+
ASSERT_EQ(alloc_buffers, 3u); // presumably a_ref's original data, a's copy, and b; expected to stay at 3 on every iteration
401+
ASSERT_EQ(lock_buffers, 3u);
402+
ASSERT_EQ(alloc_bytes, (2 * cols + 1) * step_bytes); // two buffers of cols steps each plus 1 step for b
403+
ASSERT_EQ(lock_bytes, (2 * cols + 1) * step_bytes);
404+
}
405+
406+
407+
// b going out of scope should not have deleted a
408+
af::deviceMemInfo(&alloc_bytes, &alloc_buffers,
409+
&lock_bytes, &lock_buffers);
410+
411+
ASSERT_EQ(alloc_buffers, 3u);
412+
ASSERT_EQ(lock_buffers, 2u); // buffers still held by a and a_ref
413+
ASSERT_EQ(alloc_bytes, (2 * cols + 1) * step_bytes);
414+
ASSERT_EQ(lock_bytes, 2 * cols * step_bytes); // buffers still held by a and a_ref
415+
416+
}
293417

294418
TEST(Memory, device)
295419
{

0 commit comments

Comments
 (0)