Skip to content

Commit 3581be2

Browse files
committed
Add ref_tracking option for object deduplication during serialization
When registering an extension type with ref_tracking: true, repeated objects are serialized as back-references instead of being re-encoded. This reduces payload size when the same object appears multiple times. Uses ext type 127 with a compact wire format: - New reference: nil marker followed by the serialized object - Back-reference: positive integer ref_id pointing to earlier object
1 parent 519b457 commit 3581be2

10 files changed

Lines changed: 495 additions & 1 deletion

File tree

bench/bench.rb

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
'status' => 200,
2323
'bytes' => 2326,
2424
'referer' => 'http://www.example.com/start.html',
25-
'agent' => 'Mozilla/4.08 [en] (Win98; I ;Nav)',
25+
'agent' => 'Mozilla/4.08 [en] (Win98; I ;Nav)'
2626
}
2727

2828
data_structured = MessagePack.pack(object_structured)
@@ -46,6 +46,41 @@ def self.from_msgpack_ext(data)
4646
extended_packer.register_type(0x00, Extended, :to_msgpack_ext)
4747
data_extended = extended_packer.pack(object_extended).to_s
4848

49+
# Struct for optimized_struct benchmarks
50+
BenchStruct = Struct.new(:name, :age, :email, :score, :active)
51+
52+
# Factory with optimized_struct (C-level fast path)
53+
factory_optimized = MessagePack::Factory.new
54+
factory_optimized.register_type(0x01, BenchStruct, optimized_struct: true)
55+
56+
# Factory with recursive packer/unpacker (Ruby-level)
57+
factory_recursive = MessagePack::Factory.new
58+
factory_recursive.register_type(
59+
0x01,
60+
BenchStruct,
61+
packer: lambda { |obj, packer|
62+
packer.write(obj.name)
63+
packer.write(obj.age)
64+
packer.write(obj.email)
65+
packer.write(obj.score)
66+
packer.write(obj.active)
67+
},
68+
unpacker: lambda { |unpacker|
69+
BenchStruct.new(unpacker.read, unpacker.read, unpacker.read, unpacker.read, unpacker.read)
70+
},
71+
recursive: true
72+
)
73+
74+
object_struct = BenchStruct.new('Alice', 30, 'alice@example.com', 95.5, true)
75+
data_struct_optimized = factory_optimized.dump(object_struct)
76+
data_struct_recursive = factory_recursive.dump(object_struct)
77+
78+
# Pre-create packers/unpackers for fair comparison (avoid factory overhead in loop)
79+
packer_optimized = factory_optimized.packer
80+
packer_recursive = factory_recursive.packer
81+
unpacker_optimized = factory_optimized.unpacker
82+
unpacker_recursive = factory_recursive.unpacker
83+
4984
Benchmark.ips do |x|
5085
x.report('pack-plain') do
5186
MessagePack.pack(object_plain)
@@ -61,6 +96,22 @@ def self.from_msgpack_ext(data)
6196
packer.pack(object_extended).to_s
6297
end
6398

99+
x.report('pack-struct-optimized') do
100+
packer_optimized.write(object_struct)
101+
packer_optimized.to_s
102+
packer_optimized.clear
103+
end
104+
105+
x.report('pack-struct-recursive') do
106+
packer_recursive.write(object_struct)
107+
packer_recursive.to_s
108+
packer_recursive.clear
109+
end
110+
111+
x.compare!
112+
end
113+
114+
Benchmark.ips do |x|
64115
x.report('unpack-plain') do
65116
MessagePack.unpack(data_plain)
66117
end
@@ -75,4 +126,18 @@ def self.from_msgpack_ext(data)
75126
unpacker.feed data_extended
76127
unpacker.read
77128
end
129+
130+
x.report('unpack-struct-optimized') do
131+
unpacker_optimized.feed(data_struct_optimized)
132+
unpacker_optimized.read
133+
unpacker_optimized.reset
134+
end
135+
136+
x.report('unpack-struct-recursive') do
137+
unpacker_recursive.feed(data_struct_recursive)
138+
unpacker_recursive.read
139+
unpacker_recursive.reset
140+
end
141+
142+
x.compare!
78143
end

ext/msgpack/factory_class.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ struct msgpack_factory_t {
3434
bool has_bigint_ext_type;
3535
bool has_symbol_ext_type;
3636
bool optimized_symbol_ext_type;
37+
bool has_ref_tracking_ext_type;
3738
int symbol_ext_type;
3839
};
3940

@@ -161,6 +162,7 @@ VALUE MessagePack_Factory_packer(int argc, VALUE* argv, VALUE self)
161162
msgpack_packer_ext_registry_borrow(packer, &fc->pkrg, &pk->ext_registry);
162163
pk->has_bigint_ext_type = fc->has_bigint_ext_type;
163164
pk->has_symbol_ext_type = fc->has_symbol_ext_type;
165+
pk->has_ref_tracking_ext_type = fc->has_ref_tracking_ext_type;
164166

165167
return packer;
166168
}
@@ -176,6 +178,7 @@ VALUE MessagePack_Factory_unpacker(int argc, VALUE* argv, VALUE self)
176178
msgpack_unpacker_ext_registry_borrow(fc->ukrg, &uk->ext_registry);
177179
uk->optimized_symbol_ext_type = fc->optimized_symbol_ext_type;
178180
uk->symbol_ext_type = fc->symbol_ext_type;
181+
uk->has_ref_tracking_ext_type = fc->has_ref_tracking_ext_type;
179182

180183
return unpacker;
181184
}
@@ -271,6 +274,12 @@ static VALUE Factory_register_type_internal(VALUE self, VALUE rb_ext_type, VALUE
271274
packer_proc = ext_module;
272275
unpacker_proc = ext_module;
273276
}
277+
278+
/* ref_tracking: true enables deduplication of repeated objects */
279+
if (RTEST(rb_hash_aref(options, ID2SYM(rb_intern("ref_tracking"))))) {
280+
flags |= MSGPACK_EXT_REF_TRACKING;
281+
fc->has_ref_tracking_ext_type = true;
282+
}
274283
}
275284

276285
msgpack_packer_ext_registry_put(self, &fc->pkrg, ext_module, ext_type, flags, packer_proc);

ext/msgpack/packer.c

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,17 @@
2626
void msgpack_packer_init(msgpack_packer_t* pk)
2727
{
2828
msgpack_buffer_init(PACKER_BUFFER_(pk));
29+
pk->ref_table = NULL;
30+
pk->next_ref_id = 1; /* 1-indexed */
2931
}
3032

3133
void msgpack_packer_destroy(msgpack_packer_t* pk)
3234
{
3335
msgpack_buffer_destroy(PACKER_BUFFER_(pk));
36+
if (pk->ref_table) {
37+
st_free_table(pk->ref_table);
38+
pk->ref_table = NULL;
39+
}
3440
}
3541

3642
void msgpack_packer_mark(msgpack_packer_t* pk)
@@ -46,6 +52,68 @@ void msgpack_packer_reset(msgpack_packer_t* pk)
4652
msgpack_buffer_clear(PACKER_BUFFER_(pk));
4753

4854
pk->buffer_ref = Qnil;
55+
56+
/* Reset ref tracking state */
57+
if (pk->ref_table) {
58+
st_clear(pk->ref_table);
59+
}
60+
pk->next_ref_id = 1;
61+
}
62+
63+
/*
64+
* Write a back-reference to a previously serialized object.
65+
* Wire format: ext type 127 followed by msgpack integer ref_id
66+
* We use fixext 1 with a 0 byte as a marker, then write the ref_id as a normal msgpack int.
67+
*/
68+
static void msgpack_packer_write_back_ref(msgpack_packer_t* pk, long ref_id)
69+
{
70+
/* fixext 1, type 127, payload 0x01 (marker for back-ref) */
71+
msgpack_buffer_ensure_writable(PACKER_BUFFER_(pk), 3);
72+
msgpack_buffer_write_2(PACKER_BUFFER_(pk), 0xd4, MSGPACK_EXT_REF_TYPE);
73+
msgpack_buffer_write_1(PACKER_BUFFER_(pk), 0x01);
74+
/* Write ref_id as a variable-length msgpack integer */
75+
msgpack_packer_write_long(pk, ref_id);
76+
}
77+
78+
/*
79+
* Write a new reference marker followed by the object.
80+
* Wire format: ext type 127 with payload = [nil, serialized_object]
81+
* The nil indicates this is a new ref (vs back-ref which has positive int).
82+
*/
83+
static void msgpack_packer_write_new_ref_header(msgpack_packer_t* pk)
84+
{
85+
/* We write: ext header (variable len) + nil (1 byte) + object (variable)
86+
* Since we don't know the total length yet, we use a different approach:
87+
* Write nil as ext payload marker, then the object follows in the stream.
88+
*
89+
* Actually, let's use fixext 1 with nil (0xc0) as the 1-byte payload.
90+
* The unpacker will see ext type 127 with payload [0xc0] and know it's a new ref,
91+
* then read the next object from the stream.
92+
*/
93+
msgpack_buffer_ensure_writable(PACKER_BUFFER_(pk), 3);
94+
msgpack_buffer_write_2(PACKER_BUFFER_(pk), 0xd4, MSGPACK_EXT_REF_TYPE); /* fixext 1, type 127 */
95+
msgpack_buffer_write_1(PACKER_BUFFER_(pk), 0xc0); /* nil marker */
96+
}
97+
98+
/*
99+
* Check if a value was already serialized and return its ref_id if so.
100+
* If not found, registers the value and returns 0.
101+
*/
102+
static long msgpack_packer_check_ref(msgpack_packer_t* pk, VALUE v)
103+
{
104+
if (!pk->ref_table) {
105+
pk->ref_table = st_init_numtable();
106+
}
107+
108+
st_data_t ref_id;
109+
if (st_lookup(pk->ref_table, (st_data_t)v, &ref_id)) {
110+
return (long)ref_id;
111+
}
112+
113+
/* Not found - register this value */
114+
st_insert(pk->ref_table, (st_data_t)v, (st_data_t)pk->next_ref_id);
115+
pk->next_ref_id++;
116+
return 0; /* 0 means "not a back-reference" */
49117
}
50118

51119

@@ -136,6 +204,18 @@ bool msgpack_packer_try_write_with_ext_type_lookup(msgpack_packer_t* pk, VALUE v
136204
return false;
137205
}
138206

207+
/* Handle ref_tracking: check if we've seen this object before */
208+
if (ext_flags & MSGPACK_EXT_REF_TRACKING) {
209+
long ref_id = msgpack_packer_check_ref(pk, v);
210+
if (ref_id > 0) {
211+
/* Already seen - write back-reference */
212+
msgpack_packer_write_back_ref(pk, ref_id);
213+
return true;
214+
}
215+
/* Not seen before - write new-ref header, then continue to serialize normally */
216+
msgpack_packer_write_new_ref_header(pk);
217+
}
218+
139219
if(ext_flags & MSGPACK_EXT_STRUCT_FAST_PATH) {
140220
/* Fast path for Struct: directly access fields in C, no Ruby callbacks */
141221
VALUE held_buffer = MessagePack_Buffer_hold(&pk->buffer);

ext/msgpack/packer.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@
2121
#include "buffer.h"
2222
#include "packer_ext_registry.h"
2323

24+
/* Extension type for reference tracking (used for deduplication) */
25+
#define MSGPACK_EXT_REF_TYPE 127
26+
2427
#ifndef MSGPACK_PACKER_IO_FLUSH_THRESHOLD_TO_WRITE_STRING_BODY
2528
#define MSGPACK_PACKER_IO_FLUSH_THRESHOLD_TO_WRITE_STRING_BODY (1024)
2629
#endif
@@ -44,6 +47,11 @@ struct msgpack_packer_t {
4447
bool compatibility_mode;
4548
bool has_bigint_ext_type;
4649
bool has_symbol_ext_type;
50+
bool has_ref_tracking_ext_type;
51+
52+
/* reference tracking for deduplication */
53+
st_table *ref_table; /* maps VALUE -> ref_id (1-indexed) */
54+
long next_ref_id;
4755

4856
/* options */
4957
bool comaptibility_mode;

ext/msgpack/packer_class.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,13 @@ static VALUE Packer_reset(VALUE self)
304304
{
305305
msgpack_packer_t *pk = MessagePack_Packer_get(self);
306306
msgpack_buffer_clear(PACKER_BUFFER_(pk));
307+
308+
/* Reset ref tracking state */
309+
if (pk->ref_table) {
310+
st_clear(pk->ref_table);
311+
}
312+
pk->next_ref_id = 1;
313+
307314
return Qnil;
308315
}
309316

ext/msgpack/packer_ext_registry.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
#define MSGPACK_EXT_RECURSIVE 0b0001
2525
#define MSGPACK_EXT_STRUCT_FAST_PATH 0b0010
26+
#define MSGPACK_EXT_REF_TRACKING 0b0100
2627

2728
struct msgpack_packer_ext_registry_t;
2829
typedef struct msgpack_packer_ext_registry_t msgpack_packer_ext_registry_t;

0 commit comments

Comments
 (0)