crush.h source code [linux/include/linux/crush/crush.h]

/* SPDX-License-Identifier: GPL-2.0 */
2	#ifndef CEPH_CRUSH_CRUSH_H
3	#define CEPH_CRUSH_CRUSH_H
4
5	#ifdef __KERNEL__
6	# include <linux/rbtree.h>
7	# include <linux/types.h>
8	#else
9	# include "crush_compat.h"
10	#endif
11
12	/*
13	* CRUSH is a pseudo-random data distribution algorithm that
14	* efficiently distributes input values (typically, data objects)
15	* across a heterogeneous, structured storage cluster.
16	*
17	* The algorithm was originally described in detail in this paper
18	* (although the algorithm has evolved somewhat since then):
19	*
20	* https://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
21	*
22	* LGPL2
23	*/
24
25
/*
 * Global CRUSH limits and sentinel values.
 */
#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */

#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
#define CRUSH_MAX_RULESET (1<<8) /* max crush ruleset number */
#define CRUSH_MAX_RULES CRUSH_MAX_RULESET /* should be the same as max rulesets */

/*
 * Upper bounds for weights.  NOTE(review): weights elsewhere in this
 * header are described as 16.16 fixed point, so 0x10000 presumably
 * stands for a weight of 1.0 -- confirm against the builder.
 */
#define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u)
#define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u)

#define CRUSH_ITEM_UNDEF 0x7ffffffe /* undefined result (internal use only) */
#define CRUSH_ITEM_NONE 0x7fffffff /* no result */
37
38	/*
39	* CRUSH uses user-defined "rules" to describe how inputs should be
40	* mapped to devices. A rule consists of sequence of steps to perform
41	* to generate the set of output devices.
42	*/
43	struct crush_rule_step {
44	__u32 op;
45	__s32 arg1;
46	__s32 arg2;
47	};
48
/*
 * step op codes (stored in crush_rule_step.op).
 * Note that the value 5 is not assigned by this enum.
 */
enum {
	CRUSH_RULE_NOOP = 0,
	CRUSH_RULE_TAKE = 1,          /* arg1 = value to start with */
	CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
				      /* arg2 = type */
	CRUSH_RULE_CHOOSE_INDEP = 3,  /* same */
	CRUSH_RULE_EMIT = 4,          /* no args */
	CRUSH_RULE_CHOOSELEAF_FIRSTN = 6,
	CRUSH_RULE_CHOOSELEAF_INDEP = 7,

	CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */
	CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
	CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
	CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
	CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12,
	CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13
};
67
/*
 * for specifying choose num (arg1) relative to the max parameter
 * passed to do_rule
 *
 * CRUSH_CHOOSE_N is 0; CRUSH_CHOOSE_N_MINUS(x) is the negation of x.
 */
#define CRUSH_CHOOSE_N 0
#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
74
75	/*
76	* The rule mask is used to describe what the rule is intended for.
77	* Given a ruleset and size of output set, we search through the
78	* rule list for a matching rule_mask.
79	*/
80	struct crush_rule_mask {
81	__u8 ruleset;
82	__u8 type;
83	__u8 min_size;
84	__u8 max_size;
85	};
86
87	struct crush_rule {
88	__u32 len;
89	struct crush_rule_mask mask;
90	struct crush_rule_step steps[];
91	};
92
/*
 * Number of bytes occupied by a struct crush_rule that carries
 * len steps: the fixed header plus len trailing crush_rule_step
 * entries.
 */
#define crush_rule_size(len) (sizeof(struct crush_rule) + \
			      (len)*sizeof(struct crush_rule_step))
95
96
97
/*
 * A bucket is a named container of other items (either devices or
 * other buckets).  Items within a bucket are chosen using one of a
 * few different algorithms.  The table summarizes how the speed of
 * each option measures up against mapping stability when items are
 * added or removed.
 *
 *  Bucket Alg     Speed       Additions    Removals
 *  ------------------------------------------------
 *  uniform         O(1)       poor         poor
 *  list            O(n)       optimal      poor
 *  tree            O(log n)   good         good
 *  straw           O(n)       better       better
 *  straw2          O(n)       optimal      optimal
 *
 * Values start at 1; 0 is not assigned to any algorithm.
 */
enum {
	CRUSH_BUCKET_UNIFORM = 1,
	CRUSH_BUCKET_LIST = 2,
	CRUSH_BUCKET_TREE = 3,
	CRUSH_BUCKET_STRAW = 4,
	CRUSH_BUCKET_STRAW2 = 5,
};
/*
 * Name lookup for a bucket algorithm; alg is one of the
 * CRUSH_BUCKET_* values.  Defined outside this header.
 * NOTE(review): behaviour for unknown alg values is not visible
 * here -- check the definition before relying on it.
 */
extern const char *crush_bucket_alg_name(int alg);
121
/*
 * although tree was a legacy algorithm, it has been buggy, so
 * exclude it.
 *
 * Bitmask with one bit per allowed algorithm, at bit position
 * CRUSH_BUCKET_<alg>.
 */
#define CRUSH_LEGACY_ALLOWED_BUCKET_ALGS (	\
		(1 << CRUSH_BUCKET_UNIFORM) |	\
		(1 << CRUSH_BUCKET_LIST) |	\
		(1 << CRUSH_BUCKET_STRAW))
130
131	struct crush_bucket {
132	__s32 id; / this'll be negative /
133	__u16 type; / non-zero; type=0 is reserved for devices /
134	__u8 alg; / one of CRUSH_BUCKET_* /
135	__u8 hash; / which hash function to use, CRUSH_HASH_* /
136	__u32 weight; / 16-bit fixed point /
137	__u32 size; / num items /
138	__s32 *items;
139
140	};
141
142	/* @ingroup API*
143	*
144	* Replacement weights for each item in a bucket. The size of the
145	* array must be exactly the size of the straw2 bucket, just as the
146	* item_weights array.
147	*
148	*/
149	struct crush_weight_set {
150	__u32 weights; /!< 16.16 fixed point weights
151	in the same order as items /*
152	__u32 size; /!< size of the __weights__ array /
153	};
154
155	/* @ingroup API*
156	*
157	* Replacement weights and ids for a given straw2 bucket, for
158	* placement purposes.
159	*
160	* When crush_do_rule() chooses the Nth item from a straw2 bucket, the
161	* replacement weights found at __weight_set[N]__ are used instead of
162	* the weights from __item_weights__. If __N__ is greater than
163	* __weight_set_size__, the weights found at __weight_set_size-1__ are
164	* used instead. For instance if __weight_set__ is:
165	*
166	* [ [ 0x10000, 0x20000 ], // position 0
167	* [ 0x20000, 0x40000 ] ] // position 1
168	*
169	* choosing the 0th item will use position 0 weights [ 0x10000, 0x20000 ]
170	* choosing the 1th item will use position 1 weights [ 0x20000, 0x40000 ]
171	* choosing the 2th item will use position 1 weights [ 0x20000, 0x40000 ]
172	* etc.
173	*
174	*/
175	struct crush_choose_arg {
176	__s32 ids; /!< values to use instead of items /*
177	__u32 ids_size; /!< size of the __ids__ array /
178	struct crush_weight_set weight_set; /!< weight replacements for
179	a given position /*
180	__u32 weight_set_size; /!< size of the __weight_set__ array /
181	};
182
183	/* @ingroup API*
184	*
185	* Replacement weights and ids for each bucket in the crushmap. The
186	* __size__ of the __args__ array must be exactly the same as the
187	* __map->max_buckets__.
188	*
189	* The __crush_choose_arg__ at index N will be used when choosing
190	* an item from the bucket __map->buckets[N]__ bucket, provided it
191	* is a straw2 bucket.
192	*
193	*/
194	struct crush_choose_arg_map {
195	#ifdef __KERNEL__
196	struct rb_node node;
197	s64 choose_args_index;
198	#endif
199	struct crush_choose_arg args; /!< replacement for each bucket
200	in the crushmap /*
201	__u32 size; /!< size of the __args__ array /
202	};
203
204	struct crush_bucket_uniform {
205	struct crush_bucket h;
206	__u32 item_weight; / 16-bit fixed point; all items equally weighted /
207	};
208
209	struct crush_bucket_list {
210	struct crush_bucket h;
211	__u32 item_weights; /* 16-bit fixed point /
212	__u32 sum_weights; /* 16-bit fixed point. element i is sum*
213	of weights 0..i, inclusive /*
214	};
215
216	struct crush_bucket_tree {
217	struct crush_bucket h; / note: h.size is _tree_ size, not number of*
218	actual items /*
219	__u8 num_nodes;
220	__u32 *node_weights;
221	};
222
223	struct crush_bucket_straw {
224	struct crush_bucket h;
225	__u32 item_weights; /* 16-bit fixed point /
226	__u32 straws; /* 16-bit fixed point /
227	};
228
229	struct crush_bucket_straw2 {
230	struct crush_bucket h;
231	__u32 item_weights; /* 16-bit fixed point /
232	};
233
234
235
236	/*
237	* CRUSH map includes all buckets, rules, etc.
238	*/
239	struct crush_map {
240	struct crush_bucket **buckets;
241	struct crush_rule **rules;
242
243	__s32 max_buckets;
244	__u32 max_rules;
245	__s32 max_devices;
246
247	/ choose local retries before re-descent /
248	__u32 choose_local_tries;
249	/ choose local attempts using a fallback permutation before*
250	* re-descent */
251	__u32 choose_local_fallback_tries;
252	/ choose attempts before giving up /
253	__u32 choose_total_tries;
254	/ attempt chooseleaf inner descent once for firstn mode; on*
255	* reject retry outer descent. Note that this does not
256	* apply to a collision: in that case we will retry as we used
257	* to. */
258	__u32 chooseleaf_descend_once;
259
260	/ if non-zero, feed r into chooseleaf, bit-shifted right by (r-1)*
261	* bits. a value of 1 is best for new clusters. for legacy clusters
262	* that want to limit reshuffling, a value of 3 or 4 will make the
263	* mappings line up a bit better with previous mappings. */
264	__u8 chooseleaf_vary_r;
265
266	/ if true, it makes chooseleaf firstn to return stable results (if*
267	* no local retry) so that data migrations would be optimal when some
268	* device fails. */
269	__u8 chooseleaf_stable;
270
271	/*
272	* This value is calculated after decode or construction by
273	* the builder. It is exposed here (rather than having a
274	* 'build CRUSH working space' function) so that callers can
275	* reserve a static buffer, allocate space on the stack, or
276	* otherwise avoid calling into the heap allocator if they
277	* want to. The size of the working space depends on the map,
278	* while the size of the scratch vector passed to the mapper
279	* depends on the size of the desired result set.
280	*
281	* Nothing stops the caller from allocating both in one swell
282	* foop and passing in two points, though.
283	*/
284	size_t working_size;
285
286	#ifndef __KERNEL__
287	/*
288	* version 0 (original) of straw_calc has various flaws. version 1
289	* fixes a few of them.
290	*/
291	__u8 straw_calc_version;
292
293	/*
294	* allowed bucket algs is a bitmask, here the bit positions
295	* are CRUSH_BUCKET_. note that these are bits* and
296	* CRUSH_BUCKET_* values are not, so we need to or together (1
297	* << CRUSH_BUCKET_WHATEVER). The 0th bit is not used to
298	* minimize confusion (bucket type values start at 1).
299	*/
300	__u32 allowed_bucket_algs;
301
302	__u32 *choose_tries;
303	#else
304	/ device/bucket type id -> type name (CrushWrapper::type_map) /
305	struct rb_root type_names;
306
307	/ device/bucket id -> name (CrushWrapper::name_map) /
308	struct rb_root names;
309
310	/ CrushWrapper::choose_args /
311	struct rb_root choose_args;
312	#endif
313	};
314
315
/* crush.c */

/* Weight lookup for the item at position pos of bucket b. */
extern int crush_get_bucket_item_weight(const struct crush_bucket *b, int pos);

/* Destructors for each bucket type, and a generic bucket destructor. */
extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
extern void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b);
extern void crush_destroy_bucket(struct crush_bucket *b);

/* Destructors for a single rule and for an entire map. */
extern void crush_destroy_rule(struct crush_rule *r);
extern void crush_destroy(struct crush_map *map);
326
/*
 * Compute ((i + 1) << 1) - 1, which equals 2*i + 1 for non-negative i.
 * NOTE(review): presumably maps item index i to its node index in a
 * tree bucket's node_weights array -- confirm against the callers.
 */
static inline int crush_calc_tree_node(int i)
{
	return ((i+1) << 1)-1;
}
331
332	/*
333	* These data structures are private to the CRUSH implementation. They
334	* are exposed in this header file because builder needs their
335	* definitions to calculate the total working size.
336	*
337	* Moving this out of the crush map allow us to treat the CRUSH map as
338	* immutable within the mapper and removes the requirement for a CRUSH
339	* map lock.
340	*/
341	struct crush_work_bucket {
342	__u32 perm_x; / @x for which perm is defined /*
343	__u32 perm_n; / num elements of perm that are permuted/defined /*
344	__u32 perm; /* Permutation of the bucket's items /
345	};
346
/*
 * Working space handed to the mapper: an array of pointers to
 * per-bucket working state.
 */
struct crush_work {
	struct crush_work_bucket **work; /* Per-bucket working store */
#ifdef __KERNEL__
	struct list_head item;
#endif
};
353
#ifdef __KERNEL__
/* osdmap.c */

/* Kernel-only helpers defined in osdmap.c. */
void clear_crush_names(struct rb_root *root);
void clear_choose_args(struct crush_map *c);
#endif
359
360	#endif
361

source code of linux/include/linux/crush/crush.h