raid5.c source code [linux/drivers/md/raid5.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* raid5.c : Multiple Devices driver for Linux
4	* Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
5	* Copyright (C) 1999, 2000 Ingo Molnar
6	* Copyright (C) 2002, 2003 H. Peter Anvin
7	*
8	* RAID-4/5/6 management functions.
9	* Thanks to Penguin Computing for making the RAID-6 development possible
10	* by donating a test server!
11	*/
12
/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in.  This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment seq_flush, thus closing the current
 *   batch.
 * When we notice that seq_flush > seq_write, we write out all pending updates
 * to the bitmap, and advance seq_write to where seq_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * miss any bits.
 */
37
38	#include <linux/blkdev.h>
39	#include <linux/kthread.h>
40	#include <linux/raid/pq.h>
41	#include <linux/async_tx.h>
42	#include <linux/module.h>
43	#include <linux/async.h>
44	#include <linux/seq_file.h>
45	#include <linux/cpu.h>
46	#include <linux/slab.h>
47	#include <linux/ratelimit.h>
48	#include <linux/nodemask.h>
49
50	#include <trace/events/block.h>
51	#include <linux/list_sort.h>
52
53	#include "md.h"
54	#include "raid5.h"
55	#include "raid0.h"
56	#include "md-bitmap.h"
57	#include "raid5-log.h"
58
59	#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)
60
61	#define cpu_to_group(cpu) cpu_to_node(cpu)
62	#define ANY_GROUP NUMA_NO_NODE
63
64	#define RAID5_MAX_REQ_STRIPES 256
65
66	static bool devices_handle_discard_safely = false;
67	module_param(devices_handle_discard_safely, bool, `0644`);
68	MODULE_PARM_DESC(devices_handle_discard_safely,
69	"Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
70	static struct workqueue_struct *raid5_wq;
71
72	static void raid5_quiesce(struct mddev mddev, int* quiesce);
73
74	static inline struct hlist_head stripe_hash(struct* r5conf *conf, sector_t sect)
75	{
76	int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
77	return &conf->stripe_hashtbl[hash];
78	}
79
80	static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
81	{
82	return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
83	}
84
85	static inline void lock_device_hash_lock(struct r5conf conf, int* hash)
86	__acquires(&conf->device_lock)
87	{
88	spin_lock_irq(lock: conf->hash_locks + hash);
89	spin_lock(lock: &conf->device_lock);
90	}
91
92	static inline void unlock_device_hash_lock(struct r5conf conf, int* hash)
93	__releases(&conf->device_lock)
94	{
95	spin_unlock(lock: &conf->device_lock);
96	spin_unlock_irq(lock: conf->hash_locks + hash);
97	}
98
99	static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
100	__acquires(&conf->device_lock)
101	{
102	int i;
103	spin_lock_irq(lock: conf->hash_locks);
104	for (i = `1`; i < NR_STRIPE_HASH_LOCKS; i++)
105	spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
106	spin_lock(lock: &conf->device_lock);
107	}
108
109	static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
110	__releases(&conf->device_lock)
111	{
112	int i;
113	spin_unlock(lock: &conf->device_lock);
114	for (i = NR_STRIPE_HASH_LOCKS - `1`; i; i--)
115	spin_unlock(lock: conf->hash_locks + i);
116	spin_unlock_irq(lock: conf->hash_locks);
117	}
118
119	/ Find first data disk in a raid6 stripe /
120	static inline int raid6_d0(struct stripe_head *sh)
121	{
122	if (sh->ddf_layout)
123	/ ddf always start from first device /
124	return `0`;
125	/ md starts just after Q block /
126	if (sh->qd_idx == sh->disks - `1`)
127	return `0`;
128	else
129	return sh->qd_idx + `1`;
130	}
/*
 * Return the index of the disk following @disk, wrapping back to 0 once
 * the incremented index reaches @raid_disks.
 */
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}
136
137	/ When walking through the disks in a raid5, starting at raid6_d0,*
138	* We need to map each disk to a 'slot', where the data disks are slot
139	* 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
140	* is raid_disks-1. This help does that mapping.
141	*/
142	static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
143	int count, int* syndrome_disks)
144	{
145	int slot = *count;
146
147	if (sh->ddf_layout)
148	(*count)++;
149	if (idx == sh->pd_idx)
150	return syndrome_disks;
151	if (idx == sh->qd_idx)
152	return syndrome_disks + `1`;
153	if (!sh->ddf_layout)
154	(*count)++;
155	return slot;
156	}
157
static void print_raid5_conf(struct r5conf *conf);
159
160	static int stripe_operations_active(struct stripe_head *sh)
161	{
162	return sh->check_state \|\| sh->reconstruct_state \|\|
163	test_bit(STRIPE_BIOFILL_RUN, &sh->state) \|\|
164	test_bit(STRIPE_COMPUTE_RUN, &sh->state);
165	}
166
167	static bool stripe_is_lowprio(struct stripe_head *sh)
168	{
169	return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) \|\|
170	test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
171	!test_bit(STRIPE_R5C_CACHING, &sh->state);
172	}
173
174	static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
175	__must_hold(&sh->raid_conf->device_lock)
176	{
177	struct r5conf *conf = sh->raid_conf;
178	struct r5worker_group *group;
179	int thread_cnt;
180	int i, cpu = sh->cpu;
181
182	if (!cpu_online(cpu)) {
183	cpu = cpumask_any(cpu_online_mask);
184	sh->cpu = cpu;
185	}
186
187	if (list_empty(head: &sh->lru)) {
188	struct r5worker_group *group;
189	group = conf->worker_groups + cpu_to_group(cpu);
190	if (stripe_is_lowprio(sh))
191	list_add_tail(new: &sh->lru, head: &group->loprio_list);
192	else
193	list_add_tail(new: &sh->lru, head: &group->handle_list);
194	group->stripes_cnt++;
195	sh->group = group;
196	}
197
198	if (conf->worker_cnt_per_group == `0`) {
199	md_wakeup_thread(conf->mddev->thread);
200	return;
201	}
202
203	group = conf->worker_groups + cpu_to_group(sh->cpu);
204
205	group->workers[`0`].working = true;
206	/ at least one worker should run to avoid race /
207	queue_work_on(cpu: sh->cpu, wq: raid5_wq, work: &group->workers[`0`].work);
208
209	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - `1`;
210	/ wakeup more workers /
211	for (i = `1`; i < conf->worker_cnt_per_group && thread_cnt > `0`; i++) {
212	if (group->workers[i].working == false) {
213	group->workers[i].working = true;
214	queue_work_on(cpu: sh->cpu, wq: raid5_wq,
215	work: &group->workers[i].work);
216	thread_cnt--;
217	}
218	}
219	}
220
221	static void do_release_stripe(struct r5conf conf, struct* stripe_head *sh,
222	struct list_head *temp_inactive_list)
223	__must_hold(&conf->device_lock)
224	{
225	int i;
226	int injournal = `0`; / number of date pages with R5_InJournal /
227
228	BUG_ON(!list_empty(&sh->lru));
229	BUG_ON(atomic_read(&conf->active_stripes)==`0`);
230
231	if (r5c_is_writeback(log: conf->log))
232	for (i = sh->disks; i--; )
233	if (test_bit(R5_InJournal, &sh->dev[i].flags))
234	injournal++;
235	/*
236	* In the following cases, the stripe cannot be released to cached
237	* lists. Therefore, we make the stripe write out and set
238	* STRIPE_HANDLE:
239	* 1. when quiesce in r5c write back;
240	* 2. when resync is requested fot the stripe.
241	*/
242	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) \|\|
243	(conf->quiesce && r5c_is_writeback(log: conf->log) &&
244	!test_bit(STRIPE_HANDLE, &sh->state) && injournal != `0`)) {
245	if (test_bit(STRIPE_R5C_CACHING, &sh->state))
246	r5c_make_stripe_write_out(sh);
247	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
248	}
249
250	if (test_bit(STRIPE_HANDLE, &sh->state)) {
251	if (test_bit(STRIPE_DELAYED, &sh->state) &&
252	!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
253	list_add_tail(new: &sh->lru, head: &conf->delayed_list);
254	else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
255	sh->bm_seq - conf->seq_write > `0`)
256	list_add_tail(new: &sh->lru, head: &conf->bitmap_list);
257	else {
258	clear_bit(nr: STRIPE_DELAYED, addr: &sh->state);
259	clear_bit(nr: STRIPE_BIT_DELAY, addr: &sh->state);
260	if (conf->worker_cnt_per_group == `0`) {
261	if (stripe_is_lowprio(sh))
262	list_add_tail(new: &sh->lru,
263	head: &conf->loprio_list);
264	else
265	list_add_tail(new: &sh->lru,
266	head: &conf->handle_list);
267	} else {
268	raid5_wakeup_stripe_thread(sh);
269	return;
270	}
271	}
272	md_wakeup_thread(conf->mddev->thread);
273	} else {
274	BUG_ON(stripe_operations_active(sh));
275	if (test_and_clear_bit(nr: STRIPE_PREREAD_ACTIVE, addr: &sh->state))
276	if (atomic_dec_return(v: &conf->preread_active_stripes)
277	< IO_THRESHOLD)
278	md_wakeup_thread(conf->mddev->thread);
279	atomic_dec(v: &conf->active_stripes);
280	if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
281	if (!r5c_is_writeback(log: conf->log))
282	list_add_tail(new: &sh->lru, head: temp_inactive_list);
283	else {
284	WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
285	if (injournal == `0`)
286	list_add_tail(new: &sh->lru, head: temp_inactive_list);
287	else if (injournal == conf->raid_disks - conf->max_degraded) {
288	/ full stripe /
289	if (!test_and_set_bit(nr: STRIPE_R5C_FULL_STRIPE, addr: &sh->state))
290	atomic_inc(v: &conf->r5c_cached_full_stripes);
291	if (test_and_clear_bit(nr: STRIPE_R5C_PARTIAL_STRIPE, addr: &sh->state))
292	atomic_dec(v: &conf->r5c_cached_partial_stripes);
293	list_add_tail(new: &sh->lru, head: &conf->r5c_full_stripe_list);
294	r5c_check_cached_full_stripe(conf);
295	} else
296	/*
297	* STRIPE_R5C_PARTIAL_STRIPE is set in
298	* r5c_try_caching_write(). No need to
299	* set it again.
300	*/
301	list_add_tail(new: &sh->lru, head: &conf->r5c_partial_stripe_list);
302	}
303	}
304	}
305	}
306
307	static void __release_stripe(struct r5conf conf, struct* stripe_head *sh,
308	struct list_head *temp_inactive_list)
309	__must_hold(&conf->device_lock)
310	{
311	if (atomic_dec_and_test(v: &sh->count))
312	do_release_stripe(conf, sh, temp_inactive_list);
313	}
314
315	/*
316	* @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
317	*
318	* Be careful: Only one task can add/delete stripes from temp_inactive_list at
319	* given time. Adding stripes only takes device lock, while deleting stripes
320	* only takes hash lock.
321	*/
322	static void release_inactive_stripe_list(struct r5conf *conf,
323	struct list_head *temp_inactive_list,
324	int hash)
325	{
326	int size;
327	bool do_wakeup = false;
328	unsigned long flags;
329
330	if (hash == NR_STRIPE_HASH_LOCKS) {
331	size = NR_STRIPE_HASH_LOCKS;
332	hash = NR_STRIPE_HASH_LOCKS - `1`;
333	} else
334	size = `1`;
335	while (size) {
336	struct list_head *list = &temp_inactive_list[size - `1`];
337
338	/*
339	* We don't hold any lock here yet, raid5_get_active_stripe() might
340	* remove stripes from the list
341	*/
342	if (!list_empty_careful(head: list)) {
343	spin_lock_irqsave(conf->hash_locks + hash, flags);
344	if (list_empty(head: conf->inactive_list + hash) &&
345	!list_empty(head: list))
346	atomic_dec(v: &conf->empty_inactive_list_nr);
347	list_splice_tail_init(list, head: conf->inactive_list + hash);
348	do_wakeup = true;
349	spin_unlock_irqrestore(lock: conf->hash_locks + hash, flags);
350	}
351	size--;
352	hash--;
353	}
354
355	if (do_wakeup) {
356	wake_up(&conf->wait_for_stripe);
357	if (atomic_read(v: &conf->active_stripes) == `0`)
358	wake_up(&conf->wait_for_quiescent);
359	if (conf->retry_read_aligned)
360	md_wakeup_thread(conf->mddev->thread);
361	}
362	}
363
364	static int release_stripe_list(struct r5conf *conf,
365	struct list_head *temp_inactive_list)
366	__must_hold(&conf->device_lock)
367	{
368	struct stripe_head sh, t;
369	int count = `0`;
370	struct llist_node *head;
371
372	head = llist_del_all(head: &conf->released_stripes);
373	head = llist_reverse_order(head);
374	llist_for_each_entry_safe(sh, t, head, release_list) {
375	int hash;
376
377	/ sh could be readded after STRIPE_ON_RELEASE_LIST is cleard /
378	smp_mb();
379	clear_bit(nr: STRIPE_ON_RELEASE_LIST, addr: &sh->state);
380	/*
381	* Don't worry the bit is set here, because if the bit is set
382	* again, the count is always > 1. This is true for
383	* STRIPE_ON_UNPLUG_LIST bit too.
384	*/
385	hash = sh->hash_lock_index;
386	__release_stripe(conf, sh, temp_inactive_list: &temp_inactive_list[hash]);
387	count++;
388	}
389
390	return count;
391	}
392
393	void raid5_release_stripe(struct stripe_head *sh)
394	{
395	struct r5conf *conf = sh->raid_conf;
396	unsigned long flags;
397	struct list_head list;
398	int hash;
399	bool wakeup;
400
401	/ Avoid release_list until the last reference.*
402	*/
403	if (atomic_add_unless(v: &sh->count, a: -`1`, u: `1`))
404	return;
405
406	if (unlikely(!conf->mddev->thread) \|\|
407	test_and_set_bit(nr: STRIPE_ON_RELEASE_LIST, addr: &sh->state))
408	goto slow_path;
409	wakeup = llist_add(new: &sh->release_list, head: &conf->released_stripes);
410	if (wakeup)
411	md_wakeup_thread(conf->mddev->thread);
412	return;
413	slow_path:
414	/ we are ok here if STRIPE_ON_RELEASE_LIST is set or not /
415	if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) {
416	INIT_LIST_HEAD(list: &list);
417	hash = sh->hash_lock_index;
418	do_release_stripe(conf, sh, temp_inactive_list: &list);
419	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
420	release_inactive_stripe_list(conf, temp_inactive_list: &list, hash);
421	}
422	}
423
424	static inline void remove_hash(struct stripe_head *sh)
425	{
426	pr_debug("remove_hash(), stripe %llu\n",
427	(unsigned long long)sh->sector);
428
429	hlist_del_init(n: &sh->hash);
430	}
431
432	static inline void insert_hash(struct r5conf conf, struct* stripe_head *sh)
433	{
434	struct hlist_head *hp = stripe_hash(conf, sect: sh->sector);
435
436	pr_debug("insert_hash(), stripe %llu\n",
437	(unsigned long long)sh->sector);
438
439	hlist_add_head(n: &sh->hash, h: hp);
440	}
441
442	/ find an idle stripe, make sure it is unhashed, and return it. /
443	static struct stripe_head get_free_stripe(struct* r5conf conf, int* hash)
444	{
445	struct stripe_head *sh = NULL;
446	struct list_head *first;
447
448	if (list_empty(head: conf->inactive_list + hash))
449	goto out;
450	first = (conf->inactive_list + hash)->next;
451	sh = list_entry(first, struct stripe_head, lru);
452	list_del_init(entry: first);
453	remove_hash(sh);
454	atomic_inc(v: &conf->active_stripes);
455	BUG_ON(hash != sh->hash_lock_index);
456	if (list_empty(head: conf->inactive_list + hash))
457	atomic_inc(v: &conf->empty_inactive_list_nr);
458	out:
459	return sh;
460	}
461
462	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
463	static void free_stripe_pages(struct stripe_head *sh)
464	{
465	int i;
466	struct page *p;
467
468	/ Have not allocate page pool /
469	if (!sh->pages)
470	return;
471
472	for (i = `0`; i < sh->nr_pages; i++) {
473	p = sh->pages[i];
474	if (p)
475	put_page(p);
476	sh->pages[i] = NULL;
477	}
478	}
479
480	static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp)
481	{
482	int i;
483	struct page *p;
484
485	for (i = `0`; i < sh->nr_pages; i++) {
486	/ The page have allocated. /
487	if (sh->pages[i])
488	continue;
489
490	p = alloc_page(gfp);
491	if (!p) {
492	free_stripe_pages(sh);
493	return -ENOMEM;
494	}
495	sh->pages[i] = p;
496	}
497	return `0`;
498	}
499
500	static int
501	init_stripe_shared_pages(struct stripe_head sh, struct* r5conf conf, int* disks)
502	{
503	int nr_pages, cnt;
504
505	if (sh->pages)
506	return `0`;
507
508	/ Each of the sh->dev[i] need one conf->stripe_size /
509	cnt = PAGE_SIZE / conf->stripe_size;
510	nr_pages = (disks + cnt - `1`) / cnt;
511
512	sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
513	if (!sh->pages)
514	return -ENOMEM;
515	sh->nr_pages = nr_pages;
516	sh->stripes_per_page = cnt;
517	return `0`;
518	}
519	#endif
520
521	static void shrink_buffers(struct stripe_head *sh)
522	{
523	int i;
524	int num = sh->raid_conf->pool_size;
525
526	#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
527	for (i = `0`; i < num ; i++) {
528	struct page *p;
529
530	WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
531	p = sh->dev[i].page;
532	if (!p)
533	continue;
534	sh->dev[i].page = NULL;
535	put_page(page: p);
536	}
537	#else
538	for (i = `0`; i < num; i++)
539	sh->dev[i].page = NULL;
540	free_stripe_pages(sh); / Free pages /
541	#endif
542	}
543
544	static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
545	{
546	int i;
547	int num = sh->raid_conf->pool_size;
548
549	#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
550	for (i = `0`; i < num; i++) {
551	struct page *page;
552
553	if (!(page = alloc_page(gfp))) {
554	return `1`;
555	}
556	sh->dev[i].page = page;
557	sh->dev[i].orig_page = page;
558	sh->dev[i].offset = `0`;
559	}
560	#else
561	if (alloc_stripe_pages(sh, gfp))
562	return -ENOMEM;
563
564	for (i = `0`; i < num; i++) {
565	sh->dev[i].page = raid5_get_dev_page(sh, i);
566	sh->dev[i].orig_page = sh->dev[i].page;
567	sh->dev[i].offset = raid5_get_page_offset(sh, i);
568	}
569	#endif
570	return `0`;
571	}
572
573	static void stripe_set_idx(sector_t stripe, struct r5conf conf, int* previous,
574	struct stripe_head *sh);
575
576	static void init_stripe(struct stripe_head sh, sector_t sector, int* previous)
577	{
578	struct r5conf *conf = sh->raid_conf;
579	int i, seq;
580
581	BUG_ON(atomic_read(&sh->count) != `0`);
582	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
583	BUG_ON(stripe_operations_active(sh));
584	BUG_ON(sh->batch_head);
585
586	pr_debug("init_stripe called, stripe %llu\n",
587	(unsigned long long)sector);
588	retry:
589	seq = read_seqcount_begin(&conf->gen_lock);
590	sh->generation = conf->generation - previous;
591	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
592	sh->sector = sector;
593	stripe_set_idx(stripe: sector, conf, previous, sh);
594	sh->state = `0`;
595
596	for (i = sh->disks; i--; ) {
597	struct r5dev *dev = &sh->dev[i];
598
599	if (dev->toread \|\| dev->read \|\| dev->towrite \|\| dev->written \|\|
600	test_bit(R5_LOCKED, &dev->flags)) {
601	pr_err("sector=%llx i=%d %p %p %p %p %d\n",
602	(unsigned long long)sh->sector, i, dev->toread,
603	dev->read, dev->towrite, dev->written,
604	test_bit(R5_LOCKED, &dev->flags));
605	WARN_ON(`1`);
606	}
607	dev->flags = `0`;
608	dev->sector = raid5_compute_blocknr(sh, i, previous);
609	}
610	if (read_seqcount_retry(&conf->gen_lock, seq))
611	goto retry;
612	sh->overwrite_disks = `0`;
613	insert_hash(conf, sh);
614	sh->cpu = smp_processor_id();
615	set_bit(nr: STRIPE_BATCH_READY, addr: &sh->state);
616	}
617
618	static struct stripe_head __find_stripe(struct* r5conf *conf, sector_t sector,
619	short generation)
620	{
621	struct stripe_head *sh;
622
623	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
624	hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
625	if (sh->sector == sector && sh->generation == generation)
626	return sh;
627	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
628	return NULL;
629	}
630
631	static struct stripe_head find_get_stripe(struct* r5conf *conf,
632	sector_t sector, short generation, int hash)
633	{
634	int inc_empty_inactive_list_flag;
635	struct stripe_head *sh;
636
637	sh = __find_stripe(conf, sector, generation);
638	if (!sh)
639	return NULL;
640
641	if (atomic_inc_not_zero(v: &sh->count))
642	return sh;
643
644	/*
645	* Slow path. The reference count is zero which means the stripe must
646	* be on a list (sh->lru). Must remove the stripe from the list that
647	* references it with the device_lock held.
648	*/
649
650	spin_lock(lock: &conf->device_lock);
651	if (!atomic_read(v: &sh->count)) {
652	if (!test_bit(STRIPE_HANDLE, &sh->state))
653	atomic_inc(v: &conf->active_stripes);
654	BUG_ON(list_empty(&sh->lru) &&
655	!test_bit(STRIPE_EXPANDING, &sh->state));
656	inc_empty_inactive_list_flag = `0`;
657	if (!list_empty(head: conf->inactive_list + hash))
658	inc_empty_inactive_list_flag = `1`;
659	list_del_init(entry: &sh->lru);
660	if (list_empty(head: conf->inactive_list + hash) &&
661	inc_empty_inactive_list_flag)
662	atomic_inc(v: &conf->empty_inactive_list_nr);
663	if (sh->group) {
664	sh->group->stripes_cnt--;
665	sh->group = NULL;
666	}
667	}
668	atomic_inc(v: &sh->count);
669	spin_unlock(lock: &conf->device_lock);
670
671	return sh;
672	}
673
674	/*
675	* Need to check if array has failed when deciding whether to:
676	* - start an array
677	* - remove non-faulty devices
678	* - add a spare
679	* - allow a reshape
680	* This determination is simple when no reshape is happening.
681	* However if there is a reshape, we need to carefully check
682	* both the before and after sections.
683	* This is because some failed devices may only affect one
684	* of the two sections, and some non-in_sync devices may
685	* be insync in the section most affected by failed devices.
686	*
687	* Most calls to this function hold &conf->device_lock. Calls
688	* in raid5_run() do not require the lock as no other threads
689	* have been started yet.
690	*/
691	int raid5_calc_degraded(struct r5conf *conf)
692	{
693	int degraded, degraded2;
694	int i;
695
696	degraded = `0`;
697	for (i = `0`; i < conf->previous_raid_disks; i++) {
698	struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
699
700	if (rdev && test_bit(Faulty, &rdev->flags))
701	rdev = READ_ONCE(conf->disks[i].replacement);
702	if (!rdev \|\| test_bit(Faulty, &rdev->flags))
703	degraded++;
704	else if (test_bit(In_sync, &rdev->flags))
705	;
706	else
707	/ not in-sync or faulty.*
708	* If the reshape increases the number of devices,
709	* this is being recovered by the reshape, so
710	* this 'previous' section is not in_sync.
711	* If the number of devices is being reduced however,
712	* the device can only be part of the array if
713	* we are reverting a reshape, so this section will
714	* be in-sync.
715	*/
716	if (conf->raid_disks >= conf->previous_raid_disks)
717	degraded++;
718	}
719	if (conf->raid_disks == conf->previous_raid_disks)
720	return degraded;
721	degraded2 = `0`;
722	for (i = `0`; i < conf->raid_disks; i++) {
723	struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
724
725	if (rdev && test_bit(Faulty, &rdev->flags))
726	rdev = READ_ONCE(conf->disks[i].replacement);
727	if (!rdev \|\| test_bit(Faulty, &rdev->flags))
728	degraded2++;
729	else if (test_bit(In_sync, &rdev->flags))
730	;
731	else
732	/ not in-sync or faulty.*
733	* If reshape increases the number of devices, this
734	* section has already been recovered, else it
735	* almost certainly hasn't.
736	*/
737	if (conf->raid_disks <= conf->previous_raid_disks)
738	degraded2++;
739	}
740	if (degraded2 > degraded)
741	return degraded2;
742	return degraded;
743	}
744
745	static bool has_failed(struct r5conf *conf)
746	{
747	int degraded = conf->mddev->degraded;
748
749	if (test_bit(MD_BROKEN, &conf->mddev->flags))
750	return true;
751
752	if (conf->mddev->reshape_position != MaxSector)
753	degraded = raid5_calc_degraded(conf);
754
755	return degraded > conf->max_degraded;
756	}
757
/* Result codes for handling one stripe of a request. */
enum stripe_result {
	STRIPE_SUCCESS = 0,
	STRIPE_RETRY,
	STRIPE_SCHEDULE_AND_RETRY,
	STRIPE_FAIL,
	STRIPE_WAIT_RESHAPE,
};
765
766	struct stripe_request_ctx {
767	/ a reference to the last stripe_head for batching /
768	struct stripe_head *batch_last;
769
770	/ first sector in the request /
771	sector_t first_sector;
772
773	/ last sector in the request /
774	sector_t last_sector;
775
776	/*
777	* bitmap to track stripe sectors that have been added to stripes
778	* add one to account for unaligned requests
779	*/
780	DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES + `1`);
781
782	/ the request had REQ_PREFLUSH, cleared after the first stripe_head /
783	bool do_flush;
784	};
785
786	/*
787	* Block until another thread clears R5_INACTIVE_BLOCKED or
788	* there are fewer than 3/4 the maximum number of active stripes
789	* and there is an inactive stripe available.
790	*/
791	static bool is_inactive_blocked(struct r5conf conf, int* hash)
792	{
793	if (list_empty(head: conf->inactive_list + hash))
794	return false;
795
796	if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
797	return true;
798
799	return (atomic_read(v: &conf->active_stripes) <
800	(conf->max_nr_stripes * `3` / `4`));
801	}
802
803	struct stripe_head raid5_get_active_stripe(struct* r5conf *conf,
804	struct stripe_request_ctx *ctx, sector_t sector,
805	unsigned int flags)
806	{
807	struct stripe_head *sh;
808	int hash = stripe_hash_locks_hash(conf, sect: sector);
809	int previous = !!(flags & R5_GAS_PREVIOUS);
810
811	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
812
813	spin_lock_irq(lock: conf->hash_locks + hash);
814
815	for (;;) {
816	if (!(flags & R5_GAS_NOQUIESCE) && conf->quiesce) {
817	/*
818	* Must release the reference to batch_last before
819	* waiting, on quiesce, otherwise the batch_last will
820	* hold a reference to a stripe and raid5_quiesce()
821	* will deadlock waiting for active_stripes to go to
822	* zero.
823	*/
824	if (ctx && ctx->batch_last) {
825	raid5_release_stripe(sh: ctx->batch_last);
826	ctx->batch_last = NULL;
827	}
828
829	wait_event_lock_irq(conf->wait_for_quiescent,
830	!conf->quiesce,
831	*(conf->hash_locks + hash));
832	}
833
834	sh = find_get_stripe(conf, sector, generation: conf->generation - previous,
835	hash);
836	if (sh)
837	break;
838
839	if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
840	sh = get_free_stripe(conf, hash);
841	if (sh) {
842	r5c_check_stripe_cache_usage(conf);
843	init_stripe(sh, sector, previous);
844	atomic_inc(v: &sh->count);
845	break;
846	}
847
848	if (!test_bit(R5_DID_ALLOC, &conf->cache_state))
849	set_bit(nr: R5_ALLOC_MORE, addr: &conf->cache_state);
850	}
851
852	if (flags & R5_GAS_NOBLOCK)
853	break;
854
855	set_bit(nr: R5_INACTIVE_BLOCKED, addr: &conf->cache_state);
856	r5l_wake_reclaim(log: conf->log, space: `0`);
857
858	/ release batch_last before wait to avoid risk of deadlock /
859	if (ctx && ctx->batch_last) {
860	raid5_release_stripe(sh: ctx->batch_last);
861	ctx->batch_last = NULL;
862	}
863
864	wait_event_lock_irq(conf->wait_for_stripe,
865	is_inactive_blocked(conf, hash),
866	*(conf->hash_locks + hash));
867	clear_bit(nr: R5_INACTIVE_BLOCKED, addr: &conf->cache_state);
868	}
869
870	spin_unlock_irq(lock: conf->hash_locks + hash);
871	return sh;
872	}
873
874	static bool is_full_stripe_write(struct stripe_head *sh)
875	{
876	BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
877	return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
878	}
879
880	static void lock_two_stripes(struct stripe_head sh1, struct* stripe_head *sh2)
881	__acquires(&sh1->stripe_lock)
882	__acquires(&sh2->stripe_lock)
883	{
884	if (sh1 > sh2) {
885	spin_lock_irq(lock: &sh2->stripe_lock);
886	spin_lock_nested(&sh1->stripe_lock, `1`);
887	} else {
888	spin_lock_irq(lock: &sh1->stripe_lock);
889	spin_lock_nested(&sh2->stripe_lock, `1`);
890	}
891	}
892
893	static void unlock_two_stripes(struct stripe_head sh1, struct* stripe_head *sh2)
894	__releases(&sh1->stripe_lock)
895	__releases(&sh2->stripe_lock)
896	{
897	spin_unlock(lock: &sh1->stripe_lock);
898	spin_unlock_irq(lock: &sh2->stripe_lock);
899	}
900
901	/ Only freshly new full stripe normal write stripe can be added to a batch list /
902	static bool stripe_can_batch(struct stripe_head *sh)
903	{
904	struct r5conf *conf = sh->raid_conf;
905
906	if (raid5_has_log(conf) \|\| raid5_has_ppl(conf))
907	return false;
908	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
909	is_full_stripe_write(sh);
910	}
911
912	/ we only do back search /
913	static void stripe_add_to_batch_list(struct r5conf *conf,
914	struct stripe_head sh, struct* stripe_head *last_sh)
915	{
916	struct stripe_head *head;
917	sector_t head_sector, tmp_sec;
918	int hash;
919	int dd_idx;
920
921	/ Don't cross chunks, so stripe pd_idx/qd_idx is the same /
922	tmp_sec = sh->sector;
923	if (!sector_div(tmp_sec, conf->chunk_sectors))
924	return;
925	head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
926
927	if (last_sh && head_sector == last_sh->sector) {
928	head = last_sh;
929	atomic_inc(v: &head->count);
930	} else {
931	hash = stripe_hash_locks_hash(conf, sect: head_sector);
932	spin_lock_irq(lock: conf->hash_locks + hash);
933	head = find_get_stripe(conf, sector: head_sector, generation: conf->generation,
934	hash);
935	spin_unlock_irq(lock: conf->hash_locks + hash);
936	if (!head)
937	return;
938	if (!stripe_can_batch(sh: head))
939	goto out;
940	}
941
942	lock_two_stripes(sh1: head, sh2: sh);
943	/ clear_batch_ready clear the flag /
944	if (!stripe_can_batch(sh: head) \|\| !stripe_can_batch(sh))
945	goto unlock_out;
946
947	if (sh->batch_head)
948	goto unlock_out;
949
950	dd_idx = `0`;
951	while (dd_idx == sh->pd_idx \|\| dd_idx == sh->qd_idx)
952	dd_idx++;
953	if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf \|\|
954	bio_op(bio: head->dev[dd_idx].towrite) != bio_op(bio: sh->dev[dd_idx].towrite))
955	goto unlock_out;
956
957	if (head->batch_head) {
958	spin_lock(lock: &head->batch_head->batch_lock);
959	/ This batch list is already running /
960	if (!stripe_can_batch(sh: head)) {
961	spin_unlock(lock: &head->batch_head->batch_lock);
962	goto unlock_out;
963	}
964	/*
965	* We must assign batch_head of this stripe within the
966	* batch_lock, otherwise clear_batch_ready of batch head
967	* stripe could clear BATCH_READY bit of this stripe and
968	* this stripe->batch_head doesn't get assigned, which
969	* could confuse clear_batch_ready for this stripe
970	*/
971	sh->batch_head = head->batch_head;
972
973	/*
974	* at this point, head's BATCH_READY could be cleared, but we
975	* can still add the stripe to batch list
976	*/
977	list_add(new: &sh->batch_list, head: &head->batch_list);
978	spin_unlock(lock: &head->batch_head->batch_lock);
979	} else {
980	head->batch_head = head;
981	sh->batch_head = head->batch_head;
982	spin_lock(lock: &head->batch_lock);
983	list_add_tail(new: &sh->batch_list, head: &head->batch_list);
984	spin_unlock(lock: &head->batch_lock);
985	}
986
987	if (test_and_clear_bit(nr: STRIPE_PREREAD_ACTIVE, addr: &sh->state))
988	if (atomic_dec_return(v: &conf->preread_active_stripes)
989	< IO_THRESHOLD)
990	md_wakeup_thread(conf->mddev->thread);
991
992	if (test_and_clear_bit(nr: STRIPE_BIT_DELAY, addr: &sh->state)) {
993	int seq = sh->bm_seq;
994	if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
995	sh->batch_head->bm_seq > seq)
996	seq = sh->batch_head->bm_seq;
997	set_bit(nr: STRIPE_BIT_DELAY, addr: &sh->batch_head->state);
998	sh->batch_head->bm_seq = seq;
999	}
1000
1001	atomic_inc(v: &sh->count);
1002	unlock_out:
1003	unlock_two_stripes(sh1: head, sh2: sh);
1004	out:
1005	raid5_release_stripe(sh: head);
1006	}
1007
1008	/ Determine if 'data_offset' or 'new_data_offset' should be used*
1009	* in this stripe_head.
1010	*/
1011	static int use_new_offset(struct r5conf conf, struct* stripe_head *sh)
1012	{
1013	sector_t progress = conf->reshape_progress;
1014	/ Need a memory barrier to make sure we see the value*
1015	* of conf->generation, or ->data_offset that was set before
1016	* reshape_progress was updated.
1017	*/
1018	smp_rmb();
1019	if (progress == MaxSector)
1020	return `0`;
1021	if (sh->generation == conf->generation - `1`)
1022	return `0`;
1023	/ We are in a reshape, and this is a new-generation stripe,*
1024	* so use new_data_offset.
1025	*/
1026	return `1`;
1027	}
1028
/*
 * Pop every bio off @tmp and submit it with submit_bio_noacct().
 * @tmp is empty on return.
 */
static void dispatch_bio_list(struct bio_list *tmp)
{
	struct bio *bio;

	while ((bio = bio_list_pop(tmp)))
		submit_bio_noacct(bio);
}
1036
1037	static int cmp_stripe(void priv, const* struct list_head *a,
1038	const struct list_head *b)
1039	{
1040	const struct r5pending_data *da = list_entry(a,
1041	struct r5pending_data, sibling);
1042	const struct r5pending_data *db = list_entry(b,
1043	struct r5pending_data, sibling);
1044	if (da->sector > db->sector)
1045	return `1`;
1046	if (da->sector < db->sector)
1047	return -`1`;
1048	return `0`;
1049	}
1050
1051	static void dispatch_defer_bios(struct r5conf conf, int* target,
1052	struct bio_list *list)
1053	{
1054	struct r5pending_data *data;
1055	struct list_head first, next = NULL;
1056	int cnt = `0`;
1057
1058	if (conf->pending_data_cnt == `0`)
1059	return;
1060
1061	list_sort(NULL, head: &conf->pending_list, cmp: cmp_stripe);
1062
1063	first = conf->pending_list.next;
1064
1065	/ temporarily move the head /
1066	if (conf->next_pending_data)
1067	list_move_tail(list: &conf->pending_list,
1068	head: &conf->next_pending_data->sibling);
1069
1070	while (!list_empty(head: &conf->pending_list)) {
1071	data = list_first_entry(&conf->pending_list,
1072	struct r5pending_data, sibling);
1073	if (&data->sibling == first)
1074	first = data->sibling.next;
1075	next = data->sibling.next;
1076
1077	bio_list_merge(bl: list, bl2: &data->bios);
1078	list_move(list: &data->sibling, head: &conf->free_list);
1079	cnt++;
1080	if (cnt >= target)
1081	break;
1082	}
1083	conf->pending_data_cnt -= cnt;
1084	BUG_ON(conf->pending_data_cnt < `0` \|\| cnt < target);
1085
1086	if (next != &conf->pending_list)
1087	conf->next_pending_data = list_entry(next,
1088	struct r5pending_data, sibling);
1089	else
1090	conf->next_pending_data = NULL;
1091	/ list isn't empty /
1092	if (first != &conf->pending_list)
1093	list_move_tail(list: &conf->pending_list, head: first);
1094	}
1095
1096	static void flush_deferred_bios(struct r5conf *conf)
1097	{
1098	struct bio_list tmp = BIO_EMPTY_LIST;
1099
1100	if (conf->pending_data_cnt == `0`)
1101	return;
1102
1103	spin_lock(lock: &conf->pending_bios_lock);
1104	dispatch_defer_bios(conf, target: conf->pending_data_cnt, list: &tmp);
1105	BUG_ON(conf->pending_data_cnt != `0`);
1106	spin_unlock(lock: &conf->pending_bios_lock);
1107
1108	dispatch_bio_list(tmp: &tmp);
1109	}
1110
1111	static void defer_issue_bios(struct r5conf *conf, sector_t sector,
1112	struct bio_list *bios)
1113	{
1114	struct bio_list tmp = BIO_EMPTY_LIST;
1115	struct r5pending_data *ent;
1116
1117	spin_lock(lock: &conf->pending_bios_lock);
1118	ent = list_first_entry(&conf->free_list, struct r5pending_data,
1119	sibling);
1120	list_move_tail(list: &ent->sibling, head: &conf->pending_list);
1121	ent->sector = sector;
1122	bio_list_init(bl: &ent->bios);
1123	bio_list_merge(bl: &ent->bios, bl2: bios);
1124	conf->pending_data_cnt++;
1125	if (conf->pending_data_cnt >= PENDING_IO_MAX)
1126	dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, list: &tmp);
1127
1128	spin_unlock(lock: &conf->pending_bios_lock);
1129
1130	dispatch_bio_list(tmp: &tmp);
1131	}
1132
/*
 * Forward declarations of the bio completion handlers that ops_run_io()
 * installs as ->bi_end_io.
 */
static void raid5_end_read_request(struct bio *bi);
static void raid5_end_write_request(struct bio *bi);
1137
1138	static void ops_run_io(struct stripe_head sh, struct* stripe_head_state *s)
1139	{
1140	struct r5conf *conf = sh->raid_conf;
1141	int i, disks = sh->disks;
1142	struct stripe_head *head_sh = sh;
1143	struct bio_list pending_bios = BIO_EMPTY_LIST;
1144	struct r5dev *dev;
1145	bool should_defer;
1146
1147	might_sleep();
1148
1149	if (log_stripe(sh, s) == `0`)
1150	return;
1151
1152	should_defer = conf->batch_bio_dispatch && conf->group_cnt;
1153
1154	for (i = disks; i--; ) {
1155	enum req_op op;
1156	blk_opf_t op_flags = `0`;
1157	int replace_only = `0`;
1158	struct bio bi, rbi;
1159	struct md_rdev rdev, rrdev = NULL;
1160
1161	sh = head_sh;
1162	if (test_and_clear_bit(nr: R5_Wantwrite, addr: &sh->dev[i].flags)) {
1163	op = REQ_OP_WRITE;
1164	if (test_and_clear_bit(nr: R5_WantFUA, addr: &sh->dev[i].flags))
1165	op_flags = REQ_FUA;
1166	if (test_bit(R5_Discard, &sh->dev[i].flags))
1167	op = REQ_OP_DISCARD;
1168	} else if (test_and_clear_bit(nr: R5_Wantread, addr: &sh->dev[i].flags))
1169	op = REQ_OP_READ;
1170	else if (test_and_clear_bit(nr: R5_WantReplace,
1171	addr: &sh->dev[i].flags)) {
1172	op = REQ_OP_WRITE;
1173	replace_only = `1`;
1174	} else
1175	continue;
1176	if (test_and_clear_bit(nr: R5_SyncIO, addr: &sh->dev[i].flags))
1177	op_flags \|= REQ_SYNC;
1178
1179	again:
1180	dev = &sh->dev[i];
1181	bi = &dev->req;
1182	rbi = &dev->rreq; / For writing to replacement /
1183
1184	rdev = conf->disks[i].rdev;
1185	rrdev = conf->disks[i].replacement;
1186	if (op_is_write(op)) {
1187	if (replace_only)
1188	rdev = NULL;
1189	if (rdev == rrdev)
1190	/ We raced and saw duplicates /
1191	rrdev = NULL;
1192	} else {
1193	if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
1194	rdev = rrdev;
1195	rrdev = NULL;
1196	}
1197
1198	if (rdev && test_bit(Faulty, &rdev->flags))
1199	rdev = NULL;
1200	if (rdev)
1201	atomic_inc(v: &rdev->nr_pending);
1202	if (rrdev && test_bit(Faulty, &rrdev->flags))
1203	rrdev = NULL;
1204	if (rrdev)
1205	atomic_inc(v: &rrdev->nr_pending);
1206
1207	/ We have already checked bad blocks for reads. Now*
1208	* need to check for writes. We never accept write errors
1209	* on the replacement, so we don't to check rrdev.
1210	*/
1211	while (op_is_write(op) && rdev &&
1212	test_bit(WriteErrorSeen, &rdev->flags)) {
1213	int bad = rdev_has_badblock(rdev, s: sh->sector,
1214	RAID5_STRIPE_SECTORS(conf));
1215	if (!bad)
1216	break;
1217
1218	if (bad < `0`) {
1219	set_bit(nr: BlockedBadBlocks, addr: &rdev->flags);
1220	if (!conf->mddev->external &&
1221	conf->mddev->sb_flags) {
1222	/ It is very unlikely, but we might*
1223	* still need to write out the
1224	* bad block log - better give it
1225	* a chance*/
1226	md_check_recovery(mddev: conf->mddev);
1227	}
1228	/*
1229	* Because md_wait_for_blocked_rdev
1230	* will dec nr_pending, we must
1231	* increment it first.
1232	*/
1233	atomic_inc(v: &rdev->nr_pending);
1234	md_wait_for_blocked_rdev(rdev, mddev: conf->mddev);
1235	} else {
1236	/ Acknowledged bad block - skip the write /
1237	rdev_dec_pending(rdev, mddev: conf->mddev);
1238	rdev = NULL;
1239	}
1240	}
1241
1242	if (rdev) {
1243	set_bit(nr: STRIPE_IO_STARTED, addr: &sh->state);
1244
1245	bio_init(bio: bi, bdev: rdev->bdev, table: &dev->vec, max_vecs: `1`, opf: op \| op_flags);
1246	bi->bi_end_io = op_is_write(op)
1247	? raid5_end_write_request
1248	: raid5_end_read_request;
1249	bi->bi_private = sh;
1250
1251	pr_debug("%s: for %llu schedule op %d on disc %d\n",
1252	__func__, (unsigned long long)sh->sector,
1253	bi->bi_opf, i);
1254	atomic_inc(v: &sh->count);
1255	if (sh != head_sh)
1256	atomic_inc(v: &head_sh->count);
1257	if (use_new_offset(conf, sh))
1258	bi->bi_iter.bi_sector = (sh->sector
1259	+ rdev->new_data_offset);
1260	else
1261	bi->bi_iter.bi_sector = (sh->sector
1262	+ rdev->data_offset);
1263	if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
1264	bi->bi_opf \|= REQ_NOMERGE;
1265
1266	if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1267	WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1268
1269	if (!op_is_write(op) &&
1270	test_bit(R5_InJournal, &sh->dev[i].flags))
1271	/*
1272	* issuing read for a page in journal, this
1273	* must be preparing for prexor in rmw; read
1274	* the data into orig_page
1275	*/
1276	sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
1277	else
1278	sh->dev[i].vec.bv_page = sh->dev[i].page;
1279	bi->bi_vcnt = `1`;
1280	bi->bi_io_vec[`0`].bv_len = RAID5_STRIPE_SIZE(conf);
1281	bi->bi_io_vec[`0`].bv_offset = sh->dev[i].offset;
1282	bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
1283	/*
1284	* If this is discard request, set bi_vcnt 0. We don't
1285	* want to confuse SCSI because SCSI will replace payload
1286	*/
1287	if (op == REQ_OP_DISCARD)
1288	bi->bi_vcnt = `0`;
1289	if (rrdev)
1290	set_bit(nr: R5_DOUBLE_LOCKED, addr: &sh->dev[i].flags);
1291
1292	mddev_trace_remap(mddev: conf->mddev, bio: bi, sector: sh->dev[i].sector);
1293	if (should_defer && op_is_write(op))
1294	bio_list_add(bl: &pending_bios, bio: bi);
1295	else
1296	submit_bio_noacct(bio: bi);
1297	}
1298	if (rrdev) {
1299	set_bit(nr: STRIPE_IO_STARTED, addr: &sh->state);
1300
1301	bio_init(bio: rbi, bdev: rrdev->bdev, table: &dev->rvec, max_vecs: `1`, opf: op \| op_flags);
1302	BUG_ON(!op_is_write(op));
1303	rbi->bi_end_io = raid5_end_write_request;
1304	rbi->bi_private = sh;
1305
1306	pr_debug("%s: for %llu schedule op %d on "
1307	"replacement disc %d\n",
1308	__func__, (unsigned long long)sh->sector,
1309	rbi->bi_opf, i);
1310	atomic_inc(v: &sh->count);
1311	if (sh != head_sh)
1312	atomic_inc(v: &head_sh->count);
1313	if (use_new_offset(conf, sh))
1314	rbi->bi_iter.bi_sector = (sh->sector
1315	+ rrdev->new_data_offset);
1316	else
1317	rbi->bi_iter.bi_sector = (sh->sector
1318	+ rrdev->data_offset);
1319	if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1320	WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1321	sh->dev[i].rvec.bv_page = sh->dev[i].page;
1322	rbi->bi_vcnt = `1`;
1323	rbi->bi_io_vec[`0`].bv_len = RAID5_STRIPE_SIZE(conf);
1324	rbi->bi_io_vec[`0`].bv_offset = sh->dev[i].offset;
1325	rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
1326	/*
1327	* If this is discard request, set bi_vcnt 0. We don't
1328	* want to confuse SCSI because SCSI will replace payload
1329	*/
1330	if (op == REQ_OP_DISCARD)
1331	rbi->bi_vcnt = `0`;
1332	mddev_trace_remap(mddev: conf->mddev, bio: rbi, sector: sh->dev[i].sector);
1333	if (should_defer && op_is_write(op))
1334	bio_list_add(bl: &pending_bios, bio: rbi);
1335	else
1336	submit_bio_noacct(bio: rbi);
1337	}
1338	if (!rdev && !rrdev) {
1339	pr_debug("skip op %d on disc %d for sector %llu\n",
1340	bi->bi_opf, i, (unsigned long long)sh->sector);
1341	clear_bit(nr: R5_LOCKED, addr: &sh->dev[i].flags);
1342	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
1343	}
1344
1345	if (!head_sh->batch_head)
1346	continue;
1347	sh = list_first_entry(&sh->batch_list, struct stripe_head,
1348	batch_list);
1349	if (sh != head_sh)
1350	goto again;
1351	}
1352
1353	if (should_defer && !bio_list_empty(bl: &pending_bios))
1354	defer_issue_bios(conf, sector: head_sh->sector, bios: &pending_bios);
1355	}
1356
1357	static struct dma_async_tx_descriptor *
1358	async_copy_data(int frombio, struct bio bio, struct* page **page,
1359	unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx,
1360	struct stripe_head sh, int* no_skipcopy)
1361	{
1362	struct bio_vec bvl;
1363	struct bvec_iter iter;
1364	struct page *bio_page;
1365	int page_offset;
1366	struct async_submit_ctl submit;
1367	enum async_tx_flags flags = `0`;
1368	struct r5conf *conf = sh->raid_conf;
1369
1370	if (bio->bi_iter.bi_sector >= sector)
1371	page_offset = (signed)(bio->bi_iter.bi_sector - sector) * `512`;
1372	else
1373	page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -`512`;
1374
1375	if (frombio)
1376	flags \|= ASYNC_TX_FENCE;
1377	init_async_submit(args: &submit, flags, tx, NULL, NULL, NULL);
1378
1379	bio_for_each_segment(bvl, bio, iter) {
1380	int len = bvl.bv_len;
1381	int clen;
1382	int b_offset = `0`;
1383
1384	if (page_offset < `0`) {
1385	b_offset = -page_offset;
1386	page_offset += b_offset;
1387	len -= b_offset;
1388	}
1389
1390	if (len > `0` && page_offset + len > RAID5_STRIPE_SIZE(conf))
1391	clen = RAID5_STRIPE_SIZE(conf) - page_offset;
1392	else
1393	clen = len;
1394
1395	if (clen > `0`) {
1396	b_offset += bvl.bv_offset;
1397	bio_page = bvl.bv_page;
1398	if (frombio) {
1399	if (conf->skip_copy &&
1400	b_offset == `0` && page_offset == `0` &&
1401	clen == RAID5_STRIPE_SIZE(conf) &&
1402	!no_skipcopy)
1403	*page = bio_page;
1404	else
1405	tx = async_memcpy(dest: *page, src: bio_page, dest_offset: page_offset + poff,
1406	src_offset: b_offset, len: clen, submit: &submit);
1407	} else
1408	tx = async_memcpy(dest: bio_page, src: *page, dest_offset: b_offset,
1409	src_offset: page_offset + poff, len: clen, submit: &submit);
1410	}
1411	/ chain the operations /
1412	submit.depend_tx = tx;
1413
1414	if (clen < len) / hit end of page /
1415	break;
1416	page_offset += len;
1417	}
1418
1419	return tx;
1420	}
1421
1422	static void ops_complete_biofill(void *stripe_head_ref)
1423	{
1424	struct stripe_head *sh = stripe_head_ref;
1425	int i;
1426	struct r5conf *conf = sh->raid_conf;
1427
1428	pr_debug("%s: stripe %llu\n", __func__,
1429	(unsigned long long)sh->sector);
1430
1431	/ clear completed biofills /
1432	for (i = sh->disks; i--; ) {
1433	struct r5dev *dev = &sh->dev[i];
1434
1435	/ acknowledge completion of a biofill operation /
1436	/ and check if we need to reply to a read request,*
1437	* new R5_Wantfill requests are held off until
1438	* !STRIPE_BIOFILL_RUN
1439	*/
1440	if (test_and_clear_bit(nr: R5_Wantfill, addr: &dev->flags)) {
1441	struct bio rbi, rbi2;
1442
1443	BUG_ON(!dev->read);
1444	rbi = dev->read;
1445	dev->read = NULL;
1446	while (rbi && rbi->bi_iter.bi_sector <
1447	dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1448	rbi2 = r5_next_bio(conf, bio: rbi, sector: dev->sector);
1449	bio_endio(rbi);
1450	rbi = rbi2;
1451	}
1452	}
1453	}
1454	clear_bit(nr: STRIPE_BIOFILL_RUN, addr: &sh->state);
1455
1456	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
1457	raid5_release_stripe(sh);
1458	}
1459
1460	static void ops_run_biofill(struct stripe_head *sh)
1461	{
1462	struct dma_async_tx_descriptor *tx = NULL;
1463	struct async_submit_ctl submit;
1464	int i;
1465	struct r5conf *conf = sh->raid_conf;
1466
1467	BUG_ON(sh->batch_head);
1468	pr_debug("%s: stripe %llu\n", __func__,
1469	(unsigned long long)sh->sector);
1470
1471	for (i = sh->disks; i--; ) {
1472	struct r5dev *dev = &sh->dev[i];
1473	if (test_bit(R5_Wantfill, &dev->flags)) {
1474	struct bio *rbi;
1475	spin_lock_irq(lock: &sh->stripe_lock);
1476	dev->read = rbi = dev->toread;
1477	dev->toread = NULL;
1478	spin_unlock_irq(lock: &sh->stripe_lock);
1479	while (rbi && rbi->bi_iter.bi_sector <
1480	dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1481	tx = async_copy_data(frombio: `0`, bio: rbi, page: &dev->page,
1482	poff: dev->offset,
1483	sector: dev->sector, tx, sh, no_skipcopy: `0`);
1484	rbi = r5_next_bio(conf, bio: rbi, sector: dev->sector);
1485	}
1486	}
1487	}
1488
1489	atomic_inc(v: &sh->count);
1490	init_async_submit(args: &submit, flags: ASYNC_TX_ACK, tx, cb_fn: ops_complete_biofill, cb_param: sh, NULL);
1491	async_trigger_callback(submit: &submit);
1492	}
1493
1494	static void mark_target_uptodate(struct stripe_head sh, int* target)
1495	{
1496	struct r5dev *tgt;
1497
1498	if (target < `0`)
1499	return;
1500
1501	tgt = &sh->dev[target];
1502	set_bit(nr: R5_UPTODATE, addr: &tgt->flags);
1503	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1504	clear_bit(nr: R5_Wantcompute, addr: &tgt->flags);
1505	}
1506
1507	static void ops_complete_compute(void *stripe_head_ref)
1508	{
1509	struct stripe_head *sh = stripe_head_ref;
1510
1511	pr_debug("%s: stripe %llu\n", __func__,
1512	(unsigned long long)sh->sector);
1513
1514	/ mark the computed target(s) as uptodate /
1515	mark_target_uptodate(sh, target: sh->ops.target);
1516	mark_target_uptodate(sh, target: sh->ops.target2);
1517
1518	clear_bit(nr: STRIPE_COMPUTE_RUN, addr: &sh->state);
1519	if (sh->check_state == check_state_compute_run)
1520	sh->check_state = check_state_compute_result;
1521	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
1522	raid5_release_stripe(sh);
1523	}
1524
1525	/ return a pointer to the address conversion region of the scribble buffer /
1526	static struct page to_addr_page(struct** raid5_percpu percpu, int* i)
1527	{
1528	return percpu->scribble + i * percpu->scribble_obj_size;
1529	}
1530
1531	/ return a pointer to the address conversion region of the scribble buffer /
1532	static addr_conv_t to_addr_conv(struct* stripe_head *sh,
1533	struct raid5_percpu percpu, int* i)
1534	{
1535	return (void *) (to_addr_page(percpu, i) + sh->disks + `2`);
1536	}
1537
1538	/*
1539	* Return a pointer to record offset address.
1540	*/
1541	static unsigned int *
1542	to_addr_offs(struct stripe_head sh, struct* raid5_percpu *percpu)
1543	{
1544	return (unsigned int *) (to_addr_conv(sh, percpu, i: `0`) + sh->disks + `2`);
1545	}
1546
1547	static struct dma_async_tx_descriptor *
1548	ops_run_compute5(struct stripe_head sh, struct* raid5_percpu *percpu)
1549	{
1550	int disks = sh->disks;
1551	struct page **xor_srcs = to_addr_page(percpu, i: `0`);
1552	unsigned int *off_srcs = to_addr_offs(sh, percpu);
1553	int target = sh->ops.target;
1554	struct r5dev *tgt = &sh->dev[target];
1555	struct page *xor_dest = tgt->page;
1556	unsigned int off_dest = tgt->offset;
1557	int count = `0`;
1558	struct dma_async_tx_descriptor *tx;
1559	struct async_submit_ctl submit;
1560	int i;
1561
1562	BUG_ON(sh->batch_head);
1563
1564	pr_debug("%s: stripe %llu block: %d\n",
1565	__func__, (unsigned long long)sh->sector, target);
1566	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1567
1568	for (i = disks; i--; ) {
1569	if (i != target) {
1570	off_srcs[count] = sh->dev[i].offset;
1571	xor_srcs[count++] = sh->dev[i].page;
1572	}
1573	}
1574
1575	atomic_inc(v: &sh->count);
1576
1577	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE\|ASYNC_TX_XOR_ZERO_DST, NULL,
1578	cb_fn: ops_complete_compute, cb_param: sh, scribble: to_addr_conv(sh, percpu, i: `0`));
1579	if (unlikely(count == `1`))
1580	tx = async_memcpy(dest: xor_dest, src: xor_srcs[`0`], dest_offset: off_dest, src_offset: off_srcs[`0`],
1581	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
1582	else
1583	tx = async_xor_offs(dest: xor_dest, offset: off_dest, src_list: xor_srcs, src_offset: off_srcs, src_cnt: count,
1584	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
1585
1586	return tx;
1587	}
1588
1589	/ set_syndrome_sources - populate source buffers for gen_syndrome*
1590	* @srcs - (struct page *) array of size sh->disks
1591	* @offs - (unsigned int) array of offset for each page
1592	* @sh - stripe_head to parse
1593	*
1594	* Populates srcs in proper layout order for the stripe and returns the
1595	* 'count' of sources to be used in a call to async_gen_syndrome. The P
1596	* destination buffer is recorded in srcs[count] and the Q destination
1597	* is recorded in srcs[count+1]].
1598	*/
1599	static int set_syndrome_sources(struct page **srcs,
1600	unsigned int *offs,
1601	struct stripe_head *sh,
1602	int srctype)
1603	{
1604	int disks = sh->disks;
1605	int syndrome_disks = sh->ddf_layout ? disks : (disks - `2`);
1606	int d0_idx = raid6_d0(sh);
1607	int count;
1608	int i;
1609
1610	for (i = `0`; i < disks; i++)
1611	srcs[i] = NULL;
1612
1613	count = `0`;
1614	i = d0_idx;
1615	do {
1616	int slot = raid6_idx_to_slot(idx: i, sh, count: &count, syndrome_disks);
1617	struct r5dev *dev = &sh->dev[i];
1618
1619	if (i == sh->qd_idx \|\| i == sh->pd_idx \|\|
1620	(srctype == SYNDROME_SRC_ALL) \|\|
1621	(srctype == SYNDROME_SRC_WANT_DRAIN &&
1622	(test_bit(R5_Wantdrain, &dev->flags) \|\|
1623	test_bit(R5_InJournal, &dev->flags))) \|\|
1624	(srctype == SYNDROME_SRC_WRITTEN &&
1625	(dev->written \|\|
1626	test_bit(R5_InJournal, &dev->flags)))) {
1627	if (test_bit(R5_InJournal, &dev->flags))
1628	srcs[slot] = sh->dev[i].orig_page;
1629	else
1630	srcs[slot] = sh->dev[i].page;
1631	/*
1632	* For R5_InJournal, PAGE_SIZE must be 4KB and will
1633	* not shared page. In that case, dev[i].offset
1634	* is 0.
1635	*/
1636	offs[slot] = sh->dev[i].offset;
1637	}
1638	i = raid6_next_disk(disk: i, raid_disks: disks);
1639	} while (i != d0_idx);
1640
1641	return syndrome_disks;
1642	}
1643
1644	static struct dma_async_tx_descriptor *
1645	ops_run_compute6_1(struct stripe_head sh, struct* raid5_percpu *percpu)
1646	{
1647	int disks = sh->disks;
1648	struct page **blocks = to_addr_page(percpu, i: `0`);
1649	unsigned int *offs = to_addr_offs(sh, percpu);
1650	int target;
1651	int qd_idx = sh->qd_idx;
1652	struct dma_async_tx_descriptor *tx;
1653	struct async_submit_ctl submit;
1654	struct r5dev *tgt;
1655	struct page *dest;
1656	unsigned int dest_off;
1657	int i;
1658	int count;
1659
1660	BUG_ON(sh->batch_head);
1661	if (sh->ops.target < `0`)
1662	target = sh->ops.target2;
1663	else if (sh->ops.target2 < `0`)
1664	target = sh->ops.target;
1665	else
1666	/ we should only have one valid target /
1667	BUG();
1668	BUG_ON(target < `0`);
1669	pr_debug("%s: stripe %llu block: %d\n",
1670	__func__, (unsigned long long)sh->sector, target);
1671
1672	tgt = &sh->dev[target];
1673	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1674	dest = tgt->page;
1675	dest_off = tgt->offset;
1676
1677	atomic_inc(v: &sh->count);
1678
1679	if (target == qd_idx) {
1680	count = set_syndrome_sources(srcs: blocks, offs, sh, srctype: SYNDROME_SRC_ALL);
1681	blocks[count] = NULL; / regenerating p is not necessary /
1682	BUG_ON(blocks[count+`1`] != dest); / q should already be set /
1683	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE, NULL,
1684	cb_fn: ops_complete_compute, cb_param: sh,
1685	scribble: to_addr_conv(sh, percpu, i: `0`));
1686	tx = async_gen_syndrome(blocks, offsets: offs, src_cnt: count+`2`,
1687	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
1688	} else {
1689	/ Compute any data- or p-drive using XOR /
1690	count = `0`;
1691	for (i = disks; i-- ; ) {
1692	if (i == target \|\| i == qd_idx)
1693	continue;
1694	offs[count] = sh->dev[i].offset;
1695	blocks[count++] = sh->dev[i].page;
1696	}
1697
1698	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE\|ASYNC_TX_XOR_ZERO_DST,
1699	NULL, cb_fn: ops_complete_compute, cb_param: sh,
1700	scribble: to_addr_conv(sh, percpu, i: `0`));
1701	tx = async_xor_offs(dest, offset: dest_off, src_list: blocks, src_offset: offs, src_cnt: count,
1702	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
1703	}
1704
1705	return tx;
1706	}
1707
1708	static struct dma_async_tx_descriptor *
1709	ops_run_compute6_2(struct stripe_head sh, struct* raid5_percpu *percpu)
1710	{
1711	int i, count, disks = sh->disks;
1712	int syndrome_disks = sh->ddf_layout ? disks : disks-`2`;
1713	int d0_idx = raid6_d0(sh);
1714	int faila = -`1`, failb = -`1`;
1715	int target = sh->ops.target;
1716	int target2 = sh->ops.target2;
1717	struct r5dev *tgt = &sh->dev[target];
1718	struct r5dev *tgt2 = &sh->dev[target2];
1719	struct dma_async_tx_descriptor *tx;
1720	struct page **blocks = to_addr_page(percpu, i: `0`);
1721	unsigned int *offs = to_addr_offs(sh, percpu);
1722	struct async_submit_ctl submit;
1723
1724	BUG_ON(sh->batch_head);
1725	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
1726	__func__, (unsigned long long)sh->sector, target, target2);
1727	BUG_ON(target < `0` \|\| target2 < `0`);
1728	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1729	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
1730
1731	/ we need to open-code set_syndrome_sources to handle the*
1732	* slot number conversion for 'faila' and 'failb'
1733	*/
1734	for (i = `0`; i < disks ; i++) {
1735	offs[i] = `0`;
1736	blocks[i] = NULL;
1737	}
1738	count = `0`;
1739	i = d0_idx;
1740	do {
1741	int slot = raid6_idx_to_slot(idx: i, sh, count: &count, syndrome_disks);
1742
1743	offs[slot] = sh->dev[i].offset;
1744	blocks[slot] = sh->dev[i].page;
1745
1746	if (i == target)
1747	faila = slot;
1748	if (i == target2)
1749	failb = slot;
1750	i = raid6_next_disk(disk: i, raid_disks: disks);
1751	} while (i != d0_idx);
1752
1753	BUG_ON(faila == failb);
1754	if (failb < faila)
1755	swap(faila, failb);
1756	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
1757	__func__, (unsigned long long)sh->sector, faila, failb);
1758
1759	atomic_inc(v: &sh->count);
1760
1761	if (failb == syndrome_disks+`1`) {
1762	/ Q disk is one of the missing disks /
1763	if (faila == syndrome_disks) {
1764	/ Missing P+Q, just recompute /
1765	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE, NULL,
1766	cb_fn: ops_complete_compute, cb_param: sh,
1767	scribble: to_addr_conv(sh, percpu, i: `0`));
1768	return async_gen_syndrome(blocks, offsets: offs, src_cnt: syndrome_disks+`2`,
1769	RAID5_STRIPE_SIZE(sh->raid_conf),
1770	submit: &submit);
1771	} else {
1772	struct page *dest;
1773	unsigned int dest_off;
1774	int data_target;
1775	int qd_idx = sh->qd_idx;
1776
1777	/ Missing D+Q: recompute D from P, then recompute Q /
1778	if (target == qd_idx)
1779	data_target = target2;
1780	else
1781	data_target = target;
1782
1783	count = `0`;
1784	for (i = disks; i-- ; ) {
1785	if (i == data_target \|\| i == qd_idx)
1786	continue;
1787	offs[count] = sh->dev[i].offset;
1788	blocks[count++] = sh->dev[i].page;
1789	}
1790	dest = sh->dev[data_target].page;
1791	dest_off = sh->dev[data_target].offset;
1792	init_async_submit(args: &submit,
1793	flags: ASYNC_TX_FENCE\|ASYNC_TX_XOR_ZERO_DST,
1794	NULL, NULL, NULL,
1795	scribble: to_addr_conv(sh, percpu, i: `0`));
1796	tx = async_xor_offs(dest, offset: dest_off, src_list: blocks, src_offset: offs, src_cnt: count,
1797	RAID5_STRIPE_SIZE(sh->raid_conf),
1798	submit: &submit);
1799
1800	count = set_syndrome_sources(srcs: blocks, offs, sh, srctype: SYNDROME_SRC_ALL);
1801	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE, tx,
1802	cb_fn: ops_complete_compute, cb_param: sh,
1803	scribble: to_addr_conv(sh, percpu, i: `0`));
1804	return async_gen_syndrome(blocks, offsets: offs, src_cnt: count+`2`,
1805	RAID5_STRIPE_SIZE(sh->raid_conf),
1806	submit: &submit);
1807	}
1808	} else {
1809	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE, NULL,
1810	cb_fn: ops_complete_compute, cb_param: sh,
1811	scribble: to_addr_conv(sh, percpu, i: `0`));
1812	if (failb == syndrome_disks) {
1813	/ We're missing D+P. /
1814	return async_raid6_datap_recov(src_num: syndrome_disks+`2`,
1815	RAID5_STRIPE_SIZE(sh->raid_conf),
1816	faila,
1817	ptrs: blocks, offs, submit: &submit);
1818	} else {
1819	/ We're missing D+D. /
1820	return async_raid6_2data_recov(src_num: syndrome_disks+`2`,
1821	RAID5_STRIPE_SIZE(sh->raid_conf),
1822	faila, failb,
1823	ptrs: blocks, offs, submit: &submit);
1824	}
1825	}
1826	}
1827
1828	static void ops_complete_prexor(void *stripe_head_ref)
1829	{
1830	struct stripe_head *sh = stripe_head_ref;
1831
1832	pr_debug("%s: stripe %llu\n", __func__,
1833	(unsigned long long)sh->sector);
1834
1835	if (r5c_is_writeback(log: sh->raid_conf->log))
1836	/*
1837	* raid5-cache write back uses orig_page during prexor.
1838	* After prexor, it is time to free orig_page
1839	*/
1840	r5c_release_extra_page(sh);
1841	}
1842
1843	static struct dma_async_tx_descriptor *
1844	ops_run_prexor5(struct stripe_head sh, struct* raid5_percpu *percpu,
1845	struct dma_async_tx_descriptor *tx)
1846	{
1847	int disks = sh->disks;
1848	struct page **xor_srcs = to_addr_page(percpu, i: `0`);
1849	unsigned int *off_srcs = to_addr_offs(sh, percpu);
1850	int count = `0`, pd_idx = sh->pd_idx, i;
1851	struct async_submit_ctl submit;
1852
1853	/ existing parity data subtracted /
1854	unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
1855	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1856
1857	BUG_ON(sh->batch_head);
1858	pr_debug("%s: stripe %llu\n", __func__,
1859	(unsigned long long)sh->sector);
1860
1861	for (i = disks; i--; ) {
1862	struct r5dev *dev = &sh->dev[i];
1863	/ Only process blocks that are known to be uptodate /
1864	if (test_bit(R5_InJournal, &dev->flags)) {
1865	/*
1866	* For this case, PAGE_SIZE must be equal to 4KB and
1867	* page offset is zero.
1868	*/
1869	off_srcs[count] = dev->offset;
1870	xor_srcs[count++] = dev->orig_page;
1871	} else if (test_bit(R5_Wantdrain, &dev->flags)) {
1872	off_srcs[count] = dev->offset;
1873	xor_srcs[count++] = dev->page;
1874	}
1875	}
1876
1877	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE\|ASYNC_TX_XOR_DROP_DST, tx,
1878	cb_fn: ops_complete_prexor, cb_param: sh, scribble: to_addr_conv(sh, percpu, i: `0`));
1879	tx = async_xor_offs(dest: xor_dest, offset: off_dest, src_list: xor_srcs, src_offset: off_srcs, src_cnt: count,
1880	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
1881
1882	return tx;
1883	}
1884
1885	static struct dma_async_tx_descriptor *
1886	ops_run_prexor6(struct stripe_head sh, struct* raid5_percpu *percpu,
1887	struct dma_async_tx_descriptor *tx)
1888	{
1889	struct page **blocks = to_addr_page(percpu, i: `0`);
1890	unsigned int *offs = to_addr_offs(sh, percpu);
1891	int count;
1892	struct async_submit_ctl submit;
1893
1894	pr_debug("%s: stripe %llu\n", __func__,
1895	(unsigned long long)sh->sector);
1896
1897	count = set_syndrome_sources(srcs: blocks, offs, sh, srctype: SYNDROME_SRC_WANT_DRAIN);
1898
1899	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE\|ASYNC_TX_PQ_XOR_DST, tx,
1900	cb_fn: ops_complete_prexor, cb_param: sh, scribble: to_addr_conv(sh, percpu, i: `0`));
1901	tx = async_gen_syndrome(blocks, offsets: offs, src_cnt: count+`2`,
1902	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
1903
1904	return tx;
1905	}
1906
1907	static struct dma_async_tx_descriptor *
1908	ops_run_biodrain(struct stripe_head sh, struct* dma_async_tx_descriptor *tx)
1909	{
1910	struct r5conf *conf = sh->raid_conf;
1911	int disks = sh->disks;
1912	int i;
1913	struct stripe_head *head_sh = sh;
1914
1915	pr_debug("%s: stripe %llu\n", __func__,
1916	(unsigned long long)sh->sector);
1917
1918	for (i = disks; i--; ) {
1919	struct r5dev *dev;
1920	struct bio *chosen;
1921
1922	sh = head_sh;
1923	if (test_and_clear_bit(nr: R5_Wantdrain, addr: &head_sh->dev[i].flags)) {
1924	struct bio *wbi;
1925
1926	again:
1927	dev = &sh->dev[i];
1928	/*
1929	* clear R5_InJournal, so when rewriting a page in
1930	* journal, it is not skipped by r5l_log_stripe()
1931	*/
1932	clear_bit(nr: R5_InJournal, addr: &dev->flags);
1933	spin_lock_irq(lock: &sh->stripe_lock);
1934	chosen = dev->towrite;
1935	dev->towrite = NULL;
1936	sh->overwrite_disks = `0`;
1937	BUG_ON(dev->written);
1938	wbi = dev->written = chosen;
1939	spin_unlock_irq(lock: &sh->stripe_lock);
1940	WARN_ON(dev->page != dev->orig_page);
1941
1942	while (wbi && wbi->bi_iter.bi_sector <
1943	dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1944	if (wbi->bi_opf & REQ_FUA)
1945	set_bit(nr: R5_WantFUA, addr: &dev->flags);
1946	if (wbi->bi_opf & REQ_SYNC)
1947	set_bit(nr: R5_SyncIO, addr: &dev->flags);
1948	if (bio_op(bio: wbi) == REQ_OP_DISCARD)
1949	set_bit(nr: R5_Discard, addr: &dev->flags);
1950	else {
1951	tx = async_copy_data(frombio: `1`, bio: wbi, page: &dev->page,
1952	poff: dev->offset,
1953	sector: dev->sector, tx, sh,
1954	no_skipcopy: r5c_is_writeback(log: conf->log));
1955	if (dev->page != dev->orig_page &&
1956	!r5c_is_writeback(log: conf->log)) {
1957	set_bit(nr: R5_SkipCopy, addr: &dev->flags);
1958	clear_bit(nr: R5_UPTODATE, addr: &dev->flags);
1959	clear_bit(nr: R5_OVERWRITE, addr: &dev->flags);
1960	}
1961	}
1962	wbi = r5_next_bio(conf, bio: wbi, sector: dev->sector);
1963	}
1964
1965	if (head_sh->batch_head) {
1966	sh = list_first_entry(&sh->batch_list,
1967	struct stripe_head,
1968	batch_list);
1969	if (sh == head_sh)
1970	continue;
1971	goto again;
1972	}
1973	}
1974	}
1975
1976	return tx;
1977	}
1978
1979	static void ops_complete_reconstruct(void *stripe_head_ref)
1980	{
1981	struct stripe_head *sh = stripe_head_ref;
1982	int disks = sh->disks;
1983	int pd_idx = sh->pd_idx;
1984	int qd_idx = sh->qd_idx;
1985	int i;
1986	bool fua = false, sync = false, discard = false;
1987
1988	pr_debug("%s: stripe %llu\n", __func__,
1989	(unsigned long long)sh->sector);
1990
1991	for (i = disks; i--; ) {
1992	fua \|= test_bit(R5_WantFUA, &sh->dev[i].flags);
1993	sync \|= test_bit(R5_SyncIO, &sh->dev[i].flags);
1994	discard \|= test_bit(R5_Discard, &sh->dev[i].flags);
1995	}
1996
1997	for (i = disks; i--; ) {
1998	struct r5dev *dev = &sh->dev[i];
1999
2000	if (dev->written \|\| i == pd_idx \|\| i == qd_idx) {
2001	if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) {
2002	set_bit(nr: R5_UPTODATE, addr: &dev->flags);
2003	if (test_bit(STRIPE_EXPAND_READY, &sh->state))
2004	set_bit(nr: R5_Expanded, addr: &dev->flags);
2005	}
2006	if (fua)
2007	set_bit(nr: R5_WantFUA, addr: &dev->flags);
2008	if (sync)
2009	set_bit(nr: R5_SyncIO, addr: &dev->flags);
2010	}
2011	}
2012
2013	if (sh->reconstruct_state == reconstruct_state_drain_run)
2014	sh->reconstruct_state = reconstruct_state_drain_result;
2015	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
2016	sh->reconstruct_state = reconstruct_state_prexor_drain_result;
2017	else {
2018	BUG_ON(sh->reconstruct_state != reconstruct_state_run);
2019	sh->reconstruct_state = reconstruct_state_result;
2020	}
2021
2022	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
2023	raid5_release_stripe(sh);
2024	}
2025
2026	static void
2027	ops_run_reconstruct5(struct stripe_head sh, struct* raid5_percpu *percpu,
2028	struct dma_async_tx_descriptor *tx)
2029	{
2030	int disks = sh->disks;
2031	struct page **xor_srcs;
2032	unsigned int *off_srcs;
2033	struct async_submit_ctl submit;
2034	int count, pd_idx = sh->pd_idx, i;
2035	struct page *xor_dest;
2036	unsigned int off_dest;
2037	int prexor = `0`;
2038	unsigned long flags;
2039	int j = `0`;
2040	struct stripe_head *head_sh = sh;
2041	int last_stripe;
2042
2043	pr_debug("%s: stripe %llu\n", __func__,
2044	(unsigned long long)sh->sector);
2045
2046	for (i = `0`; i < sh->disks; i++) {
2047	if (pd_idx == i)
2048	continue;
2049	if (!test_bit(R5_Discard, &sh->dev[i].flags))
2050	break;
2051	}
2052	if (i >= sh->disks) {
2053	atomic_inc(v: &sh->count);
2054	set_bit(nr: R5_Discard, addr: &sh->dev[pd_idx].flags);
2055	ops_complete_reconstruct(stripe_head_ref: sh);
2056	return;
2057	}
2058	again:
2059	count = `0`;
2060	xor_srcs = to_addr_page(percpu, i: j);
2061	off_srcs = to_addr_offs(sh, percpu);
2062	/ check if prexor is active which means only process blocks*
2063	* that are part of a read-modify-write (written)
2064	*/
2065	if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
2066	prexor = `1`;
2067	off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
2068	xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
2069	for (i = disks; i--; ) {
2070	struct r5dev *dev = &sh->dev[i];
2071	if (head_sh->dev[i].written \|\|
2072	test_bit(R5_InJournal, &head_sh->dev[i].flags)) {
2073	off_srcs[count] = dev->offset;
2074	xor_srcs[count++] = dev->page;
2075	}
2076	}
2077	} else {
2078	xor_dest = sh->dev[pd_idx].page;
2079	off_dest = sh->dev[pd_idx].offset;
2080	for (i = disks; i--; ) {
2081	struct r5dev *dev = &sh->dev[i];
2082	if (i != pd_idx) {
2083	off_srcs[count] = dev->offset;
2084	xor_srcs[count++] = dev->page;
2085	}
2086	}
2087	}
2088
2089	/ 1/ if we prexor'd then the dest is reused as a source*
2090	* 2/ if we did not prexor then we are redoing the parity
2091	* set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
2092	* for the synchronous xor case
2093	*/
2094	last_stripe = !head_sh->batch_head \|\|
2095	list_first_entry(&sh->batch_list,
2096	struct stripe_head, batch_list) == head_sh;
2097	if (last_stripe) {
2098	flags = ASYNC_TX_ACK \|
2099	(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
2100
2101	atomic_inc(v: &head_sh->count);
2102	init_async_submit(args: &submit, flags, tx, cb_fn: ops_complete_reconstruct, cb_param: head_sh,
2103	scribble: to_addr_conv(sh, percpu, i: j));
2104	} else {
2105	flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
2106	init_async_submit(args: &submit, flags, tx, NULL, NULL,
2107	scribble: to_addr_conv(sh, percpu, i: j));
2108	}
2109
2110	if (unlikely(count == `1`))
2111	tx = async_memcpy(dest: xor_dest, src: xor_srcs[`0`], dest_offset: off_dest, src_offset: off_srcs[`0`],
2112	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
2113	else
2114	tx = async_xor_offs(dest: xor_dest, offset: off_dest, src_list: xor_srcs, src_offset: off_srcs, src_cnt: count,
2115	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
2116	if (!last_stripe) {
2117	j++;
2118	sh = list_first_entry(&sh->batch_list, struct stripe_head,
2119	batch_list);
2120	goto again;
2121	}
2122	}
2123
2124	static void
2125	ops_run_reconstruct6(struct stripe_head sh, struct* raid5_percpu *percpu,
2126	struct dma_async_tx_descriptor *tx)
2127	{
2128	struct async_submit_ctl submit;
2129	struct page **blocks;
2130	unsigned int *offs;
2131	int count, i, j = `0`;
2132	struct stripe_head *head_sh = sh;
2133	int last_stripe;
2134	int synflags;
2135	unsigned long txflags;
2136
2137	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
2138
2139	for (i = `0`; i < sh->disks; i++) {
2140	if (sh->pd_idx == i \|\| sh->qd_idx == i)
2141	continue;
2142	if (!test_bit(R5_Discard, &sh->dev[i].flags))
2143	break;
2144	}
2145	if (i >= sh->disks) {
2146	atomic_inc(v: &sh->count);
2147	set_bit(nr: R5_Discard, addr: &sh->dev[sh->pd_idx].flags);
2148	set_bit(nr: R5_Discard, addr: &sh->dev[sh->qd_idx].flags);
2149	ops_complete_reconstruct(stripe_head_ref: sh);
2150	return;
2151	}
2152
2153	again:
2154	blocks = to_addr_page(percpu, i: j);
2155	offs = to_addr_offs(sh, percpu);
2156
2157	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
2158	synflags = SYNDROME_SRC_WRITTEN;
2159	txflags = ASYNC_TX_ACK \| ASYNC_TX_PQ_XOR_DST;
2160	} else {
2161	synflags = SYNDROME_SRC_ALL;
2162	txflags = ASYNC_TX_ACK;
2163	}
2164
2165	count = set_syndrome_sources(srcs: blocks, offs, sh, srctype: synflags);
2166	last_stripe = !head_sh->batch_head \|\|
2167	list_first_entry(&sh->batch_list,
2168	struct stripe_head, batch_list) == head_sh;
2169
2170	if (last_stripe) {
2171	atomic_inc(v: &head_sh->count);
2172	init_async_submit(args: &submit, flags: txflags, tx, cb_fn: ops_complete_reconstruct,
2173	cb_param: head_sh, scribble: to_addr_conv(sh, percpu, i: j));
2174	} else
2175	init_async_submit(args: &submit, flags: `0`, tx, NULL, NULL,
2176	scribble: to_addr_conv(sh, percpu, i: j));
2177	tx = async_gen_syndrome(blocks, offsets: offs, src_cnt: count+`2`,
2178	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
2179	if (!last_stripe) {
2180	j++;
2181	sh = list_first_entry(&sh->batch_list, struct stripe_head,
2182	batch_list);
2183	goto again;
2184	}
2185	}
2186
2187	static void ops_complete_check(void *stripe_head_ref)
2188	{
2189	struct stripe_head *sh = stripe_head_ref;
2190
2191	pr_debug("%s: stripe %llu\n", __func__,
2192	(unsigned long long)sh->sector);
2193
2194	sh->check_state = check_state_check_result;
2195	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
2196	raid5_release_stripe(sh);
2197	}
2198
2199	static void ops_run_check_p(struct stripe_head sh, struct* raid5_percpu *percpu)
2200	{
2201	int disks = sh->disks;
2202	int pd_idx = sh->pd_idx;
2203	int qd_idx = sh->qd_idx;
2204	struct page *xor_dest;
2205	unsigned int off_dest;
2206	struct page **xor_srcs = to_addr_page(percpu, i: `0`);
2207	unsigned int *off_srcs = to_addr_offs(sh, percpu);
2208	struct dma_async_tx_descriptor *tx;
2209	struct async_submit_ctl submit;
2210	int count;
2211	int i;
2212
2213	pr_debug("%s: stripe %llu\n", __func__,
2214	(unsigned long long)sh->sector);
2215
2216	BUG_ON(sh->batch_head);
2217	count = `0`;
2218	xor_dest = sh->dev[pd_idx].page;
2219	off_dest = sh->dev[pd_idx].offset;
2220	off_srcs[count] = off_dest;
2221	xor_srcs[count++] = xor_dest;
2222	for (i = disks; i--; ) {
2223	if (i == pd_idx \|\| i == qd_idx)
2224	continue;
2225	off_srcs[count] = sh->dev[i].offset;
2226	xor_srcs[count++] = sh->dev[i].page;
2227	}
2228
2229	init_async_submit(args: &submit, flags: `0`, NULL, NULL, NULL,
2230	scribble: to_addr_conv(sh, percpu, i: `0`));
2231	tx = async_xor_val_offs(dest: xor_dest, offset: off_dest, src_list: xor_srcs, src_offset: off_srcs, src_cnt: count,
2232	RAID5_STRIPE_SIZE(sh->raid_conf),
2233	result: &sh->ops.zero_sum_result, submit: &submit);
2234
2235	atomic_inc(v: &sh->count);
2236	init_async_submit(args: &submit, flags: ASYNC_TX_ACK, tx, cb_fn: ops_complete_check, cb_param: sh, NULL);
2237	tx = async_trigger_callback(submit: &submit);
2238	}
2239
2240	static void ops_run_check_pq(struct stripe_head sh, struct* raid5_percpu percpu, int* checkp)
2241	{
2242	struct page **srcs = to_addr_page(percpu, i: `0`);
2243	unsigned int *offs = to_addr_offs(sh, percpu);
2244	struct async_submit_ctl submit;
2245	int count;
2246
2247	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
2248	(unsigned long long)sh->sector, checkp);
2249
2250	BUG_ON(sh->batch_head);
2251	count = set_syndrome_sources(srcs, offs, sh, srctype: SYNDROME_SRC_ALL);
2252	if (!checkp)
2253	srcs[count] = NULL;
2254
2255	atomic_inc(v: &sh->count);
2256	init_async_submit(args: &submit, flags: ASYNC_TX_ACK, NULL, cb_fn: ops_complete_check,
2257	cb_param: sh, scribble: to_addr_conv(sh, percpu, i: `0`));
2258	async_syndrome_val(blocks: srcs, offsets: offs, src_cnt: count+`2`,
2259	RAID5_STRIPE_SIZE(sh->raid_conf),
2260	pqres: &sh->ops.zero_sum_result, spare: percpu->spare_page, s_off: `0`, submit: &submit);
2261	}
2262
2263	static void raid_run_ops(struct stripe_head sh, unsigned* long ops_request)
2264	{
2265	int overlap_clear = `0`, i, disks = sh->disks;
2266	struct dma_async_tx_descriptor *tx = NULL;
2267	struct r5conf *conf = sh->raid_conf;
2268	int level = conf->level;
2269	struct raid5_percpu *percpu;
2270
2271	local_lock(&conf->percpu->lock);
2272	percpu = this_cpu_ptr(conf->percpu);
2273	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
2274	ops_run_biofill(sh);
2275	overlap_clear++;
2276	}
2277
2278	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
2279	if (level < `6`)
2280	tx = ops_run_compute5(sh, percpu);
2281	else {
2282	if (sh->ops.target2 < `0` \|\| sh->ops.target < `0`)
2283	tx = ops_run_compute6_1(sh, percpu);
2284	else
2285	tx = ops_run_compute6_2(sh, percpu);
2286	}
2287	/ terminate the chain if reconstruct is not set to be run /
2288	if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
2289	async_tx_ack(tx);
2290	}
2291
2292	if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
2293	if (level < `6`)
2294	tx = ops_run_prexor5(sh, percpu, tx);
2295	else
2296	tx = ops_run_prexor6(sh, percpu, tx);
2297	}
2298
2299	if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
2300	tx = ops_run_partial_parity(sh, percpu, tx);
2301
2302	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
2303	tx = ops_run_biodrain(sh, tx);
2304	overlap_clear++;
2305	}
2306
2307	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
2308	if (level < `6`)
2309	ops_run_reconstruct5(sh, percpu, tx);
2310	else
2311	ops_run_reconstruct6(sh, percpu, tx);
2312	}
2313
2314	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
2315	if (sh->check_state == check_state_run)
2316	ops_run_check_p(sh, percpu);
2317	else if (sh->check_state == check_state_run_q)
2318	ops_run_check_pq(sh, percpu, checkp: `0`);
2319	else if (sh->check_state == check_state_run_pq)
2320	ops_run_check_pq(sh, percpu, checkp: `1`);
2321	else
2322	BUG();
2323	}
2324
2325	if (overlap_clear && !sh->batch_head) {
2326	for (i = disks; i--; ) {
2327	struct r5dev *dev = &sh->dev[i];
2328	if (test_and_clear_bit(nr: R5_Overlap, addr: &dev->flags))
2329	wake_up_bit(word: &dev->flags, bit: R5_Overlap);
2330	}
2331	}
2332	local_unlock(&conf->percpu->lock);
2333	}
2334
2335	static void free_stripe(struct kmem_cache sc, struct* stripe_head *sh)
2336	{
2337	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2338	kfree(sh->pages);
2339	#endif
2340	if (sh->ppl_page)
2341	__free_page(sh->ppl_page);
2342	kmem_cache_free(s: sc, objp: sh);
2343	}
2344
2345	static struct stripe_head alloc_stripe(struct* kmem_cache *sc, gfp_t gfp,
2346	int disks, struct r5conf *conf)
2347	{
2348	struct stripe_head *sh;
2349
2350	sh = kmem_cache_zalloc(sc, gfp);
2351	if (sh) {
2352	spin_lock_init(&sh->stripe_lock);
2353	spin_lock_init(&sh->batch_lock);
2354	INIT_LIST_HEAD(list: &sh->batch_list);
2355	INIT_LIST_HEAD(list: &sh->lru);
2356	INIT_LIST_HEAD(list: &sh->r5c);
2357	INIT_LIST_HEAD(list: &sh->log_list);
2358	atomic_set(v: &sh->count, i: `1`);
2359	sh->raid_conf = conf;
2360	sh->log_start = MaxSector;
2361
2362	if (raid5_has_ppl(conf)) {
2363	sh->ppl_page = alloc_page(gfp);
2364	if (!sh->ppl_page) {
2365	free_stripe(sc, sh);
2366	return NULL;
2367	}
2368	}
2369	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2370	if (init_stripe_shared_pages(sh, conf, disks)) {
2371	free_stripe(sc, sh);
2372	return NULL;
2373	}
2374	#endif
2375	}
2376	return sh;
2377	}
2378	static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
2379	{
2380	struct stripe_head *sh;
2381
2382	sh = alloc_stripe(sc: conf->slab_cache, gfp, disks: conf->pool_size, conf);
2383	if (!sh)
2384	return `0`;
2385
2386	if (grow_buffers(sh, gfp)) {
2387	shrink_buffers(sh);
2388	free_stripe(sc: conf->slab_cache, sh);
2389	return `0`;
2390	}
2391	sh->hash_lock_index =
2392	conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
2393	/ we just created an active stripe so... /
2394	atomic_inc(v: &conf->active_stripes);
2395
2396	raid5_release_stripe(sh);
2397	WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes + `1`);
2398	return `1`;
2399	}
2400
2401	static int grow_stripes(struct r5conf conf, int* num)
2402	{
2403	struct kmem_cache *sc;
2404	size_t namelen = sizeof(conf->cache_name[`0`]);
2405	int devs = max(conf->raid_disks, conf->previous_raid_disks);
2406
2407	if (mddev_is_dm(mddev: conf->mddev))
2408	snprintf(buf: conf->cache_name[`0`], size: namelen,
2409	fmt: "raid%d-%p", conf->level, conf->mddev);
2410	else
2411	snprintf(buf: conf->cache_name[`0`], size: namelen,
2412	fmt: "raid%d-%s", conf->level, mdname(mddev: conf->mddev));
2413	snprintf(buf: conf->cache_name[`1`], size: namelen, fmt: "%.27s-alt", conf->cache_name[`0`]);
2414
2415	conf->active_name = `0`;
2416	sc = kmem_cache_create(conf->cache_name[conf->active_name],
2417	struct_size_t(struct stripe_head, dev, devs),
2418	`0`, `0`, NULL);
2419	if (!sc)
2420	return `1`;
2421	conf->slab_cache = sc;
2422	conf->pool_size = devs;
2423	while (num--)
2424	if (!grow_one_stripe(conf, GFP_KERNEL))
2425	return `1`;
2426
2427	return `0`;
2428	}
2429
2430	/**
2431	* scribble_alloc - allocate percpu scribble buffer for required size
2432	* of the scribble region
2433	* @percpu: from for_each_present_cpu() of the caller
2434	* @num: total number of disks in the array
2435	* @cnt: scribble objs count for required size of the scribble region
2436	*
2437	* The scribble buffer size must be enough to contain:
2438	* 1/ a struct page pointer for each device in the array +2
2439	* 2/ room to convert each entry in (1) to its corresponding dma
2440	* (dma_map_page()) or page (page_address()) address.
2441	*
2442	* Note: the +2 is for the destination buffers of the ddf/raid6 case where we
2443	* calculate over all devices (not just the data blocks), using zeros in place
2444	* of the P and Q blocks.
2445	*/
2446	static int scribble_alloc(struct raid5_percpu *percpu,
2447	int num, int cnt)
2448	{
2449	size_t obj_size =
2450	sizeof(struct page ) (num + `2`) +
2451	sizeof(addr_conv_t) * (num + `2`) +
2452	sizeof(unsigned int) * (num + `2`);
2453	void *scribble;
2454
2455	/*
2456	* If here is in raid array suspend context, it is in memalloc noio
2457	* context as well, there is no potential recursive memory reclaim
2458	* I/Os with the GFP_KERNEL flag.
2459	*/
2460	scribble = kvmalloc_array(cnt, obj_size, GFP_KERNEL);
2461	if (!scribble)
2462	return -ENOMEM;
2463
2464	kvfree(addr: percpu->scribble);
2465
2466	percpu->scribble = scribble;
2467	percpu->scribble_obj_size = obj_size;
2468	return `0`;
2469	}
2470
2471	static int resize_chunks(struct r5conf conf, int* new_disks, int new_sectors)
2472	{
2473	unsigned long cpu;
2474	int err = `0`;
2475
2476	/ Never shrink. /
2477	if (conf->scribble_disks >= new_disks &&
2478	conf->scribble_sectors >= new_sectors)
2479	return `0`;
2480
2481	raid5_quiesce(mddev: conf->mddev, quiesce: true);
2482	cpus_read_lock();
2483
2484	for_each_present_cpu(cpu) {
2485	struct raid5_percpu *percpu;
2486
2487	percpu = per_cpu_ptr(conf->percpu, cpu);
2488	err = scribble_alloc(percpu, num: new_disks,
2489	cnt: new_sectors / RAID5_STRIPE_SECTORS(conf));
2490	if (err)
2491	break;
2492	}
2493
2494	cpus_read_unlock();
2495	raid5_quiesce(mddev: conf->mddev, quiesce: false);
2496
2497	if (!err) {
2498	conf->scribble_disks = new_disks;
2499	conf->scribble_sectors = new_sectors;
2500	}
2501	return err;
2502	}
2503
2504	static int resize_stripes(struct r5conf conf, int* newsize)
2505	{
2506	/ Make all the stripes able to hold 'newsize' devices.*
2507	* New slots in each stripe get 'page' set to a new page.
2508	*
2509	* This happens in stages:
2510	* 1/ create a new kmem_cache and allocate the required number of
2511	* stripe_heads.
2512	* 2/ gather all the old stripe_heads and transfer the pages across
2513	* to the new stripe_heads. This will have the side effect of
2514	* freezing the array as once all stripe_heads have been collected,
2515	* no IO will be possible. Old stripe heads are freed once their
2516	* pages have been transferred over, and the old kmem_cache is
2517	* freed when all stripes are done.
2518	* 3/ reallocate conf->disks to be suitable bigger. If this fails,
2519	* we simple return a failure status - no need to clean anything up.
2520	* 4/ allocate new pages for the new slots in the new stripe_heads.
2521	* If this fails, we don't bother trying the shrink the
2522	* stripe_heads down again, we just leave them as they are.
2523	* As each stripe_head is processed the new one is released into
2524	* active service.
2525	*
2526	* Once step2 is started, we cannot afford to wait for a write,
2527	* so we use GFP_NOIO allocations.
2528	*/
2529	struct stripe_head osh, nsh;
2530	LIST_HEAD(newstripes);
2531	struct disk_info *ndisks;
2532	int err = `0`;
2533	struct kmem_cache *sc;
2534	int i;
2535	int hash, cnt;
2536
2537	md_allow_write(mddev: conf->mddev);
2538
2539	/ Step 1 /
2540	sc = kmem_cache_create(conf->cache_name[`1`-conf->active_name],
2541	struct_size_t(struct stripe_head, dev, newsize),
2542	`0`, `0`, NULL);
2543	if (!sc)
2544	return -ENOMEM;
2545
2546	/ Need to ensure auto-resizing doesn't interfere /
2547	mutex_lock(&conf->cache_size_mutex);
2548
2549	for (i = conf->max_nr_stripes; i; i--) {
2550	nsh = alloc_stripe(sc, GFP_KERNEL, disks: newsize, conf);
2551	if (!nsh)
2552	break;
2553
2554	list_add(new: &nsh->lru, head: &newstripes);
2555	}
2556	if (i) {
2557	/ didn't get enough, give up /
2558	while (!list_empty(head: &newstripes)) {
2559	nsh = list_entry(newstripes.next, struct stripe_head, lru);
2560	list_del(entry: &nsh->lru);
2561	free_stripe(sc, sh: nsh);
2562	}
2563	kmem_cache_destroy(s: sc);
2564	mutex_unlock(lock: &conf->cache_size_mutex);
2565	return -ENOMEM;
2566	}
2567	/ Step 2 - Must use GFP_NOIO now.*
2568	* OK, we have enough stripes, start collecting inactive
2569	* stripes and copying them over
2570	*/
2571	hash = `0`;
2572	cnt = `0`;
2573	list_for_each_entry(nsh, &newstripes, lru) {
2574	lock_device_hash_lock(conf, hash);
2575	wait_event_cmd(conf->wait_for_stripe,
2576	!list_empty(conf->inactive_list + hash),
2577	unlock_device_hash_lock(conf, hash),
2578	lock_device_hash_lock(conf, hash));
2579	osh = get_free_stripe(conf, hash);
2580	unlock_device_hash_lock(conf, hash);
2581
2582	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2583	for (i = `0`; i < osh->nr_pages; i++) {
2584	nsh->pages[i] = osh->pages[i];
2585	osh->pages[i] = NULL;
2586	}
2587	#endif
2588	for(i=`0`; i<conf->pool_size; i++) {
2589	nsh->dev[i].page = osh->dev[i].page;
2590	nsh->dev[i].orig_page = osh->dev[i].page;
2591	nsh->dev[i].offset = osh->dev[i].offset;
2592	}
2593	nsh->hash_lock_index = hash;
2594	free_stripe(sc: conf->slab_cache, sh: osh);
2595	cnt++;
2596	if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
2597	!!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
2598	hash++;
2599	cnt = `0`;
2600	}
2601	}
2602	kmem_cache_destroy(s: conf->slab_cache);
2603
2604	/ Step 3.*
2605	* At this point, we are holding all the stripes so the array
2606	* is completely stalled, so now is a good time to resize
2607	* conf->disks and the scribble region
2608	*/
2609	ndisks = kcalloc(newsize, sizeof(struct disk_info), GFP_NOIO);
2610	if (ndisks) {
2611	for (i = `0`; i < conf->pool_size; i++)
2612	ndisks[i] = conf->disks[i];
2613
2614	for (i = conf->pool_size; i < newsize; i++) {
2615	ndisks[i].extra_page = alloc_page(GFP_NOIO);
2616	if (!ndisks[i].extra_page)
2617	err = -ENOMEM;
2618	}
2619
2620	if (err) {
2621	for (i = conf->pool_size; i < newsize; i++)
2622	if (ndisks[i].extra_page)
2623	put_page(page: ndisks[i].extra_page);
2624	kfree(objp: ndisks);
2625	} else {
2626	kfree(objp: conf->disks);
2627	conf->disks = ndisks;
2628	}
2629	} else
2630	err = -ENOMEM;
2631
2632	conf->slab_cache = sc;
2633	conf->active_name = `1`-conf->active_name;
2634
2635	/ Step 4, return new stripes to service /
2636	while(!list_empty(head: &newstripes)) {
2637	nsh = list_entry(newstripes.next, struct stripe_head, lru);
2638	list_del_init(entry: &nsh->lru);
2639
2640	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2641	for (i = `0`; i < nsh->nr_pages; i++) {
2642	if (nsh->pages[i])
2643	continue;
2644	nsh->pages[i] = alloc_page(GFP_NOIO);
2645	if (!nsh->pages[i])
2646	err = -ENOMEM;
2647	}
2648
2649	for (i = conf->raid_disks; i < newsize; i++) {
2650	if (nsh->dev[i].page)
2651	continue;
2652	nsh->dev[i].page = raid5_get_dev_page(nsh, i);
2653	nsh->dev[i].orig_page = nsh->dev[i].page;
2654	nsh->dev[i].offset = raid5_get_page_offset(nsh, i);
2655	}
2656	#else
2657	for (i=conf->raid_disks; i < newsize; i++)
2658	if (nsh->dev[i].page == NULL) {
2659	struct page *p = alloc_page(GFP_NOIO);
2660	nsh->dev[i].page = p;
2661	nsh->dev[i].orig_page = p;
2662	nsh->dev[i].offset = `0`;
2663	if (!p)
2664	err = -ENOMEM;
2665	}
2666	#endif
2667	raid5_release_stripe(sh: nsh);
2668	}
2669	/ critical section pass, GFP_NOIO no longer needed /
2670
2671	if (!err)
2672	conf->pool_size = newsize;
2673	mutex_unlock(lock: &conf->cache_size_mutex);
2674
2675	return err;
2676	}
2677
2678	static int drop_one_stripe(struct r5conf *conf)
2679	{
2680	struct stripe_head *sh;
2681	int hash = (conf->max_nr_stripes - `1`) & STRIPE_HASH_LOCKS_MASK;
2682
2683	spin_lock_irq(lock: conf->hash_locks + hash);
2684	sh = get_free_stripe(conf, hash);
2685	spin_unlock_irq(lock: conf->hash_locks + hash);
2686	if (!sh)
2687	return `0`;
2688	BUG_ON(atomic_read(&sh->count));
2689	shrink_buffers(sh);
2690	free_stripe(sc: conf->slab_cache, sh);
2691	atomic_dec(v: &conf->active_stripes);
2692	WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes - `1`);
2693	return `1`;
2694	}
2695
2696	static void shrink_stripes(struct r5conf *conf)
2697	{
2698	while (conf->max_nr_stripes &&
2699	drop_one_stripe(conf))
2700	;
2701
2702	kmem_cache_destroy(s: conf->slab_cache);
2703	conf->slab_cache = NULL;
2704	}
2705
2706	static void raid5_end_read_request(struct bio * bi)
2707	{
2708	struct stripe_head *sh = bi->bi_private;
2709	struct r5conf *conf = sh->raid_conf;
2710	int disks = sh->disks, i;
2711	struct md_rdev *rdev = NULL;
2712	sector_t s;
2713
2714	for (i=`0` ; i<disks; i++)
2715	if (bi == &sh->dev[i].req)
2716	break;
2717
2718	pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
2719	(unsigned long long)sh->sector, i, atomic_read(&sh->count),
2720	bi->bi_status);
2721	if (i == disks) {
2722	BUG();
2723	return;
2724	}
2725	if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2726	/ If replacement finished while this request was outstanding,*
2727	* 'replacement' might be NULL already.
2728	* In that case it moved down to 'rdev'.
2729	* rdev is not removed until all requests are finished.
2730	*/
2731	rdev = conf->disks[i].replacement;
2732	if (!rdev)
2733	rdev = conf->disks[i].rdev;
2734
2735	if (use_new_offset(conf, sh))
2736	s = sh->sector + rdev->new_data_offset;
2737	else
2738	s = sh->sector + rdev->data_offset;
2739	if (!bi->bi_status) {
2740	set_bit(nr: R5_UPTODATE, addr: &sh->dev[i].flags);
2741	if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2742	/ Note that this cannot happen on a*
2743	* replacement device. We just fail those on
2744	* any error
2745	*/
2746	pr_info_ratelimited(
2747	"md/raid:%s: read error corrected (%lu sectors at %llu on %pg)\n",
2748	mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf),
2749	(unsigned long long)s,
2750	rdev->bdev);
2751	atomic_add(RAID5_STRIPE_SECTORS(conf), v: &rdev->corrected_errors);
2752	clear_bit(nr: R5_ReadError, addr: &sh->dev[i].flags);
2753	clear_bit(nr: R5_ReWrite, addr: &sh->dev[i].flags);
2754	} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2755	clear_bit(nr: R5_ReadNoMerge, addr: &sh->dev[i].flags);
2756
2757	if (test_bit(R5_InJournal, &sh->dev[i].flags))
2758	/*
2759	* end read for a page in journal, this
2760	* must be preparing for prexor in rmw
2761	*/
2762	set_bit(nr: R5_OrigPageUPTDODATE, addr: &sh->dev[i].flags);
2763
2764	if (atomic_read(v: &rdev->read_errors))
2765	atomic_set(v: &rdev->read_errors, i: `0`);
2766	} else {
2767	int retry = `0`;
2768	int set_bad = `0`;
2769
2770	clear_bit(nr: R5_UPTODATE, addr: &sh->dev[i].flags);
2771	if (!(bi->bi_status == BLK_STS_PROTECTION))
2772	atomic_inc(v: &rdev->read_errors);
2773	if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2774	pr_warn_ratelimited(
2775	"md/raid:%s: read error on replacement device (sector %llu on %pg).\n",
2776	mdname(conf->mddev),
2777	(unsigned long long)s,
2778	rdev->bdev);
2779	else if (conf->mddev->degraded >= conf->max_degraded) {
2780	set_bad = `1`;
2781	pr_warn_ratelimited(
2782	"md/raid:%s: read error not correctable (sector %llu on %pg).\n",
2783	mdname(conf->mddev),
2784	(unsigned long long)s,
2785	rdev->bdev);
2786	} else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
2787	/ Oh, no!!! /
2788	set_bad = `1`;
2789	pr_warn_ratelimited(
2790	"md/raid:%s: read error NOT corrected!! (sector %llu on %pg).\n",
2791	mdname(conf->mddev),
2792	(unsigned long long)s,
2793	rdev->bdev);
2794	} else if (atomic_read(v: &rdev->read_errors)
2795	> conf->max_nr_stripes) {
2796	if (!test_bit(Faulty, &rdev->flags)) {
2797	pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
2798	mdname(conf->mddev),
2799	atomic_read(&rdev->read_errors),
2800	conf->max_nr_stripes);
2801	pr_warn("md/raid:%s: Too many read errors, failing device %pg.\n",
2802	mdname(conf->mddev), rdev->bdev);
2803	}
2804	} else
2805	retry = `1`;
2806	if (set_bad && test_bit(In_sync, &rdev->flags)
2807	&& !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2808	retry = `1`;
2809	if (retry)
2810	if (sh->qd_idx >= `0` && sh->pd_idx == i)
2811	set_bit(nr: R5_ReadError, addr: &sh->dev[i].flags);
2812	else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
2813	set_bit(nr: R5_ReadError, addr: &sh->dev[i].flags);
2814	clear_bit(nr: R5_ReadNoMerge, addr: &sh->dev[i].flags);
2815	} else
2816	set_bit(nr: R5_ReadNoMerge, addr: &sh->dev[i].flags);
2817	else {
2818	clear_bit(nr: R5_ReadError, addr: &sh->dev[i].flags);
2819	clear_bit(nr: R5_ReWrite, addr: &sh->dev[i].flags);
2820	if (!(set_bad
2821	&& test_bit(In_sync, &rdev->flags)
2822	&& rdev_set_badblocks(
2823	rdev, s: sh->sector, RAID5_STRIPE_SECTORS(conf), is_new: `0`)))
2824	md_error(mddev: conf->mddev, rdev);
2825	}
2826	}
2827	rdev_dec_pending(rdev, mddev: conf->mddev);
2828	bio_uninit(bi);
2829	clear_bit(nr: R5_LOCKED, addr: &sh->dev[i].flags);
2830	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
2831	raid5_release_stripe(sh);
2832	}
2833
2834	static void raid5_end_write_request(struct bio *bi)
2835	{
2836	struct stripe_head *sh = bi->bi_private;
2837	struct r5conf *conf = sh->raid_conf;
2838	int disks = sh->disks, i;
2839	struct md_rdev *rdev;
2840	int replacement = `0`;
2841
2842	for (i = `0` ; i < disks; i++) {
2843	if (bi == &sh->dev[i].req) {
2844	rdev = conf->disks[i].rdev;
2845	break;
2846	}
2847	if (bi == &sh->dev[i].rreq) {
2848	rdev = conf->disks[i].replacement;
2849	if (rdev)
2850	replacement = `1`;
2851	else
2852	/ rdev was removed and 'replacement'*
2853	* replaced it. rdev is not removed
2854	* until all requests are finished.
2855	*/
2856	rdev = conf->disks[i].rdev;
2857	break;
2858	}
2859	}
2860	pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
2861	(unsigned long long)sh->sector, i, atomic_read(&sh->count),
2862	bi->bi_status);
2863	if (i == disks) {
2864	BUG();
2865	return;
2866	}
2867
2868	if (replacement) {
2869	if (bi->bi_status)
2870	md_error(mddev: conf->mddev, rdev);
2871	else if (rdev_has_badblock(rdev, s: sh->sector,
2872	RAID5_STRIPE_SECTORS(conf)))
2873	set_bit(nr: R5_MadeGoodRepl, addr: &sh->dev[i].flags);
2874	} else {
2875	if (bi->bi_status) {
2876	set_bit(nr: WriteErrorSeen, addr: &rdev->flags);
2877	set_bit(nr: R5_WriteError, addr: &sh->dev[i].flags);
2878	if (!test_and_set_bit(nr: WantReplacement, addr: &rdev->flags))
2879	set_bit(nr: MD_RECOVERY_NEEDED,
2880	addr: &rdev->mddev->recovery);
2881	} else if (rdev_has_badblock(rdev, s: sh->sector,
2882	RAID5_STRIPE_SECTORS(conf))) {
2883	set_bit(nr: R5_MadeGood, addr: &sh->dev[i].flags);
2884	if (test_bit(R5_ReadError, &sh->dev[i].flags))
2885	/ That was a successful write so make*
2886	* sure it looks like we already did
2887	* a re-write.
2888	*/
2889	set_bit(nr: R5_ReWrite, addr: &sh->dev[i].flags);
2890	}
2891	}
2892	rdev_dec_pending(rdev, mddev: conf->mddev);
2893
2894	if (sh->batch_head && bi->bi_status && !replacement)
2895	set_bit(nr: STRIPE_BATCH_ERR, addr: &sh->batch_head->state);
2896
2897	bio_uninit(bi);
2898	if (!test_and_clear_bit(nr: R5_DOUBLE_LOCKED, addr: &sh->dev[i].flags))
2899	clear_bit(nr: R5_LOCKED, addr: &sh->dev[i].flags);
2900	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
2901
2902	if (sh->batch_head && sh != sh->batch_head)
2903	raid5_release_stripe(sh: sh->batch_head);
2904	raid5_release_stripe(sh);
2905	}
2906
2907	static void raid5_error(struct mddev mddev, struct* md_rdev *rdev)
2908	{
2909	struct r5conf *conf = mddev->private;
2910	unsigned long flags;
2911	pr_debug("raid456: error called\n");
2912
2913	pr_crit("md/raid:%s: Disk failure on %pg, disabling device.\n",
2914	mdname(mddev), rdev->bdev);
2915
2916	spin_lock_irqsave(&conf->device_lock, flags);
2917	set_bit(nr: Faulty, addr: &rdev->flags);
2918	clear_bit(nr: In_sync, addr: &rdev->flags);
2919	mddev->degraded = raid5_calc_degraded(conf);
2920
2921	if (has_failed(conf)) {
2922	set_bit(nr: MD_BROKEN, addr: &conf->mddev->flags);
2923	conf->recovery_disabled = mddev->recovery_disabled;
2924
2925	pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n",
2926	mdname(mddev), mddev->degraded, conf->raid_disks);
2927	} else {
2928	pr_crit("md/raid:%s: Operation continuing on %d devices.\n",
2929	mdname(mddev), conf->raid_disks - mddev->degraded);
2930	}
2931
2932	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
2933	set_bit(nr: MD_RECOVERY_INTR, addr: &mddev->recovery);
2934
2935	set_bit(nr: Blocked, addr: &rdev->flags);
2936	set_mask_bits(&mddev->sb_flags, `0`,
2937	BIT(MD_SB_CHANGE_DEVS) \| BIT(MD_SB_CHANGE_PENDING));
2938	r5c_update_on_rdev_error(mddev, rdev);
2939	}
2940
2941	/*
2942	* Input: a 'big' sector number,
2943	* Output: index of the data and parity disk, and the sector # in them.
2944	*/
2945	sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2946	int previous, int *dd_idx,
2947	struct stripe_head *sh)
2948	{
2949	sector_t stripe, stripe2;
2950	sector_t chunk_number;
2951	unsigned int chunk_offset;
2952	int pd_idx, qd_idx;
2953	int ddf_layout = `0`;
2954	sector_t new_sector;
2955	int algorithm = previous ? conf->prev_algo
2956	: conf->algorithm;
2957	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2958	: conf->chunk_sectors;
2959	int raid_disks = previous ? conf->previous_raid_disks
2960	: conf->raid_disks;
2961	int data_disks = raid_disks - conf->max_degraded;
2962
2963	/ First compute the information on this sector /
2964
2965	/*
2966	* Compute the chunk number and the sector offset inside the chunk
2967	*/
2968	chunk_offset = sector_div(r_sector, sectors_per_chunk);
2969	chunk_number = r_sector;
2970
2971	/*
2972	* Compute the stripe number
2973	*/
2974	stripe = chunk_number;
2975	*dd_idx = sector_div(stripe, data_disks);
2976	stripe2 = stripe;
2977	/*
2978	* Select the parity disk based on the user selected algorithm.
2979	*/
2980	pd_idx = qd_idx = -`1`;
2981	switch(conf->level) {
2982	case `4`:
2983	pd_idx = data_disks;
2984	break;
2985	case `5`:
2986	switch (algorithm) {
2987	case ALGORITHM_LEFT_ASYMMETRIC:
2988	pd_idx = data_disks - sector_div(stripe2, raid_disks);
2989	if (*dd_idx >= pd_idx)
2990	(*dd_idx)++;
2991	break;
2992	case ALGORITHM_RIGHT_ASYMMETRIC:
2993	pd_idx = sector_div(stripe2, raid_disks);
2994	if (*dd_idx >= pd_idx)
2995	(*dd_idx)++;
2996	break;
2997	case ALGORITHM_LEFT_SYMMETRIC:
2998	pd_idx = data_disks - sector_div(stripe2, raid_disks);
2999	dd_idx = (pd_idx + `1` + dd_idx) % raid_disks;
3000	break;
3001	case ALGORITHM_RIGHT_SYMMETRIC:
3002	pd_idx = sector_div(stripe2, raid_disks);
3003	dd_idx = (pd_idx + `1` + dd_idx) % raid_disks;
3004	break;
3005	case ALGORITHM_PARITY_0:
3006	pd_idx = `0`;
3007	(*dd_idx)++;
3008	break;
3009	case ALGORITHM_PARITY_N:
3010	pd_idx = data_disks;
3011	break;
3012	default:
3013	BUG();
3014	}
3015	break;
3016	case `6`:
3017
3018	switch (algorithm) {
3019	case ALGORITHM_LEFT_ASYMMETRIC:
3020	pd_idx = raid_disks - `1` - sector_div(stripe2, raid_disks);
3021	qd_idx = pd_idx + `1`;
3022	if (pd_idx == raid_disks-`1`) {
3023	(dd_idx)++; /* Q D D D P /
3024	qd_idx = `0`;
3025	} else if (*dd_idx >= pd_idx)
3026	(dd_idx) += `2`; /* D D P Q D /
3027	break;
3028	case ALGORITHM_RIGHT_ASYMMETRIC:
3029	pd_idx = sector_div(stripe2, raid_disks);
3030	qd_idx = pd_idx + `1`;
3031	if (pd_idx == raid_disks-`1`) {
3032	(dd_idx)++; /* Q D D D P /
3033	qd_idx = `0`;
3034	} else if (*dd_idx >= pd_idx)
3035	(dd_idx) += `2`; /* D D P Q D /
3036	break;
3037	case ALGORITHM_LEFT_SYMMETRIC:
3038	pd_idx = raid_disks - `1` - sector_div(stripe2, raid_disks);
3039	qd_idx = (pd_idx + `1`) % raid_disks;
3040	dd_idx = (pd_idx + `2` + dd_idx) % raid_disks;
3041	break;
3042	case ALGORITHM_RIGHT_SYMMETRIC:
3043	pd_idx = sector_div(stripe2, raid_disks);
3044	qd_idx = (pd_idx + `1`) % raid_disks;
3045	dd_idx = (pd_idx + `2` + dd_idx) % raid_disks;
3046	break;
3047
3048	case ALGORITHM_PARITY_0:
3049	pd_idx = `0`;
3050	qd_idx = `1`;
3051	(*dd_idx) += `2`;
3052	break;
3053	case ALGORITHM_PARITY_N:
3054	pd_idx = data_disks;
3055	qd_idx = data_disks + `1`;
3056	break;
3057
3058	case ALGORITHM_ROTATING_ZERO_RESTART:
3059	/ Exactly the same as RIGHT_ASYMMETRIC, but or*
3060	* of blocks for computing Q is different.
3061	*/
3062	pd_idx = sector_div(stripe2, raid_disks);
3063	qd_idx = pd_idx + `1`;
3064	if (pd_idx == raid_disks-`1`) {
3065	(dd_idx)++; /* Q D D D P /
3066	qd_idx = `0`;
3067	} else if (*dd_idx >= pd_idx)
3068	(dd_idx) += `2`; /* D D P Q D /
3069	ddf_layout = `1`;
3070	break;
3071
3072	case ALGORITHM_ROTATING_N_RESTART:
3073	/ Same a left_asymmetric, by first stripe is*
3074	* D D D P Q rather than
3075	* Q D D D P
3076	*/
3077	stripe2 += `1`;
3078	pd_idx = raid_disks - `1` - sector_div(stripe2, raid_disks);
3079	qd_idx = pd_idx + `1`;
3080	if (pd_idx == raid_disks-`1`) {
3081	(dd_idx)++; /* Q D D D P /
3082	qd_idx = `0`;
3083	} else if (*dd_idx >= pd_idx)
3084	(dd_idx) += `2`; /* D D P Q D /
3085	ddf_layout = `1`;
3086	break;
3087
3088	case ALGORITHM_ROTATING_N_CONTINUE:
3089	/ Same as left_symmetric but Q is before P /
3090	pd_idx = raid_disks - `1` - sector_div(stripe2, raid_disks);
3091	qd_idx = (pd_idx + raid_disks - `1`) % raid_disks;
3092	dd_idx = (pd_idx + `1` + dd_idx) % raid_disks;
3093	ddf_layout = `1`;
3094	break;
3095
3096	case ALGORITHM_LEFT_ASYMMETRIC_6:
3097	/ RAID5 left_asymmetric, with Q on last device /
3098	pd_idx = data_disks - sector_div(stripe2, raid_disks-`1`);
3099	if (*dd_idx >= pd_idx)
3100	(*dd_idx)++;
3101	qd_idx = raid_disks - `1`;
3102	break;
3103
3104	case ALGORITHM_RIGHT_ASYMMETRIC_6:
3105	pd_idx = sector_div(stripe2, raid_disks-`1`);
3106	if (*dd_idx >= pd_idx)
3107	(*dd_idx)++;
3108	qd_idx = raid_disks - `1`;
3109	break;
3110
3111	case ALGORITHM_LEFT_SYMMETRIC_6:
3112	pd_idx = data_disks - sector_div(stripe2, raid_disks-`1`);
3113	dd_idx = (pd_idx + `1` + dd_idx) % (raid_disks-`1`);
3114	qd_idx = raid_disks - `1`;
3115	break;
3116
3117	case ALGORITHM_RIGHT_SYMMETRIC_6:
3118	pd_idx = sector_div(stripe2, raid_disks-`1`);
3119	dd_idx = (pd_idx + `1` + dd_idx) % (raid_disks-`1`);
3120	qd_idx = raid_disks - `1`;
3121	break;
3122
3123	case ALGORITHM_PARITY_0_6:
3124	pd_idx = `0`;
3125	(*dd_idx)++;
3126	qd_idx = raid_disks - `1`;
3127	break;
3128
3129	default:
3130	BUG();
3131	}
3132	break;
3133	}
3134
3135	if (sh) {
3136	sh->pd_idx = pd_idx;
3137	sh->qd_idx = qd_idx;
3138	sh->ddf_layout = ddf_layout;
3139	}
3140	/*
3141	* Finally, compute the new sector number
3142	*/
3143	new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
3144	return new_sector;
3145	}
3146
3147	sector_t raid5_compute_blocknr(struct stripe_head sh, int* i, int previous)
3148	{
3149	struct r5conf *conf = sh->raid_conf;
3150	int raid_disks = sh->disks;
3151	int data_disks = raid_disks - conf->max_degraded;
3152	sector_t new_sector = sh->sector, check;
3153	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
3154	: conf->chunk_sectors;
3155	int algorithm = previous ? conf->prev_algo
3156	: conf->algorithm;
3157	sector_t stripe;
3158	int chunk_offset;
3159	sector_t chunk_number;
3160	int dummy1, dd_idx = i;
3161	sector_t r_sector;
3162	struct stripe_head sh2;
3163
3164	chunk_offset = sector_div(new_sector, sectors_per_chunk);
3165	stripe = new_sector;
3166
3167	if (i == sh->pd_idx)
3168	return `0`;
3169	switch(conf->level) {
3170	case `4`: break;
3171	case `5`:
3172	switch (algorithm) {
3173	case ALGORITHM_LEFT_ASYMMETRIC:
3174	case ALGORITHM_RIGHT_ASYMMETRIC:
3175	if (i > sh->pd_idx)
3176	i--;
3177	break;
3178	case ALGORITHM_LEFT_SYMMETRIC:
3179	case ALGORITHM_RIGHT_SYMMETRIC:
3180	if (i < sh->pd_idx)
3181	i += raid_disks;
3182	i -= (sh->pd_idx + `1`);
3183	break;
3184	case ALGORITHM_PARITY_0:
3185	i -= `1`;
3186	break;
3187	case ALGORITHM_PARITY_N:
3188	break;
3189	default:
3190	BUG();
3191	}
3192	break;
3193	case `6`:
3194	if (i == sh->qd_idx)
3195	return `0`; / It is the Q disk /
3196	switch (algorithm) {
3197	case ALGORITHM_LEFT_ASYMMETRIC:
3198	case ALGORITHM_RIGHT_ASYMMETRIC:
3199	case ALGORITHM_ROTATING_ZERO_RESTART:
3200	case ALGORITHM_ROTATING_N_RESTART:
3201	if (sh->pd_idx == raid_disks-`1`)
3202	i--; / Q D D D P /
3203	else if (i > sh->pd_idx)
3204	i -= `2`; / D D P Q D /
3205	break;
3206	case ALGORITHM_LEFT_SYMMETRIC:
3207	case ALGORITHM_RIGHT_SYMMETRIC:
3208	if (sh->pd_idx == raid_disks-`1`)
3209	i--; / Q D D D P /
3210	else {
3211	/ D D P Q D /
3212	if (i < sh->pd_idx)
3213	i += raid_disks;
3214	i -= (sh->pd_idx + `2`);
3215	}
3216	break;
3217	case ALGORITHM_PARITY_0:
3218	i -= `2`;
3219	break;
3220	case ALGORITHM_PARITY_N:
3221	break;
3222	case ALGORITHM_ROTATING_N_CONTINUE:
3223	/ Like left_symmetric, but P is before Q /
3224	if (sh->pd_idx == `0`)
3225	i--; / P D D D Q /
3226	else {
3227	/ D D Q P D /
3228	if (i < sh->pd_idx)
3229	i += raid_disks;
3230	i -= (sh->pd_idx + `1`);
3231	}
3232	break;
3233	case ALGORITHM_LEFT_ASYMMETRIC_6:
3234	case ALGORITHM_RIGHT_ASYMMETRIC_6:
3235	if (i > sh->pd_idx)
3236	i--;
3237	break;
3238	case ALGORITHM_LEFT_SYMMETRIC_6:
3239	case ALGORITHM_RIGHT_SYMMETRIC_6:
3240	if (i < sh->pd_idx)
3241	i += data_disks + `1`;
3242	i -= (sh->pd_idx + `1`);
3243	break;
3244	case ALGORITHM_PARITY_0_6:
3245	i -= `1`;
3246	break;
3247	default:
3248	BUG();
3249	}
3250	break;
3251	}
3252
3253	chunk_number = stripe * data_disks + i;
3254	r_sector = chunk_number * sectors_per_chunk + chunk_offset;
3255
3256	check = raid5_compute_sector(conf, r_sector,
3257	previous, dd_idx: &dummy1, sh: &sh2);
3258	if (check != sh->sector \|\| dummy1 != dd_idx \|\| sh2.pd_idx != sh->pd_idx
3259	\|\| sh2.qd_idx != sh->qd_idx) {
3260	pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
3261	mdname(conf->mddev));
3262	return `0`;
3263	}
3264	return r_sector;
3265	}
3266
3267	/*
3268	* There are cases where we want handle_stripe_dirtying() and
3269	* schedule_reconstruction() to delay towrite to some dev of a stripe.
3270	*
3271	* This function checks whether we want to delay the towrite. Specifically,
3272	* we delay the towrite when:
3273	*
3274	* 1. degraded stripe has a non-overwrite to the missing dev, AND this
3275	* stripe has data in journal (for other devices).
3276	*
3277	* In this case, when reading data for the non-overwrite dev, it is
3278	* necessary to handle complex rmw of write back cache (prexor with
3279	* orig_page, and xor with page). To keep read path simple, we would
3280	* like to flush data in journal to RAID disks first, so complex rmw
3281	* is handled in the write patch (handle_stripe_dirtying).
3282	*
3283	* 2. when journal space is critical (R5C_LOG_CRITICAL=1)
3284	*
3285	* It is important to be able to flush all stripes in raid5-cache.
3286	* Therefore, we need reserve some space on the journal device for
3287	* these flushes. If flush operation includes pending writes to the
3288	* stripe, we need to reserve (conf->raid_disk + 1) pages per stripe
3289	* for the flush out. If we exclude these pending writes from flush
3290	* operation, we only need (conf->max_degraded + 1) pages per stripe.
3291	* Therefore, excluding pending writes in these cases enables more
3292	* efficient use of the journal device.
3293	*
3294	* Note: To make sure the stripe makes progress, we only delay
3295	* towrite for stripes with data already in journal (injournal > 0).
3296	* When LOG_CRITICAL, stripes with injournal == 0 will be sent to
3297	* no_space_stripes list.
3298	*
3299	* 3. during journal failure
3300	* In journal failure, we try to flush all cached data to raid disks
3301	* based on data in stripe cache. The array is read-only to upper
3302	* layers, so we would skip all pending writes.
3303	*
3304	*/
3305	static inline bool delay_towrite(struct r5conf *conf,
3306	struct r5dev *dev,
3307	struct stripe_head_state *s)
3308	{
3309	/ case 1 above /
3310	if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3311	!test_bit(R5_Insync, &dev->flags) && s->injournal)
3312	return true;
3313	/ case 2 above /
3314	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
3315	s->injournal > `0`)
3316	return true;
3317	/ case 3 above /
3318	if (s->log_failed && s->injournal)
3319	return true;
3320	return false;
3321	}
3322
3323	static void
3324	schedule_reconstruction(struct stripe_head sh, struct* stripe_head_state *s,
3325	int rcw, int expand)
3326	{
3327	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
3328	struct r5conf *conf = sh->raid_conf;
3329	int level = conf->level;
3330
3331	if (rcw) {
3332	/*
3333	* In some cases, handle_stripe_dirtying initially decided to
3334	* run rmw and allocates extra page for prexor. However, rcw is
3335	* cheaper later on. We need to free the extra page now,
3336	* because we won't be able to do that in ops_complete_prexor().
3337	*/
3338	r5c_release_extra_page(sh);
3339
3340	for (i = disks; i--; ) {
3341	struct r5dev *dev = &sh->dev[i];
3342
3343	if (dev->towrite && !delay_towrite(conf, dev, s)) {
3344	set_bit(nr: R5_LOCKED, addr: &dev->flags);
3345	set_bit(nr: R5_Wantdrain, addr: &dev->flags);
3346	if (!expand)
3347	clear_bit(nr: R5_UPTODATE, addr: &dev->flags);
3348	s->locked++;
3349	} else if (test_bit(R5_InJournal, &dev->flags)) {
3350	set_bit(nr: R5_LOCKED, addr: &dev->flags);
3351	s->locked++;
3352	}
3353	}
3354	/ if we are not expanding this is a proper write request, and*
3355	* there will be bios with new data to be drained into the
3356	* stripe cache
3357	*/
3358	if (!expand) {
3359	if (!s->locked)
3360	/ False alarm, nothing to do /
3361	return;
3362	sh->reconstruct_state = reconstruct_state_drain_run;
3363	set_bit(nr: STRIPE_OP_BIODRAIN, addr: &s->ops_request);
3364	} else
3365	sh->reconstruct_state = reconstruct_state_run;
3366
3367	set_bit(nr: STRIPE_OP_RECONSTRUCT, addr: &s->ops_request);
3368
3369	if (s->locked + conf->max_degraded == disks)
3370	if (!test_and_set_bit(nr: STRIPE_FULL_WRITE, addr: &sh->state))
3371	atomic_inc(v: &conf->pending_full_writes);
3372	} else {
3373	BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) \|\|
3374	test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
3375	BUG_ON(level == `6` &&
3376	(!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) \|\|
3377	test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
3378
3379	for (i = disks; i--; ) {
3380	struct r5dev *dev = &sh->dev[i];
3381	if (i == pd_idx \|\| i == qd_idx)
3382	continue;
3383
3384	if (dev->towrite &&
3385	(test_bit(R5_UPTODATE, &dev->flags) \|\|
3386	test_bit(R5_Wantcompute, &dev->flags))) {
3387	set_bit(nr: R5_Wantdrain, addr: &dev->flags);
3388	set_bit(nr: R5_LOCKED, addr: &dev->flags);
3389	clear_bit(nr: R5_UPTODATE, addr: &dev->flags);
3390	s->locked++;
3391	} else if (test_bit(R5_InJournal, &dev->flags)) {
3392	set_bit(nr: R5_LOCKED, addr: &dev->flags);
3393	s->locked++;
3394	}
3395	}
3396	if (!s->locked)
3397	/ False alarm - nothing to do /
3398	return;
3399	sh->reconstruct_state = reconstruct_state_prexor_drain_run;
3400	set_bit(nr: STRIPE_OP_PREXOR, addr: &s->ops_request);
3401	set_bit(nr: STRIPE_OP_BIODRAIN, addr: &s->ops_request);
3402	set_bit(nr: STRIPE_OP_RECONSTRUCT, addr: &s->ops_request);
3403	}
3404
3405	/ keep the parity disk(s) locked while asynchronous operations*
3406	* are in flight
3407	*/
3408	set_bit(nr: R5_LOCKED, addr: &sh->dev[pd_idx].flags);
3409	clear_bit(nr: R5_UPTODATE, addr: &sh->dev[pd_idx].flags);
3410	s->locked++;
3411
3412	if (level == `6`) {
3413	int qd_idx = sh->qd_idx;
3414	struct r5dev *dev = &sh->dev[qd_idx];
3415
3416	set_bit(nr: R5_LOCKED, addr: &dev->flags);
3417	clear_bit(nr: R5_UPTODATE, addr: &dev->flags);
3418	s->locked++;
3419	}
3420
3421	if (raid5_has_ppl(conf: sh->raid_conf) && sh->ppl_page &&
3422	test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
3423	!test_bit(STRIPE_FULL_WRITE, &sh->state) &&
3424	test_bit(R5_Insync, &sh->dev[pd_idx].flags))
3425	set_bit(nr: STRIPE_OP_PARTIAL_PARITY, addr: &s->ops_request);
3426
3427	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
3428	__func__, (unsigned long long)sh->sector,
3429	s->locked, s->ops_request);
3430	}
3431
3432	static bool stripe_bio_overlaps(struct stripe_head sh, struct* bio *bi,
3433	int dd_idx, int forwrite)
3434	{
3435	struct r5conf *conf = sh->raid_conf;
3436	struct bio **bip;
3437
3438	pr_debug("checking bi b#%llu to stripe s#%llu\n",
3439	bi->bi_iter.bi_sector, sh->sector);
3440
3441	/ Don't allow new IO added to stripes in batch list /
3442	if (sh->batch_head)
3443	return true;
3444
3445	if (forwrite)
3446	bip = &sh->dev[dd_idx].towrite;
3447	else
3448	bip = &sh->dev[dd_idx].toread;
3449
3450	while (bip && (bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
3451	if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
3452	return true;
3453	bip = &(*bip)->bi_next;
3454	}
3455
3456	if (bip && (bip)->bi_iter.bi_sector < bio_end_sector(bi))
3457	return true;
3458
3459	if (forwrite && raid5_has_ppl(conf)) {
3460	/*
3461	* With PPL only writes to consecutive data chunks within a
3462	* stripe are allowed because for a single stripe_head we can
3463	* only have one PPL entry at a time, which describes one data
3464	* range. Not really an overlap, but R5_Overlap can be
3465	* used to handle this.
3466	*/
3467	sector_t sector;
3468	sector_t first = `0`;
3469	sector_t last = `0`;
3470	int count = `0`;
3471	int i;
3472
3473	for (i = `0`; i < sh->disks; i++) {
3474	if (i != sh->pd_idx &&
3475	(i == dd_idx \|\| sh->dev[i].towrite)) {
3476	sector = sh->dev[i].sector;
3477	if (count == `0` \|\| sector < first)
3478	first = sector;
3479	if (sector > last)
3480	last = sector;
3481	count++;
3482	}
3483	}
3484
3485	if (first + conf->chunk_sectors * (count - `1`) != last)
3486	return true;
3487	}
3488
3489	return false;
3490	}
3491
3492	static void __add_stripe_bio(struct stripe_head sh, struct* bio *bi,
3493	int dd_idx, int forwrite, int previous)
3494	{
3495	struct r5conf *conf = sh->raid_conf;
3496	struct bio **bip;
3497	int firstwrite = `0`;
3498
3499	if (forwrite) {
3500	bip = &sh->dev[dd_idx].towrite;
3501	if (!*bip)
3502	firstwrite = `1`;
3503	} else {
3504	bip = &sh->dev[dd_idx].toread;
3505	}
3506
3507	while (bip && (bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector)
3508	bip = &(*bip)->bi_next;
3509
3510	if (!forwrite \|\| previous)
3511	clear_bit(nr: STRIPE_BATCH_READY, addr: &sh->state);
3512
3513	BUG_ON(bip && bi->bi_next && (bip) != bi->bi_next);
3514	if (*bip)
3515	bi->bi_next = *bip;
3516	*bip = bi;
3517	bio_inc_remaining(bio: bi);
3518	md_write_inc(mddev: conf->mddev, bi);
3519
3520	if (forwrite) {
3521	/ check if page is covered /
3522	sector_t sector = sh->dev[dd_idx].sector;
3523	for (bi=sh->dev[dd_idx].towrite;
3524	sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) &&
3525	bi && bi->bi_iter.bi_sector <= sector;
3526	bi = r5_next_bio(conf, bio: bi, sector: sh->dev[dd_idx].sector)) {
3527	if (bio_end_sector(bi) >= sector)
3528	sector = bio_end_sector(bi);
3529	}
3530	if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf))
3531	if (!test_and_set_bit(nr: R5_OVERWRITE, addr: &sh->dev[dd_idx].flags))
3532	sh->overwrite_disks++;
3533	}
3534
3535	pr_debug("added bi b#%llu to stripe s#%llu, disk %d, logical %llu\n",
3536	(*bip)->bi_iter.bi_sector, sh->sector, dd_idx,
3537	sh->dev[dd_idx].sector);
3538
3539	if (conf->mddev->bitmap && firstwrite && !sh->batch_head) {
3540	sh->bm_seq = conf->seq_flush+`1`;
3541	set_bit(nr: STRIPE_BIT_DELAY, addr: &sh->state);
3542	}
3543	}
3544
3545	/*
3546	* Each stripe/dev can have one or more bios attached.
3547	* toread/towrite point to the first in a chain.
3548	* The bi_next chain must be in order.
3549	*/
3550	static bool add_stripe_bio(struct stripe_head sh, struct* bio *bi,
3551	int dd_idx, int forwrite, int previous)
3552	{
3553	spin_lock_irq(lock: &sh->stripe_lock);
3554
3555	if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
3556	set_bit(nr: R5_Overlap, addr: &sh->dev[dd_idx].flags);
3557	spin_unlock_irq(lock: &sh->stripe_lock);
3558	return false;
3559	}
3560
3561	__add_stripe_bio(sh, bi, dd_idx, forwrite, previous);
3562	spin_unlock_irq(lock: &sh->stripe_lock);
3563	return true;
3564	}
3565
3566	static void end_reshape(struct r5conf *conf);
3567
3568	static void stripe_set_idx(sector_t stripe, struct r5conf conf, int* previous,
3569	struct stripe_head *sh)
3570	{
3571	int sectors_per_chunk =
3572	previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
3573	int dd_idx;
3574	int chunk_offset = sector_div(stripe, sectors_per_chunk);
3575	int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
3576
3577	raid5_compute_sector(conf,
3578	r_sector: stripe * (disks - conf->max_degraded)
3579	*sectors_per_chunk + chunk_offset,
3580	previous,
3581	dd_idx: &dd_idx, sh);
3582	}
3583
3584	static void
3585	handle_failed_stripe(struct r5conf conf, struct* stripe_head *sh,
3586	struct stripe_head_state s, int* disks)
3587	{
3588	int i;
3589	BUG_ON(sh->batch_head);
3590	for (i = disks; i--; ) {
3591	struct bio *bi;
3592
3593	if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
3594	struct md_rdev *rdev = conf->disks[i].rdev;
3595
3596	if (rdev && test_bit(In_sync, &rdev->flags) &&
3597	!test_bit(Faulty, &rdev->flags))
3598	atomic_inc(v: &rdev->nr_pending);
3599	else
3600	rdev = NULL;
3601	if (rdev) {
3602	if (!rdev_set_badblocks(
3603	rdev,
3604	s: sh->sector,
3605	RAID5_STRIPE_SECTORS(conf), is_new: `0`))
3606	md_error(mddev: conf->mddev, rdev);
3607	rdev_dec_pending(rdev, mddev: conf->mddev);
3608	}
3609	}
3610	spin_lock_irq(lock: &sh->stripe_lock);
3611	/ fail all writes first /
3612	bi = sh->dev[i].towrite;
3613	sh->dev[i].towrite = NULL;
3614	sh->overwrite_disks = `0`;
3615	spin_unlock_irq(lock: &sh->stripe_lock);
3616
3617	log_stripe_write_finished(sh);
3618
3619	if (test_and_clear_bit(nr: R5_Overlap, addr: &sh->dev[i].flags))
3620	wake_up_bit(word: &sh->dev[i].flags, bit: R5_Overlap);
3621
3622	while (bi && bi->bi_iter.bi_sector <
3623	sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3624	struct bio *nextbi = r5_next_bio(conf, bio: bi, sector: sh->dev[i].sector);
3625
3626	md_write_end(mddev: conf->mddev);
3627	bio_io_error(bio: bi);
3628	bi = nextbi;
3629	}
3630	/ and fail all 'written' /
3631	bi = sh->dev[i].written;
3632	sh->dev[i].written = NULL;
3633	if (test_and_clear_bit(nr: R5_SkipCopy, addr: &sh->dev[i].flags)) {
3634	WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
3635	sh->dev[i].page = sh->dev[i].orig_page;
3636	}
3637
3638	while (bi && bi->bi_iter.bi_sector <
3639	sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3640	struct bio *bi2 = r5_next_bio(conf, bio: bi, sector: sh->dev[i].sector);
3641
3642	md_write_end(mddev: conf->mddev);
3643	bio_io_error(bio: bi);
3644	bi = bi2;
3645	}
3646
3647	/ fail any reads if this device is non-operational and*
3648	* the data has not reached the cache yet.
3649	*/
3650	if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
3651	s->failed > conf->max_degraded &&
3652	(!test_bit(R5_Insync, &sh->dev[i].flags) \|\|
3653	test_bit(R5_ReadError, &sh->dev[i].flags))) {
3654	spin_lock_irq(lock: &sh->stripe_lock);
3655	bi = sh->dev[i].toread;
3656	sh->dev[i].toread = NULL;
3657	spin_unlock_irq(lock: &sh->stripe_lock);
3658	if (test_and_clear_bit(nr: R5_Overlap, addr: &sh->dev[i].flags))
3659	wake_up_bit(word: &sh->dev[i].flags, bit: R5_Overlap);
3660	if (bi)
3661	s->to_read--;
3662	while (bi && bi->bi_iter.bi_sector <
3663	sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3664	struct bio *nextbi =
3665	r5_next_bio(conf, bio: bi, sector: sh->dev[i].sector);
3666
3667	bio_io_error(bio: bi);
3668	bi = nextbi;
3669	}
3670	}
3671	/ If we were in the middle of a write the parity block might*
3672	* still be locked - so just clear all R5_LOCKED flags
3673	*/
3674	clear_bit(nr: R5_LOCKED, addr: &sh->dev[i].flags);
3675	}
3676	s->to_write = `0`;
3677	s->written = `0`;
3678
3679	if (test_and_clear_bit(nr: STRIPE_FULL_WRITE, addr: &sh->state))
3680	if (atomic_dec_and_test(v: &conf->pending_full_writes))
3681	md_wakeup_thread(conf->mddev->thread);
3682	}
3683
3684	static void
3685	handle_failed_sync(struct r5conf conf, struct* stripe_head *sh,
3686	struct stripe_head_state *s)
3687	{
3688	int abort = `0`;
3689	int i;
3690
3691	BUG_ON(sh->batch_head);
3692	clear_bit(nr: STRIPE_SYNCING, addr: &sh->state);
3693	if (test_and_clear_bit(nr: R5_Overlap, addr: &sh->dev[sh->pd_idx].flags))
3694	wake_up_bit(word: &sh->dev[sh->pd_idx].flags, bit: R5_Overlap);
3695	s->syncing = `0`;
3696	s->replacing = `0`;
3697	/ There is nothing more to do for sync/check/repair.*
3698	* Don't even need to abort as that is handled elsewhere
3699	* if needed, and not always wanted e.g. if there is a known
3700	* bad block here.
3701	* For recover/replace we need to record a bad block on all
3702	* non-sync devices, or abort the recovery
3703	*/
3704	if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
3705	/ During recovery devices cannot be removed, so*
3706	* locking and refcounting of rdevs is not needed
3707	*/
3708	for (i = `0`; i < conf->raid_disks; i++) {
3709	struct md_rdev *rdev = conf->disks[i].rdev;
3710
3711	if (rdev
3712	&& !test_bit(Faulty, &rdev->flags)
3713	&& !test_bit(In_sync, &rdev->flags)
3714	&& !rdev_set_badblocks(rdev, s: sh->sector,
3715	RAID5_STRIPE_SECTORS(conf), is_new: `0`))
3716	abort = `1`;
3717	rdev = conf->disks[i].replacement;
3718
3719	if (rdev
3720	&& !test_bit(Faulty, &rdev->flags)
3721	&& !test_bit(In_sync, &rdev->flags)
3722	&& !rdev_set_badblocks(rdev, s: sh->sector,
3723	RAID5_STRIPE_SECTORS(conf), is_new: `0`))
3724	abort = `1`;
3725	}
3726	if (abort)
3727	conf->recovery_disabled =
3728	conf->mddev->recovery_disabled;
3729	}
3730	md_done_sync(mddev: conf->mddev, RAID5_STRIPE_SECTORS(conf), ok: !abort);
3731	}
3732
3733	static int want_replace(struct stripe_head sh, int* disk_idx)
3734	{
3735	struct md_rdev *rdev;
3736	int rv = `0`;
3737
3738	rdev = sh->raid_conf->disks[disk_idx].replacement;
3739	if (rdev
3740	&& !test_bit(Faulty, &rdev->flags)
3741	&& !test_bit(In_sync, &rdev->flags)
3742	&& (rdev->recovery_offset <= sh->sector
3743	\|\| rdev->mddev->resync_offset <= sh->sector))
3744	rv = `1`;
3745	return rv;
3746	}
3747
3748	static int need_this_block(struct stripe_head sh, struct* stripe_head_state *s,
3749	int disk_idx, int disks)
3750	{
3751	struct r5dev *dev = &sh->dev[disk_idx];
3752	struct r5dev *fdev[`2`] = { &sh->dev[s->failed_num[`0`]],
3753	&sh->dev[s->failed_num[`1`]] };
3754	int i;
3755	bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
3756
3757
3758	if (test_bit(R5_LOCKED, &dev->flags) \|\|
3759	test_bit(R5_UPTODATE, &dev->flags))
3760	/ No point reading this as we already have it or have*
3761	* decided to get it.
3762	*/
3763	return `0`;
3764
3765	if (dev->toread \|\|
3766	(dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
3767	/ We need this block to directly satisfy a request /
3768	return `1`;
3769
3770	if (s->syncing \|\| s->expanding \|\|
3771	(s->replacing && want_replace(sh, disk_idx)))
3772	/ When syncing, or expanding we read everything.*
3773	* When replacing, we need the replaced block.
3774	*/
3775	return `1`;
3776
3777	if ((s->failed >= `1` && fdev[`0`]->toread) \|\|
3778	(s->failed >= `2` && fdev[`1`]->toread))
3779	/ If we want to read from a failed device, then*
3780	* we need to actually read every other device.
3781	*/
3782	return `1`;
3783
3784	/ Sometimes neither read-modify-write nor reconstruct-write*
3785	* cycles can work. In those cases we read every block we
3786	* can. Then the parity-update is certain to have enough to
3787	* work with.
3788	* This can only be a problem when we need to write something,
3789	* and some device has failed. If either of those tests
3790	* fail we need look no further.
3791	*/
3792	if (!s->failed \|\| !s->to_write)
3793	return `0`;
3794
3795	if (test_bit(R5_Insync, &dev->flags) &&
3796	!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3797	/ Pre-reads at not permitted until after short delay*
3798	* to gather multiple requests. However if this
3799	* device is no Insync, the block could only be computed
3800	* and there is no need to delay that.
3801	*/
3802	return `0`;
3803
3804	for (i = `0`; i < s->failed && i < `2`; i++) {
3805	if (fdev[i]->towrite &&
3806	!test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3807	!test_bit(R5_OVERWRITE, &fdev[i]->flags))
3808	/ If we have a partial write to a failed*
3809	* device, then we will need to reconstruct
3810	* the content of that device, so all other
3811	* devices must be read.
3812	*/
3813	return `1`;
3814
3815	if (s->failed >= `2` &&
3816	(fdev[i]->towrite \|\|
3817	s->failed_num[i] == sh->pd_idx \|\|
3818	s->failed_num[i] == sh->qd_idx) &&
3819	!test_bit(R5_UPTODATE, &fdev[i]->flags))
3820	/ In max degraded raid6, If the failed disk is P, Q,*
3821	* or we want to read the failed disk, we need to do
3822	* reconstruct-write.
3823	*/
3824	force_rcw = true;
3825	}
3826
3827	/ If we are forced to do a reconstruct-write, because parity*
3828	* cannot be trusted and we are currently recovering it, there
3829	* is extra need to be careful.
3830	* If one of the devices that we would need to read, because
3831	* it is not being overwritten (and maybe not written at all)
3832	* is missing/faulty, then we need to read everything we can.
3833	*/
3834	if (!force_rcw &&
3835	sh->sector < sh->raid_conf->mddev->resync_offset)
3836	/ reconstruct-write isn't being forced /
3837	return `0`;
3838	for (i = `0`; i < s->failed && i < `2`; i++) {
3839	if (s->failed_num[i] != sh->pd_idx &&
3840	s->failed_num[i] != sh->qd_idx &&
3841	!test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3842	!test_bit(R5_OVERWRITE, &fdev[i]->flags))
3843	return `1`;
3844	}
3845
3846	return `0`;
3847	}
3848
3849	/ fetch_block - checks the given member device to see if its data needs*
3850	* to be read or computed to satisfy a request.
3851	*
3852	* Returns 1 when no more member devices need to be checked, otherwise returns
3853	* 0 to tell the loop in handle_stripe_fill to continue
3854	*/
3855	static int fetch_block(struct stripe_head sh, struct* stripe_head_state *s,
3856	int disk_idx, int disks)
3857	{
3858	struct r5dev *dev = &sh->dev[disk_idx];
3859
3860	/ is the data in this block needed, and can we get it? /
3861	if (need_this_block(sh, s, disk_idx, disks)) {
3862	/ we would like to get this block, possibly by computing it,*
3863	* otherwise read it if the backing disk is insync
3864	*/
3865	BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
3866	BUG_ON(test_bit(R5_Wantread, &dev->flags));
3867	BUG_ON(sh->batch_head);
3868
3869	/*
3870	* In the raid6 case if the only non-uptodate disk is P
3871	* then we already trusted P to compute the other failed
3872	* drives. It is safe to compute rather than re-read P.
3873	* In other cases we only compute blocks from failed
3874	* devices, otherwise check/repair might fail to detect
3875	* a real inconsistency.
3876	*/
3877
3878	if ((s->uptodate == disks - `1`) &&
3879	((sh->qd_idx >= `0` && sh->pd_idx == disk_idx) \|\|
3880	(s->failed && (disk_idx == s->failed_num[`0`] \|\|
3881	disk_idx == s->failed_num[`1`])))) {
3882	/ have disk failed, and we're requested to fetch it;*
3883	* do compute it
3884	*/
3885	pr_debug("Computing stripe %llu block %d\n",
3886	(unsigned long long)sh->sector, disk_idx);
3887	set_bit(nr: STRIPE_COMPUTE_RUN, addr: &sh->state);
3888	set_bit(nr: STRIPE_OP_COMPUTE_BLK, addr: &s->ops_request);
3889	set_bit(nr: R5_Wantcompute, addr: &dev->flags);
3890	sh->ops.target = disk_idx;
3891	sh->ops.target2 = -`1`; / no 2nd target /
3892	s->req_compute = `1`;
3893	/ Careful: from this point on 'uptodate' is in the eye*
3894	* of raid_run_ops which services 'compute' operations
3895	* before writes. R5_Wantcompute flags a block that will
3896	* be R5_UPTODATE by the time it is needed for a
3897	* subsequent operation.
3898	*/
3899	s->uptodate++;
3900	return `1`;
3901	} else if (s->uptodate == disks-`2` && s->failed >= `2`) {
3902	/ Computing 2-failure is very expensive; only*
3903	* do it if failed >= 2
3904	*/
3905	int other;
3906	for (other = disks; other--; ) {
3907	if (other == disk_idx)
3908	continue;
3909	if (!test_bit(R5_UPTODATE,
3910	&sh->dev[other].flags))
3911	break;
3912	}
3913	BUG_ON(other < `0`);
3914	pr_debug("Computing stripe %llu blocks %d,%d\n",
3915	(unsigned long long)sh->sector,
3916	disk_idx, other);
3917	set_bit(nr: STRIPE_COMPUTE_RUN, addr: &sh->state);
3918	set_bit(nr: STRIPE_OP_COMPUTE_BLK, addr: &s->ops_request);
3919	set_bit(nr: R5_Wantcompute, addr: &sh->dev[disk_idx].flags);
3920	set_bit(nr: R5_Wantcompute, addr: &sh->dev[other].flags);
3921	sh->ops.target = disk_idx;
3922	sh->ops.target2 = other;
3923	s->uptodate += `2`;
3924	s->req_compute = `1`;
3925	return `1`;
3926	} else if (test_bit(R5_Insync, &dev->flags)) {
3927	set_bit(nr: R5_LOCKED, addr: &dev->flags);
3928	set_bit(nr: R5_Wantread, addr: &dev->flags);
3929	s->locked++;
3930	pr_debug("Reading block %d (sync=%d)\n",
3931	disk_idx, s->syncing);
3932	}
3933	}
3934
3935	return `0`;
3936	}
3937
3938	/*
3939	* handle_stripe_fill - read or compute data to satisfy pending requests.
3940	*/
3941	static void handle_stripe_fill(struct stripe_head *sh,
3942	struct stripe_head_state *s,
3943	int disks)
3944	{
3945	int i;
3946
3947	/ look for blocks to read/compute, skip this if a compute*
3948	* is already in flight, or if the stripe contents are in the
3949	* midst of changing due to a write
3950	*/
3951	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
3952	!sh->reconstruct_state) {
3953
3954	/*
3955	* For degraded stripe with data in journal, do not handle
3956	* read requests yet, instead, flush the stripe to raid
3957	* disks first, this avoids handling complex rmw of write
3958	* back cache (prexor with orig_page, and then xor with
3959	* page) in the read path
3960	*/
3961	if (s->to_read && s->injournal && s->failed) {
3962	if (test_bit(STRIPE_R5C_CACHING, &sh->state))
3963	r5c_make_stripe_write_out(sh);
3964	goto out;
3965	}
3966
3967	for (i = disks; i--; )
3968	if (fetch_block(sh, s, disk_idx: i, disks))
3969	break;
3970	}
3971	out:
3972	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
3973	}
3974
3975	static void break_stripe_batch_list(struct stripe_head *head_sh,
3976	unsigned long handle_flags);
3977	/ handle_stripe_clean_event*
3978	* any written block on an uptodate or failed drive can be returned.
3979	* Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
3980	* never LOCKED, so we don't need to test 'failed' directly.
3981	*/
3982	static void handle_stripe_clean_event(struct r5conf *conf,
3983	struct stripe_head sh, int* disks)
3984	{
3985	int i;
3986	struct r5dev *dev;
3987	int discard_pending = `0`;
3988	struct stripe_head *head_sh = sh;
3989	bool do_endio = false;
3990
3991	for (i = disks; i--; )
3992	if (sh->dev[i].written) {
3993	dev = &sh->dev[i];
3994	if (!test_bit(R5_LOCKED, &dev->flags) &&
3995	(test_bit(R5_UPTODATE, &dev->flags) \|\|
3996	test_bit(R5_Discard, &dev->flags) \|\|
3997	test_bit(R5_SkipCopy, &dev->flags))) {
3998	/ We can return any write requests /
3999	struct bio wbi, wbi2;
4000	pr_debug("Return write for disc %d\n", i);
4001	if (test_and_clear_bit(nr: R5_Discard, addr: &dev->flags))
4002	clear_bit(nr: R5_UPTODATE, addr: &dev->flags);
4003	if (test_and_clear_bit(nr: R5_SkipCopy, addr: &dev->flags)) {
4004	WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
4005	}
4006	do_endio = true;
4007
4008	returnbi:
4009	dev->page = dev->orig_page;
4010	wbi = dev->written;
4011	dev->written = NULL;
4012	while (wbi && wbi->bi_iter.bi_sector <
4013	dev->sector + RAID5_STRIPE_SECTORS(conf)) {
4014	wbi2 = r5_next_bio(conf, bio: wbi, sector: dev->sector);
4015	md_write_end(mddev: conf->mddev);
4016	bio_endio(wbi);
4017	wbi = wbi2;
4018	}
4019
4020	if (head_sh->batch_head) {
4021	sh = list_first_entry(&sh->batch_list,
4022	struct stripe_head,
4023	batch_list);
4024	if (sh != head_sh) {
4025	dev = &sh->dev[i];
4026	goto returnbi;
4027	}
4028	}
4029	sh = head_sh;
4030	dev = &sh->dev[i];
4031	} else if (test_bit(R5_Discard, &dev->flags))
4032	discard_pending = `1`;
4033	}
4034
4035	log_stripe_write_finished(sh);
4036
4037	if (!discard_pending &&
4038	test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
4039	int hash;
4040	clear_bit(nr: R5_Discard, addr: &sh->dev[sh->pd_idx].flags);
4041	clear_bit(nr: R5_UPTODATE, addr: &sh->dev[sh->pd_idx].flags);
4042	if (sh->qd_idx >= `0`) {
4043	clear_bit(nr: R5_Discard, addr: &sh->dev[sh->qd_idx].flags);
4044	clear_bit(nr: R5_UPTODATE, addr: &sh->dev[sh->qd_idx].flags);
4045	}
4046	/ now that discard is done we can proceed with any sync /
4047	clear_bit(nr: STRIPE_DISCARD, addr: &sh->state);
4048	/*
4049	* SCSI discard will change some bio fields and the stripe has
4050	* no updated data, so remove it from hash list and the stripe
4051	* will be reinitialized
4052	*/
4053	unhash:
4054	hash = sh->hash_lock_index;
4055	spin_lock_irq(lock: conf->hash_locks + hash);
4056	remove_hash(sh);
4057	spin_unlock_irq(lock: conf->hash_locks + hash);
4058	if (head_sh->batch_head) {
4059	sh = list_first_entry(&sh->batch_list,
4060	struct stripe_head, batch_list);
4061	if (sh != head_sh)
4062	goto unhash;
4063	}
4064	sh = head_sh;
4065
4066	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
4067	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4068
4069	}
4070
4071	if (test_and_clear_bit(nr: STRIPE_FULL_WRITE, addr: &sh->state))
4072	if (atomic_dec_and_test(v: &conf->pending_full_writes))
4073	md_wakeup_thread(conf->mddev->thread);
4074
4075	if (head_sh->batch_head && do_endio)
4076	break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
4077	}
4078
4079	/*
4080	* For RMW in write back cache, we need extra page in prexor to store the
4081	* old data. This page is stored in dev->orig_page.
4082	*
4083	* This function checks whether we have data for prexor. The exact logic
4084	* is:
4085	* R5_UPTODATE && (!R5_InJournal \|\| R5_OrigPageUPTDODATE)
4086	*/
4087	static inline bool uptodate_for_rmw(struct r5dev *dev)
4088	{
4089	return (test_bit(R5_UPTODATE, &dev->flags)) &&
4090	(!test_bit(R5_InJournal, &dev->flags) \|\|
4091	test_bit(R5_OrigPageUPTDODATE, &dev->flags));
4092	}
4093
4094	static int handle_stripe_dirtying(struct r5conf *conf,
4095	struct stripe_head *sh,
4096	struct stripe_head_state *s,
4097	int disks)
4098	{
4099	int rmw = `0`, rcw = `0`, i;
4100	struct mddev *mddev = conf->mddev;
4101	sector_t resync_offset = mddev->resync_offset;
4102
4103	/ Check whether resync is now happening or should start.*
4104	* If yes, then the array is dirty (after unclean shutdown or
4105	* initial creation), so parity in some stripes might be inconsistent.
4106	* In this case, we need to always do reconstruct-write, to ensure
4107	* that in case of drive failure or read-error correction, we
4108	* generate correct data from the parity.
4109	*/
4110	if (conf->rmw_level == PARITY_DISABLE_RMW \|\|
4111	(resync_offset < MaxSector && sh->sector >= resync_offset &&
4112	s->failed == `0`)) {
4113	/ Calculate the real rcw later - for now make it*
4114	* look like rcw is cheaper
4115	*/
4116	rcw = `1`; rmw = `2`;
4117	pr_debug("force RCW rmw_level=%u, resync_offset=%llu sh->sector=%llu\n",
4118	conf->rmw_level, (unsigned long long)resync_offset,
4119	(unsigned long long)sh->sector);
4120	} else if (mddev->bitmap_ops && mddev->bitmap_ops->blocks_synced &&
4121	!mddev->bitmap_ops->blocks_synced(mddev, sh->sector)) {
4122	/ The initial recover is not done, must read everything /
4123	rcw = `1`; rmw = `2`;
4124	pr_debug("force RCW by lazy recovery, sh->sector=%llu\n",
4125	sh->sector);
4126	} else for (i = disks; i--; ) {
4127	/ would I have to read this buffer for read_modify_write /
4128	struct r5dev *dev = &sh->dev[i];
4129	if (((dev->towrite && !delay_towrite(conf, dev, s)) \|\|
4130	i == sh->pd_idx \|\| i == sh->qd_idx \|\|
4131	test_bit(R5_InJournal, &dev->flags)) &&
4132	!test_bit(R5_LOCKED, &dev->flags) &&
4133	!(uptodate_for_rmw(dev) \|\|
4134	test_bit(R5_Wantcompute, &dev->flags))) {
4135	if (test_bit(R5_Insync, &dev->flags))
4136	rmw++;
4137	else
4138	rmw += `2`disks; /* cannot read it /
4139	}
4140	/ Would I have to read this buffer for reconstruct_write /
4141	if (!test_bit(R5_OVERWRITE, &dev->flags) &&
4142	i != sh->pd_idx && i != sh->qd_idx &&
4143	!test_bit(R5_LOCKED, &dev->flags) &&
4144	!(test_bit(R5_UPTODATE, &dev->flags) \|\|
4145	test_bit(R5_Wantcompute, &dev->flags))) {
4146	if (test_bit(R5_Insync, &dev->flags))
4147	rcw++;
4148	else
4149	rcw += `2`*disks;
4150	}
4151	}
4152
4153	pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
4154	(unsigned long long)sh->sector, sh->state, rmw, rcw);
4155	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4156	if ((rmw < rcw \|\| (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > `0`) {
4157	/ prefer read-modify-write, but need to get some data /
4158	mddev_add_trace_msg(mddev, "raid5 rmw %llu %d",
4159	sh->sector, rmw);
4160
4161	for (i = disks; i--; ) {
4162	struct r5dev *dev = &sh->dev[i];
4163	if (test_bit(R5_InJournal, &dev->flags) &&
4164	dev->page == dev->orig_page &&
4165	!test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
4166	/ alloc page for prexor /
4167	struct page *p = alloc_page(GFP_NOIO);
4168
4169	if (p) {
4170	dev->orig_page = p;
4171	continue;
4172	}
4173
4174	/*
4175	* alloc_page() failed, try use
4176	* disk_info->extra_page
4177	*/
4178	if (!test_and_set_bit(nr: R5C_EXTRA_PAGE_IN_USE,
4179	addr: &conf->cache_state)) {
4180	r5c_use_extra_page(sh);
4181	break;
4182	}
4183
4184	/ extra_page in use, add to delayed_list /
4185	set_bit(nr: STRIPE_DELAYED, addr: &sh->state);
4186	s->waiting_extra_page = `1`;
4187	return -EAGAIN;
4188	}
4189	}
4190
4191	for (i = disks; i--; ) {
4192	struct r5dev *dev = &sh->dev[i];
4193	if (((dev->towrite && !delay_towrite(conf, dev, s)) \|\|
4194	i == sh->pd_idx \|\| i == sh->qd_idx \|\|
4195	test_bit(R5_InJournal, &dev->flags)) &&
4196	!test_bit(R5_LOCKED, &dev->flags) &&
4197	!(uptodate_for_rmw(dev) \|\|
4198	test_bit(R5_Wantcompute, &dev->flags)) &&
4199	test_bit(R5_Insync, &dev->flags)) {
4200	if (test_bit(STRIPE_PREREAD_ACTIVE,
4201	&sh->state)) {
4202	pr_debug("Read_old block %d for r-m-w\n",
4203	i);
4204	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4205	set_bit(nr: R5_Wantread, addr: &dev->flags);
4206	s->locked++;
4207	} else
4208	set_bit(nr: STRIPE_DELAYED, addr: &sh->state);
4209	}
4210	}
4211	}
4212	if ((rcw < rmw \|\| (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > `0`) {
4213	/ want reconstruct write, but need to get some data /
4214	int qread =`0`;
4215	rcw = `0`;
4216	for (i = disks; i--; ) {
4217	struct r5dev *dev = &sh->dev[i];
4218	if (!test_bit(R5_OVERWRITE, &dev->flags) &&
4219	i != sh->pd_idx && i != sh->qd_idx &&
4220	!test_bit(R5_LOCKED, &dev->flags) &&
4221	!(test_bit(R5_UPTODATE, &dev->flags) \|\|
4222	test_bit(R5_Wantcompute, &dev->flags))) {
4223	rcw++;
4224	if (test_bit(R5_Insync, &dev->flags) &&
4225	test_bit(STRIPE_PREREAD_ACTIVE,
4226	&sh->state)) {
4227	pr_debug("Read_old block "
4228	"%d for Reconstruct\n", i);
4229	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4230	set_bit(nr: R5_Wantread, addr: &dev->flags);
4231	s->locked++;
4232	qread++;
4233	} else
4234	set_bit(nr: STRIPE_DELAYED, addr: &sh->state);
4235	}
4236	}
4237	if (rcw && !mddev_is_dm(mddev))
4238	blk_add_trace_msg(mddev->gendisk->queue,
4239	"raid5 rcw %llu %d %d %d",
4240	(unsigned long long)sh->sector, rcw, qread,
4241	test_bit(STRIPE_DELAYED, &sh->state));
4242	}
4243
4244	if (rcw > disks && rmw > disks &&
4245	!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4246	set_bit(nr: STRIPE_DELAYED, addr: &sh->state);
4247
4248	/ now if nothing is locked, and if we have enough data,*
4249	* we can start a write request
4250	*/
4251	/ since handle_stripe can be called at any time we need to handle the*
4252	* case where a compute block operation has been submitted and then a
4253	* subsequent call wants to start a write request. raid_run_ops only
4254	* handles the case where compute block and reconstruct are requested
4255	* simultaneously. If this is not the case then new writes need to be
4256	* held off until the compute completes.
4257	*/
4258	if ((s->req_compute \|\| !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
4259	(s->locked == `0` && (rcw == `0` \|\| rmw == `0`) &&
4260	!test_bit(STRIPE_BIT_DELAY, &sh->state)))
4261	schedule_reconstruction(sh, s, rcw: rcw == `0`, expand: `0`);
4262	return `0`;
4263	}
4264
4265	static void handle_parity_checks5(struct r5conf conf, struct* stripe_head *sh,
4266	struct stripe_head_state s, int* disks)
4267	{
4268	struct r5dev *dev = NULL;
4269
4270	BUG_ON(sh->batch_head);
4271	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4272
4273	switch (sh->check_state) {
4274	case check_state_idle:
4275	/ start a new check operation if there are no failures /
4276	if (s->failed == `0`) {
4277	BUG_ON(s->uptodate != disks);
4278	sh->check_state = check_state_run;
4279	set_bit(nr: STRIPE_OP_CHECK, addr: &s->ops_request);
4280	clear_bit(nr: R5_UPTODATE, addr: &sh->dev[sh->pd_idx].flags);
4281	s->uptodate--;
4282	break;
4283	}
4284	dev = &sh->dev[s->failed_num[`0`]];
4285	fallthrough;
4286	case check_state_compute_result:
4287	sh->check_state = check_state_idle;
4288	if (!dev)
4289	dev = &sh->dev[sh->pd_idx];
4290
4291	/ check that a write has not made the stripe insync /
4292	if (test_bit(STRIPE_INSYNC, &sh->state))
4293	break;
4294
4295	/ either failed parity check, or recovery is happening /
4296	BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
4297	BUG_ON(s->uptodate != disks);
4298
4299	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4300	s->locked++;
4301	set_bit(nr: R5_Wantwrite, addr: &dev->flags);
4302
4303	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
4304	break;
4305	case check_state_run:
4306	break; / we will be called again upon completion /
4307	case check_state_check_result:
4308	sh->check_state = check_state_idle;
4309
4310	/ if a failure occurred during the check operation, leave*
4311	* STRIPE_INSYNC not set and let the stripe be handled again
4312	*/
4313	if (s->failed)
4314	break;
4315
4316	/ handle a successful check operation, if parity is correct*
4317	* we are done. Otherwise update the mismatch count and repair
4318	* parity if !MD_RECOVERY_CHECK
4319	*/
4320	if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == `0`)
4321	/ parity is correct (on disc,*
4322	* not in buffer any more)
4323	*/
4324	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
4325	else {
4326	atomic64_add(RAID5_STRIPE_SECTORS(conf), v: &conf->mddev->resync_mismatches);
4327	if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4328	/ don't try to repair!! /
4329	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
4330	pr_warn_ratelimited("%s: mismatch sector in range "
4331	"%llu-%llu\n", mdname(conf->mddev),
4332	(unsigned long long) sh->sector,
4333	(unsigned long long) sh->sector +
4334	RAID5_STRIPE_SECTORS(conf));
4335	} else {
4336	sh->check_state = check_state_compute_run;
4337	set_bit(nr: STRIPE_COMPUTE_RUN, addr: &sh->state);
4338	set_bit(nr: STRIPE_OP_COMPUTE_BLK, addr: &s->ops_request);
4339	set_bit(nr: R5_Wantcompute,
4340	addr: &sh->dev[sh->pd_idx].flags);
4341	sh->ops.target = sh->pd_idx;
4342	sh->ops.target2 = -`1`;
4343	s->uptodate++;
4344	}
4345	}
4346	break;
4347	case check_state_compute_run:
4348	break;
4349	default:
4350	pr_err("%s: unknown check_state: %d sector: %llu\n",
4351	__func__, sh->check_state,
4352	(unsigned long long) sh->sector);
4353	BUG();
4354	}
4355	}
4356
4357	static void handle_parity_checks6(struct r5conf conf, struct* stripe_head *sh,
4358	struct stripe_head_state *s,
4359	int disks)
4360	{
4361	int pd_idx = sh->pd_idx;
4362	int qd_idx = sh->qd_idx;
4363	struct r5dev *dev;
4364
4365	BUG_ON(sh->batch_head);
4366	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4367
4368	BUG_ON(s->failed > `2`);
4369
4370	/ Want to check and possibly repair P and Q.*
4371	* However there could be one 'failed' device, in which
4372	* case we can only check one of them, possibly using the
4373	* other to generate missing data
4374	*/
4375
4376	switch (sh->check_state) {
4377	case check_state_idle:
4378	/ start a new check operation if there are < 2 failures /
4379	if (s->failed == s->q_failed) {
4380	/ The only possible failed device holds Q, so it*
4381	* makes sense to check P (If anything else were failed,
4382	* we would have used P to recreate it).
4383	*/
4384	sh->check_state = check_state_run;
4385	}
4386	if (!s->q_failed && s->failed < `2`) {
4387	/ Q is not failed, and we didn't use it to generate*
4388	* anything, so it makes sense to check it
4389	*/
4390	if (sh->check_state == check_state_run)
4391	sh->check_state = check_state_run_pq;
4392	else
4393	sh->check_state = check_state_run_q;
4394	}
4395
4396	/ discard potentially stale zero_sum_result /
4397	sh->ops.zero_sum_result = `0`;
4398
4399	if (sh->check_state == check_state_run) {
4400	/ async_xor_zero_sum destroys the contents of P /
4401	clear_bit(nr: R5_UPTODATE, addr: &sh->dev[pd_idx].flags);
4402	s->uptodate--;
4403	}
4404	if (sh->check_state >= check_state_run &&
4405	sh->check_state <= check_state_run_pq) {
4406	/ async_syndrome_zero_sum preserves P and Q, so*
4407	* no need to mark them !uptodate here
4408	*/
4409	set_bit(nr: STRIPE_OP_CHECK, addr: &s->ops_request);
4410	break;
4411	}
4412
4413	/ we have 2-disk failure /
4414	BUG_ON(s->failed != `2`);
4415	fallthrough;
4416	case check_state_compute_result:
4417	sh->check_state = check_state_idle;
4418
4419	/ check that a write has not made the stripe insync /
4420	if (test_bit(STRIPE_INSYNC, &sh->state))
4421	break;
4422
4423	/ now write out any block on a failed drive,*
4424	* or P or Q if they were recomputed
4425	*/
4426	dev = NULL;
4427	if (s->failed == `2`) {
4428	dev = &sh->dev[s->failed_num[`1`]];
4429	s->locked++;
4430	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4431	set_bit(nr: R5_Wantwrite, addr: &dev->flags);
4432	}
4433	if (s->failed >= `1`) {
4434	dev = &sh->dev[s->failed_num[`0`]];
4435	s->locked++;
4436	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4437	set_bit(nr: R5_Wantwrite, addr: &dev->flags);
4438	}
4439	if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4440	dev = &sh->dev[pd_idx];
4441	s->locked++;
4442	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4443	set_bit(nr: R5_Wantwrite, addr: &dev->flags);
4444	}
4445	if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4446	dev = &sh->dev[qd_idx];
4447	s->locked++;
4448	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4449	set_bit(nr: R5_Wantwrite, addr: &dev->flags);
4450	}
4451	if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags),
4452	"%s: disk%td not up to date\n",
4453	mdname(conf->mddev),
4454	dev - (struct r5dev *) &sh->dev)) {
4455	clear_bit(nr: R5_LOCKED, addr: &dev->flags);
4456	clear_bit(nr: R5_Wantwrite, addr: &dev->flags);
4457	s->locked--;
4458	}
4459
4460	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
4461	break;
4462	case check_state_run:
4463	case check_state_run_q:
4464	case check_state_run_pq:
4465	break; / we will be called again upon completion /
4466	case check_state_check_result:
4467	sh->check_state = check_state_idle;
4468
4469	/ handle a successful check operation, if parity is correct*
4470	* we are done. Otherwise update the mismatch count and repair
4471	* parity if !MD_RECOVERY_CHECK
4472	*/
4473	if (sh->ops.zero_sum_result == `0`) {
4474	/ both parities are correct /
4475	if (!s->failed)
4476	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
4477	else {
4478	/ in contrast to the raid5 case we can validate*
4479	* parity, but still have a failure to write
4480	* back
4481	*/
4482	sh->check_state = check_state_compute_result;
4483	/ Returning at this point means that we may go*
4484	* off and bring p and/or q uptodate again so
4485	* we make sure to check zero_sum_result again
4486	* to verify if p or q need writeback
4487	*/
4488	}
4489	} else {
4490	atomic64_add(RAID5_STRIPE_SECTORS(conf), v: &conf->mddev->resync_mismatches);
4491	if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4492	/ don't try to repair!! /
4493	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
4494	pr_warn_ratelimited("%s: mismatch sector in range "
4495	"%llu-%llu\n", mdname(conf->mddev),
4496	(unsigned long long) sh->sector,
4497	(unsigned long long) sh->sector +
4498	RAID5_STRIPE_SECTORS(conf));
4499	} else {
4500	int *target = &sh->ops.target;
4501
4502	sh->ops.target = -`1`;
4503	sh->ops.target2 = -`1`;
4504	sh->check_state = check_state_compute_run;
4505	set_bit(nr: STRIPE_COMPUTE_RUN, addr: &sh->state);
4506	set_bit(nr: STRIPE_OP_COMPUTE_BLK, addr: &s->ops_request);
4507	if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4508	set_bit(nr: R5_Wantcompute,
4509	addr: &sh->dev[pd_idx].flags);
4510	*target = pd_idx;
4511	target = &sh->ops.target2;
4512	s->uptodate++;
4513	}
4514	if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4515	set_bit(nr: R5_Wantcompute,
4516	addr: &sh->dev[qd_idx].flags);
4517	*target = qd_idx;
4518	s->uptodate++;
4519	}
4520	}
4521	}
4522	break;
4523	case check_state_compute_run:
4524	break;
4525	default:
4526	pr_warn("%s: unknown check_state: %d sector: %llu\n",
4527	__func__, sh->check_state,
4528	(unsigned long long) sh->sector);
4529	BUG();
4530	}
4531	}
4532
4533	static void handle_stripe_expansion(struct r5conf conf, struct* stripe_head *sh)
4534	{
4535	int i;
4536
4537	/ We have read all the blocks in this stripe and now we need to*
4538	* copy some of them into a target stripe for expand.
4539	*/
4540	struct dma_async_tx_descriptor *tx = NULL;
4541	BUG_ON(sh->batch_head);
4542	clear_bit(nr: STRIPE_EXPAND_SOURCE, addr: &sh->state);
4543	for (i = `0`; i < sh->disks; i++)
4544	if (i != sh->pd_idx && i != sh->qd_idx) {
4545	int dd_idx, j;
4546	struct stripe_head *sh2;
4547	struct async_submit_ctl submit;
4548
4549	sector_t bn = raid5_compute_blocknr(sh, i, previous: `1`);
4550	sector_t s = raid5_compute_sector(conf, r_sector: bn, previous: `0`,
4551	dd_idx: &dd_idx, NULL);
4552	sh2 = raid5_get_active_stripe(conf, NULL, sector: s,
4553	R5_GAS_NOBLOCK \| R5_GAS_NOQUIESCE);
4554	if (sh2 == NULL)
4555	/ so far only the early blocks of this stripe*
4556	* have been requested. When later blocks
4557	* get requested, we will try again
4558	*/
4559	continue;
4560	if (!test_bit(STRIPE_EXPANDING, &sh2->state) \|\|
4561	test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
4562	/ must have already done this block /
4563	raid5_release_stripe(sh: sh2);
4564	continue;
4565	}
4566
4567	/ place all the copies on one channel /
4568	init_async_submit(args: &submit, flags: `0`, tx, NULL, NULL, NULL);
4569	tx = async_memcpy(dest: sh2->dev[dd_idx].page,
4570	src: sh->dev[i].page, dest_offset: sh2->dev[dd_idx].offset,
4571	src_offset: sh->dev[i].offset, RAID5_STRIPE_SIZE(conf),
4572	submit: &submit);
4573
4574	set_bit(nr: R5_Expanded, addr: &sh2->dev[dd_idx].flags);
4575	set_bit(nr: R5_UPTODATE, addr: &sh2->dev[dd_idx].flags);
4576	for (j = `0`; j < conf->raid_disks; j++)
4577	if (j != sh2->pd_idx &&
4578	j != sh2->qd_idx &&
4579	!test_bit(R5_Expanded, &sh2->dev[j].flags))
4580	break;
4581	if (j == conf->raid_disks) {
4582	set_bit(nr: STRIPE_EXPAND_READY, addr: &sh2->state);
4583	set_bit(nr: STRIPE_HANDLE, addr: &sh2->state);
4584	}
4585	raid5_release_stripe(sh: sh2);
4586
4587	}
4588	/ done submitting copies, wait for them to complete /
4589	async_tx_quiesce(tx: &tx);
4590	}
4591
/*
 * handle_stripe - do things to a stripe.
 *
 * We lock the stripe by setting STRIPE_ACTIVE and then examine the
 * state of various bits to see what needs to be done.
 * Possible results:
 *    return some read requests which now have data
 *    return some write requests which are safely on storage
 *    schedule a read on some buffers
 *    schedule a write of some buffers
 *    return confirmation of parity correctness
 *
 */
4605
4606	static void analyse_stripe(struct stripe_head sh, struct* stripe_head_state *s)
4607	{
4608	struct r5conf *conf = sh->raid_conf;
4609	int disks = sh->disks;
4610	struct r5dev *dev;
4611	int i;
4612	int do_recovery = `0`;
4613
4614	memset(s, `0`, sizeof(*s));
4615
4616	s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
4617	s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
4618	s->failed_num[`0`] = -`1`;
4619	s->failed_num[`1`] = -`1`;
4620	s->log_failed = r5l_log_disk_error(conf);
4621
4622	/ Now to look around and see what can be done /
4623	for (i=disks; i--; ) {
4624	struct md_rdev *rdev;
4625	int is_bad = `0`;
4626
4627	dev = &sh->dev[i];
4628
4629	pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
4630	i, dev->flags,
4631	dev->toread, dev->towrite, dev->written);
4632	/ maybe we can reply to a read*
4633	*
4634	* new wantfill requests are only permitted while
4635	* ops_complete_biofill is guaranteed to be inactive
4636	*/
4637	if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
4638	!test_bit(STRIPE_BIOFILL_RUN, &sh->state))
4639	set_bit(nr: R5_Wantfill, addr: &dev->flags);
4640
4641	/ now count some things /
4642	if (test_bit(R5_LOCKED, &dev->flags))
4643	s->locked++;
4644	if (test_bit(R5_UPTODATE, &dev->flags))
4645	s->uptodate++;
4646	if (test_bit(R5_Wantcompute, &dev->flags)) {
4647	s->compute++;
4648	BUG_ON(s->compute > `2`);
4649	}
4650
4651	if (test_bit(R5_Wantfill, &dev->flags))
4652	s->to_fill++;
4653	else if (dev->toread)
4654	s->to_read++;
4655	if (dev->towrite) {
4656	s->to_write++;
4657	if (!test_bit(R5_OVERWRITE, &dev->flags))
4658	s->non_overwrite++;
4659	}
4660	if (dev->written)
4661	s->written++;
4662	/ Prefer to use the replacement for reads, but only*
4663	* if it is recovered enough and has no bad blocks.
4664	*/
4665	rdev = conf->disks[i].replacement;
4666	if (rdev && !test_bit(Faulty, &rdev->flags) &&
4667	rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
4668	!rdev_has_badblock(rdev, s: sh->sector,
4669	RAID5_STRIPE_SECTORS(conf)))
4670	set_bit(nr: R5_ReadRepl, addr: &dev->flags);
4671	else {
4672	if (rdev && !test_bit(Faulty, &rdev->flags))
4673	set_bit(nr: R5_NeedReplace, addr: &dev->flags);
4674	else
4675	clear_bit(nr: R5_NeedReplace, addr: &dev->flags);
4676	rdev = conf->disks[i].rdev;
4677	clear_bit(nr: R5_ReadRepl, addr: &dev->flags);
4678	}
4679	if (rdev && test_bit(Faulty, &rdev->flags))
4680	rdev = NULL;
4681	if (rdev) {
4682	is_bad = rdev_has_badblock(rdev, s: sh->sector,
4683	RAID5_STRIPE_SECTORS(conf));
4684	if (s->blocked_rdev == NULL) {
4685	if (is_bad < `0`)
4686	set_bit(nr: BlockedBadBlocks, addr: &rdev->flags);
4687	if (rdev_blocked(rdev)) {
4688	s->blocked_rdev = rdev;
4689	atomic_inc(v: &rdev->nr_pending);
4690	}
4691	}
4692	}
4693	clear_bit(nr: R5_Insync, addr: &dev->flags);
4694	if (!rdev)
4695	/ Not in-sync /;
4696	else if (is_bad) {
4697	/ also not in-sync /
4698	if (!test_bit(WriteErrorSeen, &rdev->flags) &&
4699	test_bit(R5_UPTODATE, &dev->flags)) {
4700	/ treat as in-sync, but with a read error*
4701	* which we can now try to correct
4702	*/
4703	set_bit(nr: R5_Insync, addr: &dev->flags);
4704	set_bit(nr: R5_ReadError, addr: &dev->flags);
4705	}
4706	} else if (test_bit(In_sync, &rdev->flags))
4707	set_bit(nr: R5_Insync, addr: &dev->flags);
4708	else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <=
4709	rdev->recovery_offset) {
4710	/*
4711	* in sync if:
4712	* - normal IO, or
4713	* - resync IO that is not lazy recovery
4714	*
4715	* For lazy recovery, we have to mark the rdev without
4716	* In_sync as failed, to build initial xor data.
4717	*/
4718	if (!test_bit(STRIPE_SYNCING, &sh->state) \|\|
4719	!test_bit(MD_RECOVERY_LAZY_RECOVER,
4720	&conf->mddev->recovery))
4721	set_bit(nr: R5_Insync, addr: &dev->flags);
4722	} else if (test_bit(R5_UPTODATE, &dev->flags) &&
4723	test_bit(R5_Expanded, &dev->flags))
4724	/ If we've reshaped into here, we assume it is Insync.*
4725	* We will shortly update recovery_offset to make
4726	* it official.
4727	*/
4728	set_bit(nr: R5_Insync, addr: &dev->flags);
4729
4730	if (test_bit(R5_WriteError, &dev->flags)) {
4731	/ This flag does not apply to '.replacement'*
4732	* only to .rdev, so make sure to check that*/
4733	struct md_rdev *rdev2 = conf->disks[i].rdev;
4734
4735	if (rdev2 == rdev)
4736	clear_bit(nr: R5_Insync, addr: &dev->flags);
4737	if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4738	s->handle_bad_blocks = `1`;
4739	atomic_inc(v: &rdev2->nr_pending);
4740	} else
4741	clear_bit(nr: R5_WriteError, addr: &dev->flags);
4742	}
4743	if (test_bit(R5_MadeGood, &dev->flags)) {
4744	/ This flag does not apply to '.replacement'*
4745	* only to .rdev, so make sure to check that*/
4746	struct md_rdev *rdev2 = conf->disks[i].rdev;
4747
4748	if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4749	s->handle_bad_blocks = `1`;
4750	atomic_inc(v: &rdev2->nr_pending);
4751	} else
4752	clear_bit(nr: R5_MadeGood, addr: &dev->flags);
4753	}
4754	if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
4755	struct md_rdev *rdev2 = conf->disks[i].replacement;
4756
4757	if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4758	s->handle_bad_blocks = `1`;
4759	atomic_inc(v: &rdev2->nr_pending);
4760	} else
4761	clear_bit(nr: R5_MadeGoodRepl, addr: &dev->flags);
4762	}
4763	if (!test_bit(R5_Insync, &dev->flags)) {
4764	/ The ReadError flag will just be confusing now /
4765	clear_bit(nr: R5_ReadError, addr: &dev->flags);
4766	clear_bit(nr: R5_ReWrite, addr: &dev->flags);
4767	}
4768	if (test_bit(R5_ReadError, &dev->flags))
4769	clear_bit(nr: R5_Insync, addr: &dev->flags);
4770	if (!test_bit(R5_Insync, &dev->flags)) {
4771	if (s->failed < `2`)
4772	s->failed_num[s->failed] = i;
4773	s->failed++;
4774	if (rdev && !test_bit(Faulty, &rdev->flags))
4775	do_recovery = `1`;
4776	else if (!rdev) {
4777	rdev = conf->disks[i].replacement;
4778	if (rdev && !test_bit(Faulty, &rdev->flags))
4779	do_recovery = `1`;
4780	}
4781	}
4782
4783	if (test_bit(R5_InJournal, &dev->flags))
4784	s->injournal++;
4785	if (test_bit(R5_InJournal, &dev->flags) && dev->written)
4786	s->just_cached++;
4787	}
4788	if (test_bit(STRIPE_SYNCING, &sh->state)) {
4789	/ If there is a failed device being replaced,*
4790	* we must be recovering.
4791	* else if we are after resync_offset, we must be syncing
4792	* else if MD_RECOVERY_REQUESTED is set, we also are syncing.
4793	* else we can only be replacing
4794	* sync and recovery both need to read all devices, and so
4795	* use the same flag.
4796	*/
4797	if (do_recovery \|\|
4798	sh->sector >= conf->mddev->resync_offset \|\|
4799	test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
4800	s->syncing = `1`;
4801	else
4802	s->replacing = `1`;
4803	}
4804	}
4805
4806	/*
4807	* Return '1' if this is a member of batch, or '0' if it is a lone stripe or
4808	* a head which can now be handled.
4809	*/
4810	static int clear_batch_ready(struct stripe_head *sh)
4811	{
4812	struct stripe_head *tmp;
4813	if (!test_and_clear_bit(nr: STRIPE_BATCH_READY, addr: &sh->state))
4814	return (sh->batch_head && sh->batch_head != sh);
4815	spin_lock(lock: &sh->stripe_lock);
4816	if (!sh->batch_head) {
4817	spin_unlock(lock: &sh->stripe_lock);
4818	return `0`;
4819	}
4820
4821	/*
4822	* this stripe could be added to a batch list before we check
4823	* BATCH_READY, skips it
4824	*/
4825	if (sh->batch_head != sh) {
4826	spin_unlock(lock: &sh->stripe_lock);
4827	return `1`;
4828	}
4829	spin_lock(lock: &sh->batch_lock);
4830	list_for_each_entry(tmp, &sh->batch_list, batch_list)
4831	clear_bit(nr: STRIPE_BATCH_READY, addr: &tmp->state);
4832	spin_unlock(lock: &sh->batch_lock);
4833	spin_unlock(lock: &sh->stripe_lock);
4834
4835	/*
4836	* BATCH_READY is cleared, no new stripes can be added.
4837	* batch_list can be accessed without lock
4838	*/
4839	return `0`;
4840	}
4841
4842	static void break_stripe_batch_list(struct stripe_head *head_sh,
4843	unsigned long handle_flags)
4844	{
4845	struct stripe_head sh, next;
4846	int i;
4847
4848	list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
4849
4850	list_del_init(entry: &sh->batch_list);
4851
4852	WARN_ONCE(sh->state & ((`1` << STRIPE_ACTIVE) \|
4853	(`1` << STRIPE_SYNCING) \|
4854	(`1` << STRIPE_REPLACED) \|
4855	(`1` << STRIPE_DELAYED) \|
4856	(`1` << STRIPE_BIT_DELAY) \|
4857	(`1` << STRIPE_FULL_WRITE) \|
4858	(`1` << STRIPE_BIOFILL_RUN) \|
4859	(`1` << STRIPE_COMPUTE_RUN) \|
4860	(`1` << STRIPE_DISCARD) \|
4861	(`1` << STRIPE_BATCH_READY) \|
4862	(`1` << STRIPE_BATCH_ERR)),
4863	"stripe state: %lx\n", sh->state);
4864	WARN_ONCE(head_sh->state & ((`1` << STRIPE_DISCARD) \|
4865	(`1` << STRIPE_REPLACED)),
4866	"head stripe state: %lx\n", head_sh->state);
4867
4868	set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS \|
4869	(`1` << STRIPE_PREREAD_ACTIVE) \|
4870	(`1` << STRIPE_ON_UNPLUG_LIST)),
4871	head_sh->state & (`1` << STRIPE_INSYNC));
4872
4873	sh->check_state = head_sh->check_state;
4874	sh->reconstruct_state = head_sh->reconstruct_state;
4875	spin_lock_irq(lock: &sh->stripe_lock);
4876	sh->batch_head = NULL;
4877	spin_unlock_irq(lock: &sh->stripe_lock);
4878	for (i = `0`; i < sh->disks; i++) {
4879	if (test_and_clear_bit(nr: R5_Overlap, addr: &sh->dev[i].flags))
4880	wake_up_bit(word: &sh->dev[i].flags, bit: R5_Overlap);
4881	sh->dev[i].flags = head_sh->dev[i].flags &
4882	(~((`1` << R5_WriteError) \| (`1` << R5_Overlap)));
4883	}
4884	if (handle_flags == `0` \|\|
4885	sh->state & handle_flags)
4886	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4887	raid5_release_stripe(sh);
4888	}
4889	spin_lock_irq(lock: &head_sh->stripe_lock);
4890	head_sh->batch_head = NULL;
4891	spin_unlock_irq(lock: &head_sh->stripe_lock);
4892	for (i = `0`; i < head_sh->disks; i++)
4893	if (test_and_clear_bit(nr: R5_Overlap, addr: &head_sh->dev[i].flags))
4894	wake_up_bit(word: &head_sh->dev[i].flags, bit: R5_Overlap);
4895	if (head_sh->state & handle_flags)
4896	set_bit(nr: STRIPE_HANDLE, addr: &head_sh->state);
4897	}
4898
4899	static void handle_stripe(struct stripe_head *sh)
4900	{
4901	struct stripe_head_state s;
4902	struct r5conf *conf = sh->raid_conf;
4903	int i;
4904	int prexor;
4905	int disks = sh->disks;
4906	struct r5dev pdev, qdev;
4907
4908	clear_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4909
4910	/*
4911	* handle_stripe should not continue handle the batched stripe, only
4912	* the head of batch list or lone stripe can continue. Otherwise we
4913	* could see break_stripe_batch_list warns about the STRIPE_ACTIVE
4914	* is set for the batched stripe.
4915	*/
4916	if (clear_batch_ready(sh))
4917	return;
4918
4919	if (test_and_set_bit_lock(nr: STRIPE_ACTIVE, addr: &sh->state)) {
4920	/ already being handled, ensure it gets handled*
4921	* again when current action finishes */
4922	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4923	return;
4924	}
4925
4926	if (test_and_clear_bit(nr: STRIPE_BATCH_ERR, addr: &sh->state))
4927	break_stripe_batch_list(head_sh: sh, handle_flags: `0`);
4928
4929	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
4930	spin_lock(lock: &sh->stripe_lock);
4931	/*
4932	* Cannot process 'sync' concurrently with 'discard'.
4933	* Flush data in r5cache before 'sync'.
4934	*/
4935	if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
4936	!test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
4937	!test_bit(STRIPE_DISCARD, &sh->state) &&
4938	test_and_clear_bit(nr: STRIPE_SYNC_REQUESTED, addr: &sh->state)) {
4939	set_bit(nr: STRIPE_SYNCING, addr: &sh->state);
4940	clear_bit(nr: STRIPE_INSYNC, addr: &sh->state);
4941	clear_bit(nr: STRIPE_REPLACED, addr: &sh->state);
4942	}
4943	spin_unlock(lock: &sh->stripe_lock);
4944	}
4945	clear_bit(nr: STRIPE_DELAYED, addr: &sh->state);
4946
4947	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
4948	"pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
4949	(unsigned long long)sh->sector, sh->state,
4950	atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
4951	sh->check_state, sh->reconstruct_state);
4952
4953	analyse_stripe(sh, s: &s);
4954
4955	if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
4956	goto finish;
4957
4958	if (s.handle_bad_blocks \|\|
4959	(md_is_rdwr(mddev: conf->mddev) &&
4960	test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags))) {
4961	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4962	goto finish;
4963	}
4964
4965	if (unlikely(s.blocked_rdev)) {
4966	if (s.syncing \|\| s.expanding \|\| s.expanded \|\|
4967	s.replacing \|\| s.to_write \|\| s.written) {
4968	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4969	goto finish;
4970	}
4971	/ There is nothing for the blocked_rdev to block /
4972	rdev_dec_pending(rdev: s.blocked_rdev, mddev: conf->mddev);
4973	s.blocked_rdev = NULL;
4974	}
4975
4976	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
4977	set_bit(nr: STRIPE_OP_BIOFILL, addr: &s.ops_request);
4978	set_bit(nr: STRIPE_BIOFILL_RUN, addr: &sh->state);
4979	}
4980
4981	pr_debug("locked=%d uptodate=%d to_read=%d"
4982	" to_write=%d failed=%d failed_num=%d,%d\n",
4983	s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
4984	s.failed_num[`0`], s.failed_num[`1`]);
4985	/*
4986	* check if the array has lost more than max_degraded devices and,
4987	* if so, some requests might need to be failed.
4988	*
4989	* When journal device failed (log_failed), we will only process
4990	* the stripe if there is data need write to raid disks
4991	*/
4992	if (s.failed > conf->max_degraded \|\|
4993	(s.log_failed && s.injournal == `0`)) {
4994	sh->check_state = `0`;
4995	sh->reconstruct_state = `0`;
4996	break_stripe_batch_list(head_sh: sh, handle_flags: `0`);
4997	if (s.to_read+s.to_write+s.written)
4998	handle_failed_stripe(conf, sh, s: &s, disks);
4999	if (s.syncing + s.replacing)
5000	handle_failed_sync(conf, sh, s: &s);
5001	}
5002
5003	/ Now we check to see if any write operations have recently*
5004	* completed
5005	*/
5006	prexor = `0`;
5007	if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
5008	prexor = `1`;
5009	if (sh->reconstruct_state == reconstruct_state_drain_result \|\|
5010	sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
5011	sh->reconstruct_state = reconstruct_state_idle;
5012
5013	/ All the 'written' buffers and the parity block are ready to*
5014	* be written back to disk
5015	*/
5016	BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
5017	!test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
5018	BUG_ON(sh->qd_idx >= `0` &&
5019	!test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
5020	!test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
5021	for (i = disks; i--; ) {
5022	struct r5dev *dev = &sh->dev[i];
5023	if (test_bit(R5_LOCKED, &dev->flags) &&
5024	(i == sh->pd_idx \|\| i == sh->qd_idx \|\|
5025	dev->written \|\| test_bit(R5_InJournal,
5026	&dev->flags))) {
5027	pr_debug("Writing block %d\n", i);
5028	set_bit(nr: R5_Wantwrite, addr: &dev->flags);
5029	if (prexor)
5030	continue;
5031	if (s.failed > `1`)
5032	continue;
5033	if (!test_bit(R5_Insync, &dev->flags) \|\|
5034	((i == sh->pd_idx \|\| i == sh->qd_idx) &&
5035	s.failed == `0`))
5036	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
5037	}
5038	}
5039	if (test_and_clear_bit(nr: STRIPE_PREREAD_ACTIVE, addr: &sh->state))
5040	s.dec_preread_active = `1`;
5041	}
5042
5043	/*
5044	* might be able to return some write requests if the parity blocks
5045	* are safe, or on a failed drive
5046	*/
5047	pdev = &sh->dev[sh->pd_idx];
5048	s.p_failed = (s.failed >= `1` && s.failed_num[`0`] == sh->pd_idx)
5049	\|\| (s.failed >= `2` && s.failed_num[`1`] == sh->pd_idx);
5050	qdev = &sh->dev[sh->qd_idx];
5051	s.q_failed = (s.failed >= `1` && s.failed_num[`0`] == sh->qd_idx)
5052	\|\| (s.failed >= `2` && s.failed_num[`1`] == sh->qd_idx)
5053	\|\| conf->level < `6`;
5054
5055	if (s.written &&
5056	(s.p_failed \|\| ((test_bit(R5_Insync, &pdev->flags)
5057	&& !test_bit(R5_LOCKED, &pdev->flags)
5058	&& (test_bit(R5_UPTODATE, &pdev->flags) \|\|
5059	test_bit(R5_Discard, &pdev->flags))))) &&
5060	(s.q_failed \|\| ((test_bit(R5_Insync, &qdev->flags)
5061	&& !test_bit(R5_LOCKED, &qdev->flags)
5062	&& (test_bit(R5_UPTODATE, &qdev->flags) \|\|
5063	test_bit(R5_Discard, &qdev->flags))))))
5064	handle_stripe_clean_event(conf, sh, disks);
5065
5066	if (s.just_cached)
5067	r5c_handle_cached_data_endio(conf, sh, disks);
5068	log_stripe_write_finished(sh);
5069
5070	/ Now we might consider reading some blocks, either to check/generate*
5071	* parity, or to satisfy requests
5072	* or to load a block that is being partially written.
5073	*/
5074	if (s.to_read \|\| s.non_overwrite
5075	\|\| (s.to_write && s.failed)
5076	\|\| (s.syncing && (s.uptodate + s.compute < disks))
5077	\|\| s.replacing
5078	\|\| s.expanding)
5079	handle_stripe_fill(sh, s: &s, disks);
5080
5081	/*
5082	* When the stripe finishes full journal write cycle (write to journal
5083	* and raid disk), this is the clean up procedure so it is ready for
5084	* next operation.
5085	*/
5086	r5c_finish_stripe_write_out(conf, sh, s: &s);
5087
5088	/*
5089	* Now to consider new write requests, cache write back and what else,
5090	* if anything should be read. We do not handle new writes when:
5091	* 1/ A 'write' operation (copy+xor) is already in flight.
5092	* 2/ A 'check' operation is in flight, as it may clobber the parity
5093	* block.
5094	* 3/ A r5c cache log write is in flight.
5095	*/
5096
5097	if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
5098	if (!r5c_is_writeback(log: conf->log)) {
5099	if (s.to_write)
5100	handle_stripe_dirtying(conf, sh, s: &s, disks);
5101	} else { / write back cache /
5102	int ret = `0`;
5103
5104	/ First, try handle writes in caching phase /
5105	if (s.to_write)
5106	ret = r5c_try_caching_write(conf, sh, s: &s,
5107	disks);
5108	/*
5109	* If caching phase failed: ret == -EAGAIN
5110	* OR
5111	* stripe under reclaim: !caching && injournal
5112	*
5113	* fall back to handle_stripe_dirtying()
5114	*/
5115	if (ret == -EAGAIN \|\|
5116	/ stripe under reclaim: !caching && injournal /
5117	(!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
5118	s.injournal > `0`)) {
5119	ret = handle_stripe_dirtying(conf, sh, s: &s,
5120	disks);
5121	if (ret == -EAGAIN)
5122	goto finish;
5123	}
5124	}
5125	}
5126
5127	/ maybe we need to check and possibly fix the parity for this stripe*
5128	* Any reads will already have been scheduled, so we just see if enough
5129	* data is available. The parity check is held off while parity
5130	* dependent operations are in flight.
5131	*/
5132	if (sh->check_state \|\|
5133	(s.syncing && s.locked == `0` &&
5134	!test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
5135	!test_bit(STRIPE_INSYNC, &sh->state))) {
5136	if (conf->level == `6`)
5137	handle_parity_checks6(conf, sh, s: &s, disks);
5138	else
5139	handle_parity_checks5(conf, sh, s: &s, disks);
5140	}
5141
5142	if ((s.replacing \|\| s.syncing) && s.locked == `0`
5143	&& !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
5144	&& !test_bit(STRIPE_REPLACED, &sh->state)) {
5145	/ Write out to replacement devices where possible /
5146	for (i = `0`; i < conf->raid_disks; i++)
5147	if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
5148	WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
5149	set_bit(nr: R5_WantReplace, addr: &sh->dev[i].flags);
5150	set_bit(nr: R5_LOCKED, addr: &sh->dev[i].flags);
5151	s.locked++;
5152	}
5153	if (s.replacing)
5154	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
5155	set_bit(nr: STRIPE_REPLACED, addr: &sh->state);
5156	}
5157	if ((s.syncing \|\| s.replacing) && s.locked == `0` &&
5158	!test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
5159	test_bit(STRIPE_INSYNC, &sh->state)) {
5160	md_done_sync(mddev: conf->mddev, RAID5_STRIPE_SECTORS(conf), ok: `1`);
5161	clear_bit(nr: STRIPE_SYNCING, addr: &sh->state);
5162	if (test_and_clear_bit(nr: R5_Overlap, addr: &sh->dev[sh->pd_idx].flags))
5163	wake_up_bit(word: &sh->dev[sh->pd_idx].flags, bit: R5_Overlap);
5164	}
5165
5166	/ If the failed drives are just a ReadError, then we might need*
5167	* to progress the repair/check process
5168	*/
5169	if (s.failed <= conf->max_degraded && !conf->mddev->ro)
5170	for (i = `0`; i < s.failed; i++) {
5171	struct r5dev *dev = &sh->dev[s.failed_num[i]];
5172	if (test_bit(R5_ReadError, &dev->flags)
5173	&& !test_bit(R5_LOCKED, &dev->flags)
5174	&& test_bit(R5_UPTODATE, &dev->flags)
5175	) {
5176	if (!test_bit(R5_ReWrite, &dev->flags)) {
5177	set_bit(nr: R5_Wantwrite, addr: &dev->flags);
5178	set_bit(nr: R5_ReWrite, addr: &dev->flags);
5179	} else
5180	/ let's read it back /
5181	set_bit(nr: R5_Wantread, addr: &dev->flags);
5182	set_bit(nr: R5_LOCKED, addr: &dev->flags);
5183	s.locked++;
5184	}
5185	}
5186
5187	/ Finish reconstruct operations initiated by the expansion process /
5188	if (sh->reconstruct_state == reconstruct_state_result) {
5189	struct stripe_head *sh_src
5190	= raid5_get_active_stripe(conf, NULL, sector: sh->sector,
5191	R5_GAS_PREVIOUS \| R5_GAS_NOBLOCK \|
5192	R5_GAS_NOQUIESCE);
5193	if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
5194	/ sh cannot be written until sh_src has been read.*
5195	* so arrange for sh to be delayed a little
5196	*/
5197	set_bit(nr: STRIPE_DELAYED, addr: &sh->state);
5198	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
5199	if (!test_and_set_bit(nr: STRIPE_PREREAD_ACTIVE,
5200	addr: &sh_src->state))
5201	atomic_inc(v: &conf->preread_active_stripes);
5202	raid5_release_stripe(sh: sh_src);
5203	goto finish;
5204	}
5205	if (sh_src)
5206	raid5_release_stripe(sh: sh_src);
5207
5208	sh->reconstruct_state = reconstruct_state_idle;
5209	clear_bit(nr: STRIPE_EXPANDING, addr: &sh->state);
5210	for (i = conf->raid_disks; i--; ) {
5211	set_bit(nr: R5_Wantwrite, addr: &sh->dev[i].flags);
5212	set_bit(nr: R5_LOCKED, addr: &sh->dev[i].flags);
5213	s.locked++;
5214	}
5215	}
5216
5217	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
5218	!sh->reconstruct_state) {
5219	/ Need to write out all blocks after computing parity /
5220	sh->disks = conf->raid_disks;
5221	stripe_set_idx(stripe: sh->sector, conf, previous: `0`, sh);
5222	schedule_reconstruction(sh, s: &s, rcw: `1`, expand: `1`);
5223	} else if (s.expanded && !sh->reconstruct_state && s.locked == `0`) {
5224	clear_bit(nr: STRIPE_EXPAND_READY, addr: &sh->state);
5225	atomic_dec(v: &conf->reshape_stripes);
5226	wake_up(&conf->wait_for_reshape);
5227	md_done_sync(mddev: conf->mddev, RAID5_STRIPE_SECTORS(conf), ok: `1`);
5228	}
5229
5230	if (s.expanding && s.locked == `0` &&
5231	!test_bit(STRIPE_COMPUTE_RUN, &sh->state))
5232	handle_stripe_expansion(conf, sh);
5233
5234	finish:
5235	/ wait for this device to become unblocked /
5236	if (unlikely(s.blocked_rdev)) {
5237	if (conf->mddev->external)
5238	md_wait_for_blocked_rdev(rdev: s.blocked_rdev,
5239	mddev: conf->mddev);
5240	else
5241	/ Internal metadata will immediately*
5242	* be written by raid5d, so we don't
5243	* need to wait here.
5244	*/
5245	rdev_dec_pending(rdev: s.blocked_rdev,
5246	mddev: conf->mddev);
5247	}
5248
5249	if (s.handle_bad_blocks)
5250	for (i = disks; i--; ) {
5251	struct md_rdev *rdev;
5252	struct r5dev *dev = &sh->dev[i];
5253	if (test_and_clear_bit(nr: R5_WriteError, addr: &dev->flags)) {
5254	/ We own a safe reference to the rdev /
5255	rdev = conf->disks[i].rdev;
5256	if (!rdev_set_badblocks(rdev, s: sh->sector,
5257	RAID5_STRIPE_SECTORS(conf), is_new: `0`))
5258	md_error(mddev: conf->mddev, rdev);
5259	rdev_dec_pending(rdev, mddev: conf->mddev);
5260	}
5261	if (test_and_clear_bit(nr: R5_MadeGood, addr: &dev->flags)) {
5262	rdev = conf->disks[i].rdev;
5263	rdev_clear_badblocks(rdev, s: sh->sector,
5264	RAID5_STRIPE_SECTORS(conf), is_new: `0`);
5265	rdev_dec_pending(rdev, mddev: conf->mddev);
5266	}
5267	if (test_and_clear_bit(nr: R5_MadeGoodRepl, addr: &dev->flags)) {
5268	rdev = conf->disks[i].replacement;
5269	if (!rdev)
5270	/ rdev have been moved down /
5271	rdev = conf->disks[i].rdev;
5272	rdev_clear_badblocks(rdev, s: sh->sector,
5273	RAID5_STRIPE_SECTORS(conf), is_new: `0`);
5274	rdev_dec_pending(rdev, mddev: conf->mddev);
5275	}
5276	}
5277
5278	if (s.ops_request)
5279	raid_run_ops(sh, ops_request: s.ops_request);
5280
5281	ops_run_io(sh, s: &s);
5282
5283	if (s.dec_preread_active) {
5284	/ We delay this until after ops_run_io so that if make_request*
5285	* is waiting on a flush, it won't continue until the writes
5286	* have actually been submitted.
5287	*/
5288	atomic_dec(v: &conf->preread_active_stripes);
5289	if (atomic_read(v: &conf->preread_active_stripes) <
5290	IO_THRESHOLD)
5291	md_wakeup_thread(conf->mddev->thread);
5292	}
5293
5294	clear_bit_unlock(nr: STRIPE_ACTIVE, addr: &sh->state);
5295	}
5296
5297	static void raid5_activate_delayed(struct r5conf *conf)
5298	__must_hold(&conf->device_lock)
5299	{
5300	if (atomic_read(v: &conf->preread_active_stripes) < IO_THRESHOLD) {
5301	while (!list_empty(head: &conf->delayed_list)) {
5302	struct list_head *l = conf->delayed_list.next;
5303	struct stripe_head *sh;
5304	sh = list_entry(l, struct stripe_head, lru);
5305	list_del_init(entry: l);
5306	clear_bit(nr: STRIPE_DELAYED, addr: &sh->state);
5307	if (!test_and_set_bit(nr: STRIPE_PREREAD_ACTIVE, addr: &sh->state))
5308	atomic_inc(v: &conf->preread_active_stripes);
5309	list_add_tail(new: &sh->lru, head: &conf->hold_list);
5310	raid5_wakeup_stripe_thread(sh);
5311	}
5312	}
5313	}
5314
5315	static void activate_bit_delay(struct r5conf *conf,
5316	struct list_head *temp_inactive_list)
5317	__must_hold(&conf->device_lock)
5318	{
5319	struct list_head head;
5320	list_add(new: &head, head: &conf->bitmap_list);
5321	list_del_init(entry: &conf->bitmap_list);
5322	while (!list_empty(head: &head)) {
5323	struct stripe_head sh = list_entry(head.next, struct* stripe_head, lru);
5324	int hash;
5325	list_del_init(entry: &sh->lru);
5326	atomic_inc(v: &sh->count);
5327	hash = sh->hash_lock_index;
5328	__release_stripe(conf, sh, temp_inactive_list: &temp_inactive_list[hash]);
5329	}
5330	}
5331
5332	static int in_chunk_boundary(struct mddev mddev, struct* bio *bio)
5333	{
5334	struct r5conf *conf = mddev->private;
5335	sector_t sector = bio->bi_iter.bi_sector;
5336	unsigned int chunk_sectors;
5337	unsigned int bio_sectors = bio_sectors(bio);
5338
5339	chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
5340	return chunk_sectors >=
5341	((sector & (chunk_sectors - `1`)) + bio_sectors);
5342	}
5343
5344	/*
5345	* add bio to the retry LIFO ( in O(1) ... we are in interrupt )
5346	* later sampled by raid5d.
5347	*/
5348	static void add_bio_to_retry(struct bio bi,struct* r5conf *conf)
5349	{
5350	unsigned long flags;
5351
5352	spin_lock_irqsave(&conf->device_lock, flags);
5353
5354	bi->bi_next = conf->retry_read_aligned_list;
5355	conf->retry_read_aligned_list = bi;
5356
5357	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
5358	md_wakeup_thread(conf->mddev->thread);
5359	}
5360
5361	static struct bio remove_bio_from_retry(struct* r5conf *conf,
5362	unsigned int *offset)
5363	{
5364	struct bio *bi;
5365
5366	bi = conf->retry_read_aligned;
5367	if (bi) {
5368	*offset = conf->retry_read_offset;
5369	conf->retry_read_aligned = NULL;
5370	return bi;
5371	}
5372	bi = conf->retry_read_aligned_list;
5373	if(bi) {
5374	conf->retry_read_aligned_list = bi->bi_next;
5375	bi->bi_next = NULL;
5376	*offset = `0`;
5377	}
5378
5379	return bi;
5380	}
5381
5382	/*
5383	* The "raid5_align_endio" should check if the read succeeded and if it
5384	* did, call bio_endio on the original bio (having bio_put the new bio
5385	* first).
5386	* If the read failed..
5387	*/
5388	static void raid5_align_endio(struct bio *bi)
5389	{
5390	struct bio *raid_bi = bi->bi_private;
5391	struct md_rdev rdev = (void* *)raid_bi->bi_next;
5392	struct mddev *mddev = rdev->mddev;
5393	struct r5conf *conf = mddev->private;
5394	blk_status_t error = bi->bi_status;
5395
5396	bio_put(bi);
5397	raid_bi->bi_next = NULL;
5398	rdev_dec_pending(rdev, mddev: conf->mddev);
5399
5400	if (!error) {
5401	bio_endio(raid_bi);
5402	if (atomic_dec_and_test(v: &conf->active_aligned_reads))
5403	wake_up(&conf->wait_for_quiescent);
5404	return;
5405	}
5406
5407	pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
5408
5409	add_bio_to_retry(bi: raid_bi, conf);
5410	}
5411
5412	static int raid5_read_one_chunk(struct mddev mddev, struct* bio *raid_bio)
5413	{
5414	struct r5conf *conf = mddev->private;
5415	struct bio *align_bio;
5416	struct md_rdev *rdev;
5417	sector_t sector, end_sector;
5418	int dd_idx;
5419	bool did_inc;
5420
5421	if (!in_chunk_boundary(mddev, bio: raid_bio)) {
5422	pr_debug("%s: non aligned\n", __func__);
5423	return `0`;
5424	}
5425
5426	sector = raid5_compute_sector(conf, r_sector: raid_bio->bi_iter.bi_sector, previous: `0`,
5427	dd_idx: &dd_idx, NULL);
5428	end_sector = sector + bio_sectors(raid_bio);
5429
5430	if (r5c_big_stripe_cached(conf, sect: sector))
5431	return `0`;
5432
5433	rdev = conf->disks[dd_idx].replacement;
5434	if (!rdev \|\| test_bit(Faulty, &rdev->flags) \|\|
5435	rdev->recovery_offset < end_sector) {
5436	rdev = conf->disks[dd_idx].rdev;
5437	if (!rdev)
5438	return `0`;
5439	if (test_bit(Faulty, &rdev->flags) \|\|
5440	!(test_bit(In_sync, &rdev->flags) \|\|
5441	rdev->recovery_offset >= end_sector))
5442	return `0`;
5443	}
5444
5445	atomic_inc(v: &rdev->nr_pending);
5446
5447	if (rdev_has_badblock(rdev, s: sector, bio_sectors(raid_bio))) {
5448	rdev_dec_pending(rdev, mddev);
5449	return `0`;
5450	}
5451
5452	md_account_bio(mddev, bio: &raid_bio);
5453	raid_bio->bi_next = (void *)rdev;
5454
5455	align_bio = bio_alloc_clone(bdev: rdev->bdev, bio_src: raid_bio, GFP_NOIO,
5456	bs: &mddev->bio_set);
5457	align_bio->bi_end_io = raid5_align_endio;
5458	align_bio->bi_private = raid_bio;
5459	align_bio->bi_iter.bi_sector = sector;
5460
5461	/ No reshape active, so we can trust rdev->data_offset /
5462	align_bio->bi_iter.bi_sector += rdev->data_offset;
5463
5464	did_inc = false;
5465	if (conf->quiesce == `0`) {
5466	atomic_inc(v: &conf->active_aligned_reads);
5467	did_inc = true;
5468	}
5469	/ need a memory barrier to detect the race with raid5_quiesce() /
5470	if (!did_inc \|\| smp_load_acquire(&conf->quiesce) != `0`) {
5471	/ quiesce is in progress, so we need to undo io activation and wait*
5472	* for it to finish
5473	*/
5474	if (did_inc && atomic_dec_and_test(v: &conf->active_aligned_reads))
5475	wake_up(&conf->wait_for_quiescent);
5476	spin_lock_irq(lock: &conf->device_lock);
5477	wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == `0`,
5478	conf->device_lock);
5479	atomic_inc(v: &conf->active_aligned_reads);
5480	spin_unlock_irq(lock: &conf->device_lock);
5481	}
5482
5483	mddev_trace_remap(mddev, bio: align_bio, sector: raid_bio->bi_iter.bi_sector);
5484	submit_bio_noacct(bio: align_bio);
5485	return `1`;
5486	}
5487
5488	static struct bio chunk_aligned_read(struct* mddev mddev, struct* bio *raid_bio)
5489	{
5490	sector_t sector = raid_bio->bi_iter.bi_sector;
5491	unsigned chunk_sects = mddev->chunk_sectors;
5492	unsigned sectors = chunk_sects - (sector & (chunk_sects-`1`));
5493
5494	if (sectors < bio_sectors(raid_bio)) {
5495	struct r5conf *conf = mddev->private;
5496
5497	raid_bio = bio_submit_split_bioset(bio: raid_bio, split_sectors: sectors,
5498	bs: &conf->bio_split);
5499	if (!raid_bio)
5500	return NULL;
5501	}
5502
5503	if (!raid5_read_one_chunk(mddev, raid_bio))
5504	return raid_bio;
5505
5506	return NULL;
5507	}
5508
5509	/ __get_priority_stripe - get the next stripe to process*
5510	*
5511	* Full stripe writes are allowed to pass preread active stripes up until
5512	* the bypass_threshold is exceeded. In general the bypass_count
5513	* increments when the handle_list is handled before the hold_list; however, it
5514	* will not be incremented when STRIPE_IO_STARTED is sampled set signifying a
5515	* stripe with in flight i/o. The bypass_count will be reset when the
5516	* head of the hold_list has changed, i.e. the head was promoted to the
5517	* handle_list.
5518	*/
5519	static struct stripe_head __get_priority_stripe(struct* r5conf conf, int* group)
5520	__must_hold(&conf->device_lock)
5521	{
5522	struct stripe_head sh, tmp;
5523	struct list_head *handle_list = NULL;
5524	struct r5worker_group *wg;
5525	bool second_try = !r5c_is_writeback(log: conf->log) &&
5526	!r5l_log_disk_error(conf);
5527	bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) \|\|
5528	r5l_log_disk_error(conf);
5529
5530	again:
5531	wg = NULL;
5532	sh = NULL;
5533	if (conf->worker_cnt_per_group == `0`) {
5534	handle_list = try_loprio ? &conf->loprio_list :
5535	&conf->handle_list;
5536	} else if (group != ANY_GROUP) {
5537	handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
5538	&conf->worker_groups[group].handle_list;
5539	wg = &conf->worker_groups[group];
5540	} else {
5541	int i;
5542	for (i = `0`; i < conf->group_cnt; i++) {
5543	handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
5544	&conf->worker_groups[i].handle_list;
5545	wg = &conf->worker_groups[i];
5546	if (!list_empty(head: handle_list))
5547	break;
5548	}
5549	}
5550
5551	pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
5552	__func__,
5553	list_empty(handle_list) ? "empty" : "busy",
5554	list_empty(&conf->hold_list) ? "empty" : "busy",
5555	atomic_read(&conf->pending_full_writes), conf->bypass_count);
5556
5557	if (!list_empty(head: handle_list)) {
5558	sh = list_entry(handle_list->next, typeof(*sh), lru);
5559
5560	if (list_empty(head: &conf->hold_list))
5561	conf->bypass_count = `0`;
5562	else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
5563	if (conf->hold_list.next == conf->last_hold)
5564	conf->bypass_count++;
5565	else {
5566	conf->last_hold = conf->hold_list.next;
5567	conf->bypass_count -= conf->bypass_threshold;
5568	if (conf->bypass_count < `0`)
5569	conf->bypass_count = `0`;
5570	}
5571	}
5572	} else if (!list_empty(head: &conf->hold_list) &&
5573	((conf->bypass_threshold &&
5574	conf->bypass_count > conf->bypass_threshold) \|\|
5575	atomic_read(v: &conf->pending_full_writes) == `0`)) {
5576
5577	list_for_each_entry(tmp, &conf->hold_list, lru) {
5578	if (conf->worker_cnt_per_group == `0` \|\|
5579	group == ANY_GROUP \|\|
5580	!cpu_online(cpu: tmp->cpu) \|\|
5581	cpu_to_group(tmp->cpu) == group) {
5582	sh = tmp;
5583	break;
5584	}
5585	}
5586
5587	if (sh) {
5588	conf->bypass_count -= conf->bypass_threshold;
5589	if (conf->bypass_count < `0`)
5590	conf->bypass_count = `0`;
5591	}
5592	wg = NULL;
5593	}
5594
5595	if (!sh) {
5596	if (second_try)
5597	return NULL;
5598	second_try = true;
5599	try_loprio = !try_loprio;
5600	goto again;
5601	}
5602
5603	if (wg) {
5604	wg->stripes_cnt--;
5605	sh->group = NULL;
5606	}
5607	list_del_init(entry: &sh->lru);
5608	BUG_ON(atomic_inc_return(&sh->count) != `1`);
5609	return sh;
5610	}
5611
/*
 * Per-plug state: raid5_unplug() recovers it from the embedded blk_plug_cb
 * with container_of().
 */
struct raid5_plug_cb {
	struct blk_plug_cb	cb;	/* embedded callback; cb.data is the mddev */
	struct list_head	list;	/* stripes queued by release_stripe_plug() */
	/* per-hash lists passed to __release_stripe() in raid5_unplug() */
	struct list_head	temp_inactive_list[NR_STRIPE_HASH_LOCKS];
};
5617
5618	static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
5619	{
5620	struct raid5_plug_cb *cb = container_of(
5621	blk_cb, struct raid5_plug_cb, cb);
5622	struct stripe_head *sh;
5623	struct mddev *mddev = cb->cb.data;
5624	struct r5conf *conf = mddev->private;
5625	int cnt = `0`;
5626	int hash;
5627
5628	if (cb->list.next && !list_empty(head: &cb->list)) {
5629	spin_lock_irq(lock: &conf->device_lock);
5630	while (!list_empty(head: &cb->list)) {
5631	sh = list_first_entry(&cb->list, struct stripe_head, lru);
5632	list_del_init(entry: &sh->lru);
5633	/*
5634	* avoid race release_stripe_plug() sees
5635	* STRIPE_ON_UNPLUG_LIST clear but the stripe
5636	* is still in our list
5637	*/
5638	smp_mb__before_atomic();
5639	clear_bit(nr: STRIPE_ON_UNPLUG_LIST, addr: &sh->state);
5640	/*
5641	* STRIPE_ON_RELEASE_LIST could be set here. In that
5642	* case, the count is always > 1 here
5643	*/
5644	hash = sh->hash_lock_index;
5645	__release_stripe(conf, sh, temp_inactive_list: &cb->temp_inactive_list[hash]);
5646	cnt++;
5647	}
5648	spin_unlock_irq(lock: &conf->device_lock);
5649	}
5650	release_inactive_stripe_list(conf, temp_inactive_list: cb->temp_inactive_list,
5651	NR_STRIPE_HASH_LOCKS);
5652	if (!mddev_is_dm(mddev))
5653	trace_block_unplug(q: mddev->gendisk->queue, depth: cnt, explicit: !from_schedule);
5654	kfree(objp: cb);
5655	}
5656
5657	static void release_stripe_plug(struct mddev *mddev,
5658	struct stripe_head *sh)
5659	{
5660	struct blk_plug_cb *blk_cb = blk_check_plugged(
5661	unplug: raid5_unplug, data: mddev,
5662	size: sizeof(struct raid5_plug_cb));
5663	struct raid5_plug_cb *cb;
5664
5665	if (!blk_cb) {
5666	raid5_release_stripe(sh);
5667	return;
5668	}
5669
5670	cb = container_of(blk_cb, struct raid5_plug_cb, cb);
5671
5672	if (cb->list.next == NULL) {
5673	int i;
5674	INIT_LIST_HEAD(list: &cb->list);
5675	for (i = `0`; i < NR_STRIPE_HASH_LOCKS; i++)
5676	INIT_LIST_HEAD(list: cb->temp_inactive_list + i);
5677	}
5678
5679	if (!test_and_set_bit(nr: STRIPE_ON_UNPLUG_LIST, addr: &sh->state))
5680	list_add_tail(new: &sh->lru, head: &cb->list);
5681	else
5682	raid5_release_stripe(sh);
5683	}
5684
5685	static void make_discard_request(struct mddev mddev, struct* bio *bi)
5686	{
5687	struct r5conf *conf = mddev->private;
5688	sector_t logical_sector, last_sector;
5689	struct stripe_head *sh;
5690	int stripe_sectors;
5691
5692	/ We need to handle this when io_uring supports discard/trim /
5693	if (WARN_ON_ONCE(bi->bi_opf & REQ_NOWAIT))
5694	return;
5695
5696	if (mddev->reshape_position != MaxSector)
5697	/ Skip discard while reshape is happening /
5698	return;
5699
5700	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-`1`);
5701	last_sector = bio_end_sector(bi);
5702
5703	bi->bi_next = NULL;
5704
5705	stripe_sectors = conf->chunk_sectors *
5706	(conf->raid_disks - conf->max_degraded);
5707	logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
5708	stripe_sectors);
5709	sector_div(last_sector, stripe_sectors);
5710
5711	logical_sector *= conf->chunk_sectors;
5712	last_sector *= conf->chunk_sectors;
5713
5714	for (; logical_sector < last_sector;
5715	logical_sector += RAID5_STRIPE_SECTORS(conf)) {
5716	DEFINE_WAIT(w);
5717	int d;
5718	again:
5719	sh = raid5_get_active_stripe(conf, NULL, sector: logical_sector, flags: `0`);
5720	set_bit(nr: R5_Overlap, addr: &sh->dev[sh->pd_idx].flags);
5721	if (test_bit(STRIPE_SYNCING, &sh->state)) {
5722	raid5_release_stripe(sh);
5723	wait_on_bit(word: &sh->dev[sh->pd_idx].flags, bit: R5_Overlap,
5724	TASK_UNINTERRUPTIBLE);
5725	goto again;
5726	}
5727	clear_bit(nr: R5_Overlap, addr: &sh->dev[sh->pd_idx].flags);
5728	spin_lock_irq(lock: &sh->stripe_lock);
5729	for (d = `0`; d < conf->raid_disks; d++) {
5730	if (d == sh->pd_idx \|\| d == sh->qd_idx)
5731	continue;
5732	if (sh->dev[d].towrite \|\| sh->dev[d].toread) {
5733	set_bit(nr: R5_Overlap, addr: &sh->dev[d].flags);
5734	spin_unlock_irq(lock: &sh->stripe_lock);
5735	raid5_release_stripe(sh);
5736	wait_on_bit(word: &sh->dev[d].flags, bit: R5_Overlap,
5737	TASK_UNINTERRUPTIBLE);
5738	goto again;
5739	}
5740	}
5741	set_bit(nr: STRIPE_DISCARD, addr: &sh->state);
5742	sh->overwrite_disks = `0`;
5743	for (d = `0`; d < conf->raid_disks; d++) {
5744	if (d == sh->pd_idx \|\| d == sh->qd_idx)
5745	continue;
5746	sh->dev[d].towrite = bi;
5747	set_bit(nr: R5_OVERWRITE, addr: &sh->dev[d].flags);
5748	bio_inc_remaining(bio: bi);
5749	md_write_inc(mddev, bi);
5750	sh->overwrite_disks++;
5751	}
5752	spin_unlock_irq(lock: &sh->stripe_lock);
5753	if (conf->mddev->bitmap) {
5754	sh->bm_seq = conf->seq_flush + `1`;
5755	set_bit(nr: STRIPE_BIT_DELAY, addr: &sh->state);
5756	}
5757
5758	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
5759	clear_bit(nr: STRIPE_DELAYED, addr: &sh->state);
5760	if (!test_and_set_bit(nr: STRIPE_PREREAD_ACTIVE, addr: &sh->state))
5761	atomic_inc(v: &conf->preread_active_stripes);
5762	release_stripe_plug(mddev, sh);
5763	}
5764
5765	bio_endio(bi);
5766	}
5767
5768	static bool ahead_of_reshape(struct mddev *mddev, sector_t sector,
5769	sector_t reshape_sector)
5770	{
5771	return mddev->reshape_backwards ? sector < reshape_sector :
5772	sector >= reshape_sector;
5773	}
5774
5775	static bool range_ahead_of_reshape(struct mddev *mddev, sector_t min,
5776	sector_t max, sector_t reshape_sector)
5777	{
5778	return mddev->reshape_backwards ? max < reshape_sector :
5779	min >= reshape_sector;
5780	}
5781
5782	static bool stripe_ahead_of_reshape(struct mddev mddev, struct* r5conf *conf,
5783	struct stripe_head *sh)
5784	{
5785	sector_t max_sector = `0`, min_sector = MaxSector;
5786	bool ret = false;
5787	int dd_idx;
5788
5789	for (dd_idx = `0`; dd_idx < sh->disks; dd_idx++) {
5790	if (dd_idx == sh->pd_idx \|\| dd_idx == sh->qd_idx)
5791	continue;
5792
5793	min_sector = min(min_sector, sh->dev[dd_idx].sector);
5794	max_sector = max(max_sector, sh->dev[dd_idx].sector);
5795	}
5796
5797	spin_lock_irq(lock: &conf->device_lock);
5798
5799	if (!range_ahead_of_reshape(mddev, min: min_sector, max: max_sector,
5800	reshape_sector: conf->reshape_progress))
5801	/ mismatch, need to try again /
5802	ret = true;
5803
5804	spin_unlock_irq(lock: &conf->device_lock);
5805
5806	return ret;
5807	}
5808
5809	static int add_all_stripe_bios(struct r5conf *conf,
5810	struct stripe_request_ctx ctx, struct* stripe_head *sh,
5811	struct bio bi, int* forwrite, int previous)
5812	{
5813	int dd_idx;
5814
5815	spin_lock_irq(lock: &sh->stripe_lock);
5816
5817	for (dd_idx = `0`; dd_idx < sh->disks; dd_idx++) {
5818	struct r5dev *dev = &sh->dev[dd_idx];
5819
5820	if (dd_idx == sh->pd_idx \|\| dd_idx == sh->qd_idx)
5821	continue;
5822
5823	if (dev->sector < ctx->first_sector \|\|
5824	dev->sector >= ctx->last_sector)
5825	continue;
5826
5827	if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
5828	set_bit(nr: R5_Overlap, addr: &dev->flags);
5829	spin_unlock_irq(lock: &sh->stripe_lock);
5830	raid5_release_stripe(sh);
5831	/ release batch_last before wait to avoid risk of deadlock /
5832	if (ctx->batch_last) {
5833	raid5_release_stripe(sh: ctx->batch_last);
5834	ctx->batch_last = NULL;
5835	}
5836	md_wakeup_thread(conf->mddev->thread);
5837	wait_on_bit(word: &dev->flags, bit: R5_Overlap, TASK_UNINTERRUPTIBLE);
5838	return `0`;
5839	}
5840	}
5841
5842	for (dd_idx = `0`; dd_idx < sh->disks; dd_idx++) {
5843	struct r5dev *dev = &sh->dev[dd_idx];
5844
5845	if (dd_idx == sh->pd_idx \|\| dd_idx == sh->qd_idx)
5846	continue;
5847
5848	if (dev->sector < ctx->first_sector \|\|
5849	dev->sector >= ctx->last_sector)
5850	continue;
5851
5852	__add_stripe_bio(sh, bi, dd_idx, forwrite, previous);
5853	clear_bit(nr: (dev->sector - ctx->first_sector) >>
5854	RAID5_STRIPE_SHIFT(conf), addr: ctx->sectors_to_do);
5855	}
5856
5857	spin_unlock_irq(lock: &sh->stripe_lock);
5858	return `1`;
5859	}
5860
/*
 * Where a logical sector sits relative to an in-progress reshape, as
 * classified by get_reshape_loc() from conf->reshape_progress and
 * conf->reshape_safe.
 */
enum reshape_loc {
	/* reshape_progress == MaxSector */
	LOC_NO_RESHAPE = 0,
	/* sector is ahead of reshape_progress */
	LOC_AHEAD_OF_RESHAPE = 1,
	/* not ahead of reshape_progress, but ahead of reshape_safe */
	LOC_INSIDE_RESHAPE = 2,
	/* ahead of neither reshape_progress nor reshape_safe */
	LOC_BEHIND_RESHAPE = 3,
};
5867
5868	static enum reshape_loc get_reshape_loc(struct mddev *mddev,
5869	struct r5conf *conf, sector_t logical_sector)
5870	{
5871	sector_t reshape_progress, reshape_safe;
5872
5873	if (likely(conf->reshape_progress == MaxSector))
5874	return LOC_NO_RESHAPE;
5875	/*
5876	* Spinlock is needed as reshape_progress may be
5877	* 64bit on a 32bit platform, and so it might be
5878	* possible to see a half-updated value
5879	* Of course reshape_progress could change after
5880	* the lock is dropped, so once we get a reference
5881	* to the stripe that we think it is, we will have
5882	* to check again.
5883	*/
5884	spin_lock_irq(lock: &conf->device_lock);
5885	reshape_progress = conf->reshape_progress;
5886	reshape_safe = conf->reshape_safe;
5887	spin_unlock_irq(lock: &conf->device_lock);
5888	if (reshape_progress == MaxSector)
5889	return LOC_NO_RESHAPE;
5890	if (ahead_of_reshape(mddev, sector: logical_sector, reshape_sector: reshape_progress))
5891	return LOC_AHEAD_OF_RESHAPE;
5892	if (ahead_of_reshape(mddev, sector: logical_sector, reshape_sector: reshape_safe))
5893	return LOC_INSIDE_RESHAPE;
5894	return LOC_BEHIND_RESHAPE;
5895	}
5896
5897	static void raid5_bitmap_sector(struct mddev mddev, sector_t offset,
5898	unsigned long *sectors)
5899	{
5900	struct r5conf *conf = mddev->private;
5901	sector_t start = *offset;
5902	sector_t end = start + *sectors;
5903	sector_t prev_start = start;
5904	sector_t prev_end = end;
5905	int sectors_per_chunk;
5906	enum reshape_loc loc;
5907	int dd_idx;
5908
5909	sectors_per_chunk = conf->chunk_sectors *
5910	(conf->raid_disks - conf->max_degraded);
5911	start = round_down(start, sectors_per_chunk);
5912	end = round_up(end, sectors_per_chunk);
5913
5914	start = raid5_compute_sector(conf, r_sector: start, previous: `0`, dd_idx: &dd_idx, NULL);
5915	end = raid5_compute_sector(conf, r_sector: end, previous: `0`, dd_idx: &dd_idx, NULL);
5916
5917	/*
5918	* For LOC_INSIDE_RESHAPE, this IO will wait for reshape to make
5919	* progress, hence it's the same as LOC_BEHIND_RESHAPE.
5920	*/
5921	loc = get_reshape_loc(mddev, conf, logical_sector: prev_start);
5922	if (likely(loc != LOC_AHEAD_OF_RESHAPE)) {
5923	*offset = start;
5924	*sectors = end - start;
5925	return;
5926	}
5927
5928	sectors_per_chunk = conf->prev_chunk_sectors *
5929	(conf->previous_raid_disks - conf->max_degraded);
5930	prev_start = round_down(prev_start, sectors_per_chunk);
5931	prev_end = round_down(prev_end, sectors_per_chunk);
5932
5933	prev_start = raid5_compute_sector(conf, r_sector: prev_start, previous: `1`, dd_idx: &dd_idx, NULL);
5934	prev_end = raid5_compute_sector(conf, r_sector: prev_end, previous: `1`, dd_idx: &dd_idx, NULL);
5935
5936	/*
5937	* for LOC_AHEAD_OF_RESHAPE, reshape can make progress before this IO
5938	* is handled in make_stripe_request(), we can't know this here hence
5939	* we set bits for both.
5940	*/
5941	*offset = min(start, prev_start);
5942	sectors = max(end, prev_end) - offset;
5943	}
5944
5945	static enum stripe_result make_stripe_request(struct mddev *mddev,
5946	struct r5conf conf, struct* stripe_request_ctx *ctx,
5947	sector_t logical_sector, struct bio *bi)
5948	{
5949	const int rw = bio_data_dir(bi);
5950	enum stripe_result ret;
5951	struct stripe_head *sh;
5952	enum reshape_loc loc;
5953	sector_t new_sector;
5954	int previous = `0`, flags = `0`;
5955	int seq, dd_idx;
5956
5957	seq = read_seqcount_begin(&conf->gen_lock);
5958	loc = get_reshape_loc(mddev, conf, logical_sector);
5959	if (loc == LOC_INSIDE_RESHAPE) {
5960	ret = STRIPE_SCHEDULE_AND_RETRY;
5961	goto out;
5962	}
5963	if (loc == LOC_AHEAD_OF_RESHAPE)
5964	previous = `1`;
5965
5966	new_sector = raid5_compute_sector(conf, r_sector: logical_sector, previous,
5967	dd_idx: &dd_idx, NULL);
5968	pr_debug("raid456: %s, sector %llu logical %llu\n", __func__,
5969	new_sector, logical_sector);
5970
5971	if (previous)
5972	flags \|= R5_GAS_PREVIOUS;
5973	if (bi->bi_opf & REQ_RAHEAD)
5974	flags \|= R5_GAS_NOBLOCK;
5975	sh = raid5_get_active_stripe(conf, ctx, sector: new_sector, flags);
5976	if (unlikely(!sh)) {
5977	/ cannot get stripe, just give-up /
5978	bi->bi_status = BLK_STS_IOERR;
5979	return STRIPE_FAIL;
5980	}
5981
5982	if (unlikely(previous) &&
5983	stripe_ahead_of_reshape(mddev, conf, sh)) {
5984	/*
5985	* Expansion moved on while waiting for a stripe.
5986	* Expansion could still move past after this
5987	* test, but as we are holding a reference to
5988	* 'sh', we know that if that happens,
5989	* STRIPE_EXPANDING will get set and the expansion
5990	* won't proceed until we finish with the stripe.
5991	*/
5992	ret = STRIPE_SCHEDULE_AND_RETRY;
5993	goto out_release;
5994	}
5995
5996	if (read_seqcount_retry(&conf->gen_lock, seq)) {
5997	/ Might have got the wrong stripe_head by accident /
5998	ret = STRIPE_RETRY;
5999	goto out_release;
6000	}
6001
6002	if (test_bit(STRIPE_EXPANDING, &sh->state)) {
6003	md_wakeup_thread(mddev->thread);
6004	ret = STRIPE_SCHEDULE_AND_RETRY;
6005	goto out_release;
6006	}
6007
6008	if (!add_all_stripe_bios(conf, ctx, sh, bi, forwrite: rw, previous)) {
6009	ret = STRIPE_RETRY;
6010	goto out;
6011	}
6012
6013	if (stripe_can_batch(sh)) {
6014	stripe_add_to_batch_list(conf, sh, last_sh: ctx->batch_last);
6015	if (ctx->batch_last)
6016	raid5_release_stripe(sh: ctx->batch_last);
6017	atomic_inc(v: &sh->count);
6018	ctx->batch_last = sh;
6019	}
6020
6021	if (ctx->do_flush) {
6022	set_bit(nr: STRIPE_R5C_PREFLUSH, addr: &sh->state);
6023	/ we only need flush for one stripe /
6024	ctx->do_flush = false;
6025	}
6026
6027	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
6028	clear_bit(nr: STRIPE_DELAYED, addr: &sh->state);
6029	if ((!sh->batch_head \|\| sh == sh->batch_head) &&
6030	(bi->bi_opf & REQ_SYNC) &&
6031	!test_and_set_bit(nr: STRIPE_PREREAD_ACTIVE, addr: &sh->state))
6032	atomic_inc(v: &conf->preread_active_stripes);
6033
6034	release_stripe_plug(mddev, sh);
6035	return STRIPE_SUCCESS;
6036
6037	out_release:
6038	raid5_release_stripe(sh);
6039	out:
6040	if (ret == STRIPE_SCHEDULE_AND_RETRY && reshape_interrupted(mddev)) {
6041	bi->bi_status = BLK_STS_RESOURCE;
6042	ret = STRIPE_WAIT_RESHAPE;
6043	pr_err_ratelimited("dm-raid456: io across reshape position while reshape can't make progress");
6044	}
6045	return ret;
6046	}
6047
6048	/*
6049	* If the bio covers multiple data disks, find sector within the bio that has
6050	* the lowest chunk offset in the first chunk.
6051	*/
6052	static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf,
6053	struct bio *bi)
6054	{
6055	int sectors_per_chunk = conf->chunk_sectors;
6056	int raid_disks = conf->raid_disks;
6057	int dd_idx;
6058	struct stripe_head sh;
6059	unsigned int chunk_offset;
6060	sector_t r_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-`1`);
6061	sector_t sector;
6062
6063	/ We pass in fake stripe_head to get back parity disk numbers /
6064	sector = raid5_compute_sector(conf, r_sector, previous: `0`, dd_idx: &dd_idx, sh: &sh);
6065	chunk_offset = sector_div(sector, sectors_per_chunk);
6066	if (sectors_per_chunk - chunk_offset >= bio_sectors(bi))
6067	return r_sector;
6068	/*
6069	* Bio crosses to the next data disk. Check whether it's in the same
6070	* chunk.
6071	*/
6072	dd_idx++;
6073	while (dd_idx == sh.pd_idx \|\| dd_idx == sh.qd_idx)
6074	dd_idx++;
6075	if (dd_idx >= raid_disks)
6076	return r_sector;
6077	return r_sector + sectors_per_chunk - chunk_offset;
6078	}
6079
6080	static bool raid5_make_request(struct mddev mddev, struct* bio * bi)
6081	{
6082	DEFINE_WAIT_FUNC(wait, woken_wake_function);
6083	bool on_wq;
6084	struct r5conf *conf = mddev->private;
6085	sector_t logical_sector;
6086	struct stripe_request_ctx ctx = {};
6087	const int rw = bio_data_dir(bi);
6088	enum stripe_result res;
6089	int s, stripe_cnt;
6090
6091	if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
6092	int ret = log_handle_flush_request(conf, bio: bi);
6093
6094	if (ret == `0`)
6095	return true;
6096	if (ret == -ENODEV) {
6097	if (md_flush_request(mddev, bio: bi))
6098	return true;
6099	}
6100	/ ret == -EAGAIN, fallback /
6101	/*
6102	* if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
6103	* we need to flush journal device
6104	*/
6105	ctx.do_flush = bi->bi_opf & REQ_PREFLUSH;
6106	}
6107
6108	md_write_start(mddev, bi);
6109	/*
6110	* If array is degraded, better not do chunk aligned read because
6111	* later we might have to read it again in order to reconstruct
6112	* data on failed drives.
6113	*/
6114	if (rw == READ && mddev->degraded == `0` &&
6115	mddev->reshape_position == MaxSector) {
6116	bi = chunk_aligned_read(mddev, raid_bio: bi);
6117	if (!bi)
6118	return true;
6119	}
6120
6121	if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
6122	make_discard_request(mddev, bi);
6123	md_write_end(mddev);
6124	return true;
6125	}
6126
6127	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-`1`);
6128	ctx.first_sector = logical_sector;
6129	ctx.last_sector = bio_end_sector(bi);
6130	bi->bi_next = NULL;
6131
6132	stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector,
6133	RAID5_STRIPE_SECTORS(conf));
6134	bitmap_set(map: ctx.sectors_to_do, start: `0`, nbits: stripe_cnt);
6135
6136	pr_debug("raid456: %s, logical %llu to %llu\n", __func__,
6137	bi->bi_iter.bi_sector, ctx.last_sector);
6138
6139	/ Bail out if conflicts with reshape and REQ_NOWAIT is set /
6140	if ((bi->bi_opf & REQ_NOWAIT) &&
6141	get_reshape_loc(mddev, conf, logical_sector) == LOC_INSIDE_RESHAPE) {
6142	bio_wouldblock_error(bio: bi);
6143	if (rw == WRITE)
6144	md_write_end(mddev);
6145	return true;
6146	}
6147	md_account_bio(mddev, bio: &bi);
6148
6149	/*
6150	* Lets start with the stripe with the lowest chunk offset in the first
6151	* chunk. That has the best chances of creating IOs adjacent to
6152	* previous IOs in case of sequential IO and thus creates the most
6153	* sequential IO pattern. We don't bother with the optimization when
6154	* reshaping as the performance benefit is not worth the complexity.
6155	*/
6156	if (likely(conf->reshape_progress == MaxSector)) {
6157	logical_sector = raid5_bio_lowest_chunk_sector(conf, bi);
6158	on_wq = false;
6159	} else {
6160	add_wait_queue(wq_head: &conf->wait_for_reshape, wq_entry: &wait);
6161	on_wq = true;
6162	}
6163	s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf);
6164
6165	while (`1`) {
6166	res = make_stripe_request(mddev, conf, ctx: &ctx, logical_sector,
6167	bi);
6168	if (res == STRIPE_FAIL \|\| res == STRIPE_WAIT_RESHAPE)
6169	break;
6170
6171	if (res == STRIPE_RETRY)
6172	continue;
6173
6174	if (res == STRIPE_SCHEDULE_AND_RETRY) {
6175	WARN_ON_ONCE(!on_wq);
6176	/*
6177	* Must release the reference to batch_last before
6178	* scheduling and waiting for work to be done,
6179	* otherwise the batch_last stripe head could prevent
6180	* raid5_activate_delayed() from making progress
6181	* and thus deadlocking.
6182	*/
6183	if (ctx.batch_last) {
6184	raid5_release_stripe(sh: ctx.batch_last);
6185	ctx.batch_last = NULL;
6186	}
6187
6188	wait_woken(wq_entry: &wait, TASK_UNINTERRUPTIBLE,
6189	MAX_SCHEDULE_TIMEOUT);
6190	continue;
6191	}
6192
6193	s = find_next_bit_wrap(addr: ctx.sectors_to_do, size: stripe_cnt, offset: s);
6194	if (s == stripe_cnt)
6195	break;
6196
6197	logical_sector = ctx.first_sector +
6198	(s << RAID5_STRIPE_SHIFT(conf));
6199	}
6200	if (unlikely(on_wq))
6201	remove_wait_queue(wq_head: &conf->wait_for_reshape, wq_entry: &wait);
6202
6203	if (ctx.batch_last)
6204	raid5_release_stripe(sh: ctx.batch_last);
6205
6206	if (rw == WRITE)
6207	md_write_end(mddev);
6208	if (res == STRIPE_WAIT_RESHAPE) {
6209	md_free_cloned_bio(bio: bi);
6210	return false;
6211	}
6212
6213	bio_endio(bi);
6214	return true;
6215	}
6216
6217	static sector_t raid5_size(struct mddev mddev, sector_t sectors, int* raid_disks);
6218
6219	static sector_t reshape_request(struct mddev mddev, sector_t sector_nr, int* *skipped)
6220	{
6221	/ reshaping is quite different to recovery/resync so it is*
6222	* handled quite separately ... here.
6223	*
6224	* On each call to sync_request, we gather one chunk worth of
6225	* destination stripes and flag them as expanding.
6226	* Then we find all the source stripes and request reads.
6227	* As the reads complete, handle_stripe will copy the data
6228	* into the destination stripe and release that stripe.
6229	*/
6230	struct r5conf *conf = mddev->private;
6231	struct stripe_head *sh;
6232	struct md_rdev *rdev;
6233	sector_t first_sector, last_sector;
6234	int raid_disks = conf->previous_raid_disks;
6235	int data_disks = raid_disks - conf->max_degraded;
6236	int new_data_disks = conf->raid_disks - conf->max_degraded;
6237	int i;
6238	int dd_idx;
6239	sector_t writepos, readpos, safepos;
6240	sector_t stripe_addr;
6241	int reshape_sectors;
6242	struct list_head stripes;
6243	sector_t retn;
6244
6245	if (sector_nr == `0`) {
6246	/ If restarting in the middle, skip the initial sectors /
6247	if (mddev->reshape_backwards &&
6248	conf->reshape_progress < raid5_size(mddev, sectors: `0`, raid_disks: `0`)) {
6249	sector_nr = raid5_size(mddev, sectors: `0`, raid_disks: `0`)
6250	- conf->reshape_progress;
6251	} else if (mddev->reshape_backwards &&
6252	conf->reshape_progress == MaxSector) {
6253	/ shouldn't happen, but just in case, finish up./
6254	sector_nr = MaxSector;
6255	} else if (!mddev->reshape_backwards &&
6256	conf->reshape_progress > `0`)
6257	sector_nr = conf->reshape_progress;
6258	sector_div(sector_nr, new_data_disks);
6259	if (sector_nr) {
6260	mddev->curr_resync_completed = sector_nr;
6261	sysfs_notify_dirent_safe(sd: mddev->sysfs_completed);
6262	*skipped = `1`;
6263	retn = sector_nr;
6264	goto finish;
6265	}
6266	}
6267
6268	/ We need to process a full chunk at a time.*
6269	* If old and new chunk sizes differ, we need to process the
6270	* largest of these
6271	*/
6272
6273	reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
6274
6275	/ We update the metadata at least every 10 seconds, or when*
6276	* the data about to be copied would over-write the source of
6277	* the data at the front of the range. i.e. one new_stripe
6278	* along from reshape_progress new_maps to after where
6279	* reshape_safe old_maps to
6280	*/
6281	writepos = conf->reshape_progress;
6282	sector_div(writepos, new_data_disks);
6283	readpos = conf->reshape_progress;
6284	sector_div(readpos, data_disks);
6285	safepos = conf->reshape_safe;
6286	sector_div(safepos, data_disks);
6287	if (mddev->reshape_backwards) {
6288	if (WARN_ON(writepos < reshape_sectors))
6289	return MaxSector;
6290
6291	writepos -= reshape_sectors;
6292	readpos += reshape_sectors;
6293	safepos += reshape_sectors;
6294	} else {
6295	writepos += reshape_sectors;
6296	/ readpos and safepos are worst-case calculations.*
6297	* A negative number is overly pessimistic, and causes
6298	* obvious problems for unsigned storage. So clip to 0.
6299	*/
6300	readpos -= min_t(sector_t, reshape_sectors, readpos);
6301	safepos -= min_t(sector_t, reshape_sectors, safepos);
6302	}
6303
6304	/ Having calculated the 'writepos' possibly use it*
6305	* to set 'stripe_addr' which is where we will write to.
6306	*/
6307	if (mddev->reshape_backwards) {
6308	if (WARN_ON(conf->reshape_progress == `0`))
6309	return MaxSector;
6310
6311	stripe_addr = writepos;
6312	if (WARN_ON((mddev->dev_sectors &
6313	~((sector_t)reshape_sectors - `1`)) -
6314	reshape_sectors - stripe_addr != sector_nr))
6315	return MaxSector;
6316	} else {
6317	if (WARN_ON(writepos != sector_nr + reshape_sectors))
6318	return MaxSector;
6319
6320	stripe_addr = sector_nr;
6321	}
6322
6323	/ 'writepos' is the most advanced device address we might write.*
6324	* 'readpos' is the least advanced device address we might read.
6325	* 'safepos' is the least address recorded in the metadata as having
6326	* been reshaped.
6327	* If there is a min_offset_diff, these are adjusted either by
6328	* increasing the safepos/readpos if diff is negative, or
6329	* increasing writepos if diff is positive.
6330	* If 'readpos' is then behind 'writepos', there is no way that we can
6331	* ensure safety in the face of a crash - that must be done by userspace
6332	* making a backup of the data. So in that case there is no particular
6333	* rush to update metadata.
6334	* Otherwise if 'safepos' is behind 'writepos', then we really need to
6335	* update the metadata to advance 'safepos' to match 'readpos' so that
6336	* we can be safe in the event of a crash.
6337	* So we insist on updating metadata if safepos is behind writepos and
6338	* readpos is beyond writepos.
6339	* In any case, update the metadata every 10 seconds.
6340	* Maybe that number should be configurable, but I'm not sure it is
6341	* worth it.... maybe it could be a multiple of safemode_delay???
6342	*/
6343	if (conf->min_offset_diff < `0`) {
6344	safepos += -conf->min_offset_diff;
6345	readpos += -conf->min_offset_diff;
6346	} else
6347	writepos += conf->min_offset_diff;
6348
6349	if ((mddev->reshape_backwards
6350	? (safepos > writepos && readpos < writepos)
6351	: (safepos < writepos && readpos > writepos)) \|\|
6352	time_after(jiffies, conf->reshape_checkpoint + `10`*HZ)) {
6353	/ Cannot proceed until we've updated the superblock... /
6354	wait_event(conf->wait_for_reshape,
6355	atomic_read(&conf->reshape_stripes)==`0`
6356	\|\| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6357	if (atomic_read(v: &conf->reshape_stripes) != `0`)
6358	return `0`;
6359	mddev->reshape_position = conf->reshape_progress;
6360	mddev->curr_resync_completed = sector_nr;
6361	if (!mddev->reshape_backwards)
6362	/ Can update recovery_offset /
6363	rdev_for_each(rdev, mddev)
6364	if (rdev->raid_disk >= `0` &&
6365	!test_bit(Journal, &rdev->flags) &&
6366	!test_bit(In_sync, &rdev->flags) &&
6367	rdev->recovery_offset < sector_nr)
6368	rdev->recovery_offset = sector_nr;
6369
6370	conf->reshape_checkpoint = jiffies;
6371	set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags);
6372	md_wakeup_thread(mddev->thread);
6373	wait_event(mddev->sb_wait, mddev->sb_flags == `0` \|\|
6374	test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6375	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6376	return `0`;
6377	spin_lock_irq(lock: &conf->device_lock);
6378	conf->reshape_safe = mddev->reshape_position;
6379	spin_unlock_irq(lock: &conf->device_lock);
6380	wake_up(&conf->wait_for_reshape);
6381	sysfs_notify_dirent_safe(sd: mddev->sysfs_completed);
6382	}
6383
6384	INIT_LIST_HEAD(list: &stripes);
6385	for (i = `0`; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
6386	int j;
6387	int skipped_disk = `0`;
6388	sh = raid5_get_active_stripe(conf, NULL, sector: stripe_addr+i,
6389	R5_GAS_NOQUIESCE);
6390	set_bit(nr: STRIPE_EXPANDING, addr: &sh->state);
6391	atomic_inc(v: &conf->reshape_stripes);
6392	/ If any of this stripe is beyond the end of the old*
6393	* array, then we need to zero those blocks
6394	*/
6395	for (j=sh->disks; j--;) {
6396	sector_t s;
6397	if (j == sh->pd_idx)
6398	continue;
6399	if (conf->level == `6` &&
6400	j == sh->qd_idx)
6401	continue;
6402	s = raid5_compute_blocknr(sh, i: j, previous: `0`);
6403	if (s < raid5_size(mddev, sectors: `0`, raid_disks: `0`)) {
6404	skipped_disk = `1`;
6405	continue;
6406	}
6407	memset(page_address(sh->dev[j].page), `0`, RAID5_STRIPE_SIZE(conf));
6408	set_bit(nr: R5_Expanded, addr: &sh->dev[j].flags);
6409	set_bit(nr: R5_UPTODATE, addr: &sh->dev[j].flags);
6410	}
6411	if (!skipped_disk) {
6412	set_bit(nr: STRIPE_EXPAND_READY, addr: &sh->state);
6413	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
6414	}
6415	list_add(new: &sh->lru, head: &stripes);
6416	}
6417	spin_lock_irq(lock: &conf->device_lock);
6418	if (mddev->reshape_backwards)
6419	conf->reshape_progress -= reshape_sectors * new_data_disks;
6420	else
6421	conf->reshape_progress += reshape_sectors * new_data_disks;
6422	spin_unlock_irq(lock: &conf->device_lock);
6423	/ Ok, those stripe are ready. We can start scheduling*
6424	* reads on the source stripes.
6425	* The source stripes are determined by mapping the first and last
6426	* block on the destination stripes.
6427	*/
6428	first_sector =
6429	raid5_compute_sector(conf, r_sector: stripe_addr*(new_data_disks),
6430	previous: `1`, dd_idx: &dd_idx, NULL);
6431	last_sector =
6432	raid5_compute_sector(conf, r_sector: ((stripe_addr+reshape_sectors)
6433	* new_data_disks - `1`),
6434	previous: `1`, dd_idx: &dd_idx, NULL);
6435	if (last_sector >= mddev->dev_sectors)
6436	last_sector = mddev->dev_sectors - `1`;
6437	while (first_sector <= last_sector) {
6438	sh = raid5_get_active_stripe(conf, NULL, sector: first_sector,
6439	R5_GAS_PREVIOUS \| R5_GAS_NOQUIESCE);
6440	set_bit(nr: STRIPE_EXPAND_SOURCE, addr: &sh->state);
6441	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
6442	raid5_release_stripe(sh);
6443	first_sector += RAID5_STRIPE_SECTORS(conf);
6444	}
6445	/ Now that the sources are clearly marked, we can release*
6446	* the destination stripes
6447	*/
6448	while (!list_empty(head: &stripes)) {
6449	sh = list_entry(stripes.next, struct stripe_head, lru);
6450	list_del_init(entry: &sh->lru);
6451	raid5_release_stripe(sh);
6452	}
6453	/ If this takes us to the resync_max point where we have to pause,*
6454	* then we need to write out the superblock.
6455	*/
6456	sector_nr += reshape_sectors;
6457	retn = reshape_sectors;
6458	finish:
6459	if (mddev->curr_resync_completed > mddev->resync_max \|\|
6460	(sector_nr - mddev->curr_resync_completed) * `2`
6461	>= mddev->resync_max - mddev->curr_resync_completed) {
6462	/ Cannot proceed until we've updated the superblock... /
6463	wait_event(conf->wait_for_reshape,
6464	atomic_read(&conf->reshape_stripes) == `0`
6465	\|\| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6466	if (atomic_read(v: &conf->reshape_stripes) != `0`)
6467	goto ret;
6468	mddev->reshape_position = conf->reshape_progress;
6469	mddev->curr_resync_completed = sector_nr;
6470	if (!mddev->reshape_backwards)
6471	/ Can update recovery_offset /
6472	rdev_for_each(rdev, mddev)
6473	if (rdev->raid_disk >= `0` &&
6474	!test_bit(Journal, &rdev->flags) &&
6475	!test_bit(In_sync, &rdev->flags) &&
6476	rdev->recovery_offset < sector_nr)
6477	rdev->recovery_offset = sector_nr;
6478	conf->reshape_checkpoint = jiffies;
6479	set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags);
6480	md_wakeup_thread(mddev->thread);
6481	wait_event(mddev->sb_wait,
6482	!test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
6483	\|\| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6484	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6485	goto ret;
6486	spin_lock_irq(lock: &conf->device_lock);
6487	conf->reshape_safe = mddev->reshape_position;
6488	spin_unlock_irq(lock: &conf->device_lock);
6489	wake_up(&conf->wait_for_reshape);
6490	sysfs_notify_dirent_safe(sd: mddev->sysfs_completed);
6491	}
6492	ret:
6493	return retn;
6494	}
6495
6496	static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
6497	sector_t max_sector, int *skipped)
6498	{
6499	struct r5conf *conf = mddev->private;
6500	struct stripe_head *sh;
6501	sector_t sync_blocks;
6502	bool still_degraded = false;
6503	int i;
6504
6505	if (sector_nr >= max_sector) {
6506	/ just being told to finish up .. nothing much to do /
6507
6508	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
6509	end_reshape(conf);
6510	return `0`;
6511	}
6512
6513	if (mddev->curr_resync < max_sector) / aborted /
6514	md_bitmap_end_sync(mddev, offset: mddev->curr_resync,
6515	blocks: &sync_blocks);
6516	else / completed sync /
6517	conf->fullsync = `0`;
6518	if (md_bitmap_enabled(mddev, flush: false))
6519	mddev->bitmap_ops->close_sync(mddev);
6520
6521	return `0`;
6522	}
6523
6524	/ Allow raid5_quiesce to complete /
6525	wait_event(conf->wait_for_reshape, conf->quiesce != `2`);
6526
6527	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6528	return reshape_request(mddev, sector_nr, skipped);
6529
6530	/ No need to check resync_max as we never do more than one*
6531	* stripe, and as resync_max will always be on a chunk boundary,
6532	* if the check in md_do_sync didn't fire, there is no chance
6533	* of overstepping resync_max here
6534	*/
6535
6536	/ if there is too many failed drives and we are trying*
6537	* to resync, then assert that we are finished, because there is
6538	* nothing we can do.
6539	*/
6540	if (mddev->degraded >= conf->max_degraded &&
6541	test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6542	sector_t rv = mddev->dev_sectors - sector_nr;
6543	*skipped = `1`;
6544	return rv;
6545	}
6546	if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
6547	!conf->fullsync &&
6548	!md_bitmap_start_sync(mddev, offset: sector_nr, blocks: &sync_blocks, degraded: true) &&
6549	sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
6550	/ we can skip this block, and probably more /
6551	do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
6552	*skipped = `1`;
6553	/ keep things rounded to whole stripes /
6554	return sync_blocks * RAID5_STRIPE_SECTORS(conf);
6555	}
6556
6557	if (md_bitmap_enabled(mddev, flush: false))
6558	mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false);
6559
6560	sh = raid5_get_active_stripe(conf, NULL, sector: sector_nr,
6561	R5_GAS_NOBLOCK);
6562	if (sh == NULL) {
6563	sh = raid5_get_active_stripe(conf, NULL, sector: sector_nr, flags: `0`);
6564	/ make sure we don't swamp the stripe cache if someone else*
6565	* is trying to get access
6566	*/
6567	schedule_timeout_uninterruptible(timeout: `1`);
6568	}
6569	/ Need to check if array will still be degraded after recovery/resync*
6570	* Note in case of > 1 drive failures it's possible we're rebuilding
6571	* one drive while leaving another faulty drive in array.
6572	*/
6573	for (i = `0`; i < conf->raid_disks; i++) {
6574	struct md_rdev *rdev = conf->disks[i].rdev;
6575
6576	if (rdev == NULL \|\| test_bit(Faulty, &rdev->flags))
6577	still_degraded = true;
6578	}
6579
6580	md_bitmap_start_sync(mddev, offset: sector_nr, blocks: &sync_blocks, degraded: still_degraded);
6581	set_bit(nr: STRIPE_SYNC_REQUESTED, addr: &sh->state);
6582	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
6583
6584	raid5_release_stripe(sh);
6585
6586	return RAID5_STRIPE_SECTORS(conf);
6587	}
6588
6589	static int retry_aligned_read(struct r5conf conf, struct* bio *raid_bio,
6590	unsigned int offset)
6591	{
6592	/ We may not be able to submit a whole bio at once as there*
6593	* may not be enough stripe_heads available.
6594	* We cannot pre-allocate enough stripe_heads as we may need
6595	* more than exist in the cache (if we allow ever large chunks).
6596	* So we do one stripe head at a time and record in
6597	* ->bi_hw_segments how many have been done.
6598	*
6599	* We know that this entire raid_bio is in one chunk, so
6600	* it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
6601	*/
6602	struct stripe_head *sh;
6603	int dd_idx;
6604	sector_t sector, logical_sector, last_sector;
6605	int scnt = `0`;
6606	int handled = `0`;
6607
6608	logical_sector = raid_bio->bi_iter.bi_sector &
6609	~((sector_t)RAID5_STRIPE_SECTORS(conf)-`1`);
6610	sector = raid5_compute_sector(conf, r_sector: logical_sector,
6611	previous: `0`, dd_idx: &dd_idx, NULL);
6612	last_sector = bio_end_sector(raid_bio);
6613
6614	for (; logical_sector < last_sector;
6615	logical_sector += RAID5_STRIPE_SECTORS(conf),
6616	sector += RAID5_STRIPE_SECTORS(conf),
6617	scnt++) {
6618
6619	if (scnt < offset)
6620	/ already done this stripe /
6621	continue;
6622
6623	sh = raid5_get_active_stripe(conf, NULL, sector,
6624	R5_GAS_NOBLOCK \| R5_GAS_NOQUIESCE);
6625	if (!sh) {
6626	/ failed to get a stripe - must wait /
6627	conf->retry_read_aligned = raid_bio;
6628	conf->retry_read_offset = scnt;
6629	return handled;
6630	}
6631
6632	if (!add_stripe_bio(sh, bi: raid_bio, dd_idx, forwrite: `0`, previous: `0`)) {
6633	raid5_release_stripe(sh);
6634	conf->retry_read_aligned = raid_bio;
6635	conf->retry_read_offset = scnt;
6636	return handled;
6637	}
6638
6639	set_bit(nr: R5_ReadNoMerge, addr: &sh->dev[dd_idx].flags);
6640	handle_stripe(sh);
6641	raid5_release_stripe(sh);
6642	handled++;
6643	}
6644
6645	bio_endio(raid_bio);
6646
6647	if (atomic_dec_and_test(v: &conf->active_aligned_reads))
6648	wake_up(&conf->wait_for_quiescent);
6649	return handled;
6650	}
6651
6652	static int handle_active_stripes(struct r5conf conf, int* group,
6653	struct r5worker *worker,
6654	struct list_head *temp_inactive_list)
6655	__must_hold(&conf->device_lock)
6656	{
6657	struct stripe_head batch[MAX_STRIPE_BATCH], sh;
6658	int i, batch_size = `0`, hash;
6659	bool release_inactive = false;
6660
6661	while (batch_size < MAX_STRIPE_BATCH &&
6662	(sh = __get_priority_stripe(conf, group)) != NULL)
6663	batch[batch_size++] = sh;
6664
6665	if (batch_size == `0`) {
6666	for (i = `0`; i < NR_STRIPE_HASH_LOCKS; i++)
6667	if (!list_empty(head: temp_inactive_list + i))
6668	break;
6669	if (i == NR_STRIPE_HASH_LOCKS) {
6670	spin_unlock_irq(lock: &conf->device_lock);
6671	log_flush_stripe_to_raid(conf);
6672	spin_lock_irq(lock: &conf->device_lock);
6673	return batch_size;
6674	}
6675	release_inactive = true;
6676	}
6677	spin_unlock_irq(lock: &conf->device_lock);
6678
6679	release_inactive_stripe_list(conf, temp_inactive_list,
6680	NR_STRIPE_HASH_LOCKS);
6681
6682	r5l_flush_stripe_to_raid(log: conf->log);
6683	if (release_inactive) {
6684	spin_lock_irq(lock: &conf->device_lock);
6685	return `0`;
6686	}
6687
6688	for (i = `0`; i < batch_size; i++)
6689	handle_stripe(sh: batch[i]);
6690	log_write_stripe_run(conf);
6691
6692	cond_resched();
6693
6694	spin_lock_irq(lock: &conf->device_lock);
6695	for (i = `0`; i < batch_size; i++) {
6696	hash = batch[i]->hash_lock_index;
6697	__release_stripe(conf, sh: batch[i], temp_inactive_list: &temp_inactive_list[hash]);
6698	}
6699	return batch_size;
6700	}
6701
6702	static void raid5_do_work(struct work_struct *work)
6703	{
6704	struct r5worker worker = container_of(work, struct* r5worker, work);
6705	struct r5worker_group *group = worker->group;
6706	struct r5conf *conf = group->conf;
6707	struct mddev *mddev = conf->mddev;
6708	int group_id = group - conf->worker_groups;
6709	int handled;
6710	struct blk_plug plug;
6711
6712	pr_debug("+++ raid5worker active\n");
6713
6714	blk_start_plug(&plug);
6715	handled = `0`;
6716	spin_lock_irq(lock: &conf->device_lock);
6717	while (`1`) {
6718	int batch_size, released;
6719
6720	released = release_stripe_list(conf, temp_inactive_list: worker->temp_inactive_list);
6721
6722	batch_size = handle_active_stripes(conf, group: group_id, worker,
6723	temp_inactive_list: worker->temp_inactive_list);
6724	worker->working = false;
6725	if (!batch_size && !released)
6726	break;
6727	handled += batch_size;
6728	wait_event_lock_irq(mddev->sb_wait,
6729	!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6730	conf->device_lock);
6731	}
6732	pr_debug("%d stripes handled\n", handled);
6733
6734	spin_unlock_irq(lock: &conf->device_lock);
6735
6736	flush_deferred_bios(conf);
6737
6738	r5l_flush_stripe_to_raid(log: conf->log);
6739
6740	async_tx_issue_pending_all();
6741	blk_finish_plug(&plug);
6742
6743	pr_debug("--- raid5worker inactive\n");
6744	}
6745
6746	/*
6747	* This is our raid5 kernel thread.
6748	*
6749	* We scan the hash table for stripes which can be handled now.
6750	* During the scan, completed stripes are saved for us by the interrupt
6751	* handler, so that they will not have to wait for our next wakeup.
6752	*/
6753	static void raid5d(struct md_thread *thread)
6754	{
6755	struct mddev *mddev = thread->mddev;
6756	struct r5conf *conf = mddev->private;
6757	int handled;
6758	struct blk_plug plug;
6759
6760	pr_debug("+++ raid5d active\n");
6761
6762	md_check_recovery(mddev);
6763
6764	blk_start_plug(&plug);
6765	handled = `0`;
6766	spin_lock_irq(lock: &conf->device_lock);
6767	while (`1`) {
6768	struct bio *bio;
6769	int batch_size, released;
6770	unsigned int offset;
6771
6772	if (md_is_rdwr(mddev) &&
6773	test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
6774	break;
6775
6776	released = release_stripe_list(conf, temp_inactive_list: conf->temp_inactive_list);
6777	if (released)
6778	clear_bit(nr: R5_DID_ALLOC, addr: &conf->cache_state);
6779
6780	if (
6781	!list_empty(head: &conf->bitmap_list)) {
6782	/ Now is a good time to flush some bitmap updates /
6783	conf->seq_flush++;
6784	spin_unlock_irq(lock: &conf->device_lock);
6785	if (md_bitmap_enabled(mddev, flush: true))
6786	mddev->bitmap_ops->unplug(mddev, true);
6787	spin_lock_irq(lock: &conf->device_lock);
6788	conf->seq_write = conf->seq_flush;
6789	activate_bit_delay(conf, temp_inactive_list: conf->temp_inactive_list);
6790	}
6791	raid5_activate_delayed(conf);
6792
6793	while ((bio = remove_bio_from_retry(conf, offset: &offset))) {
6794	int ok;
6795	spin_unlock_irq(lock: &conf->device_lock);
6796	ok = retry_aligned_read(conf, raid_bio: bio, offset);
6797	spin_lock_irq(lock: &conf->device_lock);
6798	if (!ok)
6799	break;
6800	handled++;
6801	}
6802
6803	batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
6804	temp_inactive_list: conf->temp_inactive_list);
6805	if (!batch_size && !released)
6806	break;
6807	handled += batch_size;
6808
6809	if (mddev->sb_flags & ~(`1` << MD_SB_CHANGE_PENDING)) {
6810	spin_unlock_irq(lock: &conf->device_lock);
6811	md_check_recovery(mddev);
6812	spin_lock_irq(lock: &conf->device_lock);
6813	}
6814	}
6815	pr_debug("%d stripes handled\n", handled);
6816
6817	spin_unlock_irq(lock: &conf->device_lock);
6818	if (test_and_clear_bit(nr: R5_ALLOC_MORE, addr: &conf->cache_state) &&
6819	mutex_trylock(&conf->cache_size_mutex)) {
6820	grow_one_stripe(conf, __GFP_NOWARN);
6821	/ Set flag even if allocation failed. This helps*
6822	* slow down allocation requests when mem is short
6823	*/
6824	set_bit(nr: R5_DID_ALLOC, addr: &conf->cache_state);
6825	mutex_unlock(lock: &conf->cache_size_mutex);
6826	}
6827
6828	flush_deferred_bios(conf);
6829
6830	r5l_flush_stripe_to_raid(log: conf->log);
6831
6832	async_tx_issue_pending_all();
6833	blk_finish_plug(&plug);
6834
6835	pr_debug("--- raid5d inactive\n");
6836	}
6837
6838	static ssize_t
6839	raid5_show_stripe_cache_size(struct mddev mddev, char* *page)
6840	{
6841	struct r5conf *conf;
6842	int ret = `0`;
6843	spin_lock(lock: &mddev->lock);
6844	conf = mddev->private;
6845	if (conf)
6846	ret = sprintf(buf: page, fmt: "%d\n", conf->min_nr_stripes);
6847	spin_unlock(lock: &mddev->lock);
6848	return ret;
6849	}
6850
6851	int
6852	raid5_set_cache_size(struct mddev mddev, int* size)
6853	{
6854	int result = `0`;
6855	struct r5conf *conf = mddev->private;
6856
6857	if (size <= `16` \|\| size > `32768`)
6858	return -EINVAL;
6859
6860	WRITE_ONCE(conf->min_nr_stripes, size);
6861	mutex_lock(&conf->cache_size_mutex);
6862	while (size < conf->max_nr_stripes &&
6863	drop_one_stripe(conf))
6864	;
6865	mutex_unlock(lock: &conf->cache_size_mutex);
6866
6867	md_allow_write(mddev);
6868
6869	mutex_lock(&conf->cache_size_mutex);
6870	while (size > conf->max_nr_stripes)
6871	if (!grow_one_stripe(conf, GFP_KERNEL)) {
6872	WRITE_ONCE(conf->min_nr_stripes, conf->max_nr_stripes);
6873	result = -ENOMEM;
6874	break;
6875	}
6876	mutex_unlock(lock: &conf->cache_size_mutex);
6877
6878	return result;
6879	}
6880	EXPORT_SYMBOL(raid5_set_cache_size);
6881
6882	static ssize_t
6883	raid5_store_stripe_cache_size(struct mddev mddev, const* char *page, size_t len)
6884	{
6885	struct r5conf *conf;
6886	unsigned long new;
6887	int err;
6888
6889	if (len >= PAGE_SIZE)
6890	return -EINVAL;
6891	if (kstrtoul(s: page, base: `10`, res: &new))
6892	return -EINVAL;
6893	err = mddev_lock(mddev);
6894	if (err)
6895	return err;
6896	conf = mddev->private;
6897	if (!conf)
6898	err = -ENODEV;
6899	else
6900	err = raid5_set_cache_size(mddev, new);
6901	mddev_unlock(mddev);
6902
6903	return err ?: len;
6904	}
6905
6906	static struct md_sysfs_entry
6907	raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO \| S_IWUSR,
6908	raid5_show_stripe_cache_size,
6909	raid5_store_stripe_cache_size);
6910
6911	static ssize_t
6912	raid5_show_rmw_level(struct mddev mddev, char* *page)
6913	{
6914	struct r5conf *conf = mddev->private;
6915	if (conf)
6916	return sprintf(buf: page, fmt: "%d\n", conf->rmw_level);
6917	else
6918	return `0`;
6919	}
6920
6921	static ssize_t
6922	raid5_store_rmw_level(struct mddev mddev, const* char *page, size_t len)
6923	{
6924	struct r5conf *conf = mddev->private;
6925	unsigned long new;
6926
6927	if (!conf)
6928	return -ENODEV;
6929
6930	if (len >= PAGE_SIZE)
6931	return -EINVAL;
6932
6933	if (kstrtoul(s: page, base: `10`, res: &new))
6934	return -EINVAL;
6935
6936	if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
6937	return -EINVAL;
6938
6939	if (new != PARITY_DISABLE_RMW &&
6940	new != PARITY_ENABLE_RMW &&
6941	new != PARITY_PREFER_RMW)
6942	return -EINVAL;
6943
6944	conf->rmw_level = new;
6945	return len;
6946	}
6947
6948	static struct md_sysfs_entry
6949	raid5_rmw_level = __ATTR(rmw_level, S_IRUGO \| S_IWUSR,
6950	raid5_show_rmw_level,
6951	raid5_store_rmw_level);
6952
6953	static ssize_t
6954	raid5_show_stripe_size(struct mddev mddev, char* *page)
6955	{
6956	struct r5conf *conf;
6957	int ret = `0`;
6958
6959	spin_lock(lock: &mddev->lock);
6960	conf = mddev->private;
6961	if (conf)
6962	ret = sprintf(buf: page, fmt: "%lu\n", RAID5_STRIPE_SIZE(conf));
6963	spin_unlock(lock: &mddev->lock);
6964	return ret;
6965	}
6966
6967	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
6968	static ssize_t
6969	raid5_store_stripe_size(struct mddev mddev, const* char *page, size_t len)
6970	{
6971	struct r5conf *conf;
6972	unsigned long new;
6973	int err;
6974	int size;
6975
6976	if (len >= PAGE_SIZE)
6977	return -EINVAL;
6978	if (kstrtoul(page, `10`, &new))
6979	return -EINVAL;
6980
6981	/*
6982	* The value should not be bigger than PAGE_SIZE. It requires to
6983	* be multiple of DEFAULT_STRIPE_SIZE and the value should be power
6984	* of two.
6985	*/
6986	if (new % DEFAULT_STRIPE_SIZE != `0` \|\|
6987	new > PAGE_SIZE \|\| new == `0` \|\|
6988	new != roundup_pow_of_two(new))
6989	return -EINVAL;
6990
6991	err = mddev_suspend_and_lock(mddev);
6992	if (err)
6993	return err;
6994
6995	conf = mddev->private;
6996	if (!conf) {
6997	err = -ENODEV;
6998	goto out_unlock;
6999	}
7000
7001	if (new == conf->stripe_size)
7002	goto out_unlock;
7003
7004	pr_debug("md/raid: change stripe_size from %lu to %lu\n",
7005	conf->stripe_size, new);
7006
7007	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) \|\|
7008	mddev->reshape_position != MaxSector \|\| mddev->sysfs_active) {
7009	err = -EBUSY;
7010	goto out_unlock;
7011	}
7012
7013	mutex_lock(&conf->cache_size_mutex);
7014	size = conf->max_nr_stripes;
7015
7016	shrink_stripes(conf);
7017
7018	conf->stripe_size = new;
7019	conf->stripe_shift = ilog2(new) - `9`;
7020	conf->stripe_sectors = new >> `9`;
7021	if (grow_stripes(conf, size)) {
7022	pr_warn("md/raid:%s: couldn't allocate buffers\n",
7023	mdname(mddev));
7024	err = -ENOMEM;
7025	}
7026	mutex_unlock(&conf->cache_size_mutex);
7027
7028	out_unlock:
7029	mddev_unlock_and_resume(mddev);
7030	return err ?: len;
7031	}
7032
7033	static struct md_sysfs_entry
7034	raid5_stripe_size = __ATTR(stripe_size, `0644`,
7035	raid5_show_stripe_size,
7036	raid5_store_stripe_size);
7037	#else
7038	static struct md_sysfs_entry
7039	raid5_stripe_size = __ATTR(stripe_size, `0444`,
7040	raid5_show_stripe_size,
7041	NULL);
7042	#endif
7043
7044	static ssize_t
7045	raid5_show_preread_threshold(struct mddev mddev, char* *page)
7046	{
7047	struct r5conf *conf;
7048	int ret = `0`;
7049	spin_lock(lock: &mddev->lock);
7050	conf = mddev->private;
7051	if (conf)
7052	ret = sprintf(buf: page, fmt: "%d\n", conf->bypass_threshold);
7053	spin_unlock(lock: &mddev->lock);
7054	return ret;
7055	}
7056
7057	static ssize_t
7058	raid5_store_preread_threshold(struct mddev mddev, const* char *page, size_t len)
7059	{
7060	struct r5conf *conf;
7061	unsigned long new;
7062	int err;
7063
7064	if (len >= PAGE_SIZE)
7065	return -EINVAL;
7066	if (kstrtoul(s: page, base: `10`, res: &new))
7067	return -EINVAL;
7068
7069	err = mddev_lock(mddev);
7070	if (err)
7071	return err;
7072	conf = mddev->private;
7073	if (!conf)
7074	err = -ENODEV;
7075	else if (new > conf->min_nr_stripes)
7076	err = -EINVAL;
7077	else
7078	conf->bypass_threshold = new;
7079	mddev_unlock(mddev);
7080	return err ?: len;
7081	}
7082
7083	static struct md_sysfs_entry
7084	raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
7085	S_IRUGO \| S_IWUSR,
7086	raid5_show_preread_threshold,
7087	raid5_store_preread_threshold);
7088
7089	static ssize_t
7090	raid5_show_skip_copy(struct mddev mddev, char* *page)
7091	{
7092	struct r5conf *conf;
7093	int ret = `0`;
7094	spin_lock(lock: &mddev->lock);
7095	conf = mddev->private;
7096	if (conf)
7097	ret = sprintf(buf: page, fmt: "%d\n", conf->skip_copy);
7098	spin_unlock(lock: &mddev->lock);
7099	return ret;
7100	}
7101
7102	static ssize_t
7103	raid5_store_skip_copy(struct mddev mddev, const* char *page, size_t len)
7104	{
7105	struct r5conf *conf;
7106	unsigned long new;
7107	int err;
7108
7109	if (len >= PAGE_SIZE)
7110	return -EINVAL;
7111	if (kstrtoul(s: page, base: `10`, res: &new))
7112	return -EINVAL;
7113	new = !!new;
7114
7115	err = mddev_suspend_and_lock(mddev);
7116	if (err)
7117	return err;
7118	conf = mddev->private;
7119	if (!conf)
7120	err = -ENODEV;
7121	else if (new != conf->skip_copy) {
7122	struct request_queue *q = mddev->gendisk->queue;
7123	struct queue_limits lim = queue_limits_start_update(q);
7124
7125	conf->skip_copy = new;
7126	if (new)
7127	lim.features \|= BLK_FEAT_STABLE_WRITES;
7128	else
7129	lim.features &= ~BLK_FEAT_STABLE_WRITES;
7130	err = queue_limits_commit_update(q, lim: &lim);
7131	}
7132	mddev_unlock_and_resume(mddev);
7133	return err ?: len;
7134	}
7135
7136	static struct md_sysfs_entry
7137	raid5_skip_copy = __ATTR(skip_copy, S_IRUGO \| S_IWUSR,
7138	raid5_show_skip_copy,
7139	raid5_store_skip_copy);
7140
7141	static ssize_t
7142	stripe_cache_active_show(struct mddev mddev, char* *page)
7143	{
7144	struct r5conf *conf = mddev->private;
7145	if (conf)
7146	return sprintf(buf: page, fmt: "%d\n", atomic_read(v: &conf->active_stripes));
7147	else
7148	return `0`;
7149	}
7150
7151	static struct md_sysfs_entry
7152	raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
7153
7154	static ssize_t
7155	raid5_show_group_thread_cnt(struct mddev mddev, char* *page)
7156	{
7157	struct r5conf *conf;
7158	int ret = `0`;
7159	spin_lock(lock: &mddev->lock);
7160	conf = mddev->private;
7161	if (conf)
7162	ret = sprintf(buf: page, fmt: "%d\n", conf->worker_cnt_per_group);
7163	spin_unlock(lock: &mddev->lock);
7164	return ret;
7165	}
7166
7167	static int alloc_thread_groups(struct r5conf conf, int* cnt,
7168	int *group_cnt,
7169	struct r5worker_group **worker_groups);
7170	static ssize_t
7171	raid5_store_group_thread_cnt(struct mddev mddev, const* char *page, size_t len)
7172	{
7173	struct r5conf *conf;
7174	unsigned int new;
7175	int err;
7176	struct r5worker_group new_groups, old_groups;
7177	int group_cnt;
7178
7179	if (len >= PAGE_SIZE)
7180	return -EINVAL;
7181	if (kstrtouint(s: page, base: `10`, res: &new))
7182	return -EINVAL;
7183	/ 8192 should be big enough /
7184	if (new > `8192`)
7185	return -EINVAL;
7186
7187	err = mddev_suspend_and_lock(mddev);
7188	if (err)
7189	return err;
7190	conf = mddev->private;
7191	if (!conf) {
7192	mddev_unlock_and_resume(mddev);
7193	return -ENODEV;
7194	}
7195	raid5_quiesce(mddev, quiesce: true);
7196
7197	if (new != conf->worker_cnt_per_group) {
7198	old_groups = conf->worker_groups;
7199	if (old_groups)
7200	flush_workqueue(raid5_wq);
7201
7202	err = alloc_thread_groups(conf, cnt: new, group_cnt: &group_cnt, worker_groups: &new_groups);
7203	if (!err) {
7204	spin_lock_irq(lock: &conf->device_lock);
7205	conf->group_cnt = group_cnt;
7206	conf->worker_cnt_per_group = new;
7207	conf->worker_groups = new_groups;
7208	spin_unlock_irq(lock: &conf->device_lock);
7209
7210	if (old_groups)
7211	kfree(objp: old_groups[`0`].workers);
7212	kfree(objp: old_groups);
7213	}
7214	}
7215
7216	raid5_quiesce(mddev, quiesce: false);
7217	mddev_unlock_and_resume(mddev);
7218
7219	return err ?: len;
7220	}
7221
7222	static struct md_sysfs_entry
7223	raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO \| S_IWUSR,
7224	raid5_show_group_thread_cnt,
7225	raid5_store_group_thread_cnt);
7226
7227	static struct attribute *raid5_attrs[] = {
7228	&raid5_stripecache_size.attr,
7229	&raid5_stripecache_active.attr,
7230	&raid5_preread_bypass_threshold.attr,
7231	&raid5_group_thread_cnt.attr,
7232	&raid5_skip_copy.attr,
7233	&raid5_rmw_level.attr,
7234	&raid5_stripe_size.attr,
7235	&r5c_journal_mode.attr,
7236	&ppl_write_hint.attr,
7237	NULL,
7238	};
7239	static const struct attribute_group raid5_attrs_group = {
7240	.name = NULL,
7241	.attrs = raid5_attrs,
7242	};
7243
7244	static int alloc_thread_groups(struct r5conf conf, int* cnt, int *group_cnt,
7245	struct r5worker_group **worker_groups)
7246	{
7247	int i, j, k;
7248	ssize_t size;
7249	struct r5worker *workers;
7250
7251	if (cnt == `0`) {
7252	*group_cnt = `0`;
7253	*worker_groups = NULL;
7254	return `0`;
7255	}
7256	*group_cnt = num_possible_nodes();
7257	size = sizeof(struct r5worker) * cnt;
7258	workers = kcalloc(size, *group_cnt, GFP_NOIO);
7259	worker_groups = kcalloc(group_cnt, sizeof(struct r5worker_group),
7260	GFP_NOIO);
7261	if (!*worker_groups \|\| !workers) {
7262	kfree(objp: workers);
7263	kfree(objp: *worker_groups);
7264	return -ENOMEM;
7265	}
7266
7267	for (i = `0`; i < *group_cnt; i++) {
7268	struct r5worker_group *group;
7269
7270	group = &(*worker_groups)[i];
7271	INIT_LIST_HEAD(list: &group->handle_list);
7272	INIT_LIST_HEAD(list: &group->loprio_list);
7273	group->conf = conf;
7274	group->workers = workers + i * cnt;
7275
7276	for (j = `0`; j < cnt; j++) {
7277	struct r5worker *worker = group->workers + j;
7278	worker->group = group;
7279	INIT_WORK(&worker->work, raid5_do_work);
7280
7281	for (k = `0`; k < NR_STRIPE_HASH_LOCKS; k++)
7282	INIT_LIST_HEAD(list: worker->temp_inactive_list + k);
7283	}
7284	}
7285
7286	return `0`;
7287	}
7288
7289	static void free_thread_groups(struct r5conf *conf)
7290	{
7291	if (conf->worker_groups)
7292	kfree(objp: conf->worker_groups[`0`].workers);
7293	kfree(objp: conf->worker_groups);
7294	conf->worker_groups = NULL;
7295	}
7296
7297	static sector_t
7298	raid5_size(struct mddev mddev, sector_t sectors, int* raid_disks)
7299	{
7300	struct r5conf *conf = mddev->private;
7301
7302	if (!sectors)
7303	sectors = mddev->dev_sectors;
7304	if (!raid_disks)
7305	/ size is defined by the smallest of previous and new size /
7306	raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
7307
7308	sectors &= ~((sector_t)conf->chunk_sectors - `1`);
7309	sectors &= ~((sector_t)conf->prev_chunk_sectors - `1`);
7310	return sectors * (raid_disks - conf->max_degraded);
7311	}
7312
7313	static void free_scratch_buffer(struct r5conf conf, struct* raid5_percpu *percpu)
7314	{
7315	safe_put_page(p: percpu->spare_page);
7316	percpu->spare_page = NULL;
7317	kvfree(addr: percpu->scribble);
7318	percpu->scribble = NULL;
7319	}
7320
7321	static int alloc_scratch_buffer(struct r5conf conf, struct* raid5_percpu *percpu)
7322	{
7323	if (conf->level == `6` && !percpu->spare_page) {
7324	percpu->spare_page = alloc_page(GFP_KERNEL);
7325	if (!percpu->spare_page)
7326	return -ENOMEM;
7327	}
7328
7329	if (scribble_alloc(percpu,
7330	max(conf->raid_disks,
7331	conf->previous_raid_disks),
7332	max(conf->chunk_sectors,
7333	conf->prev_chunk_sectors)
7334	/ RAID5_STRIPE_SECTORS(conf))) {
7335	free_scratch_buffer(conf, percpu);
7336	return -ENOMEM;
7337	}
7338
7339	local_lock_init(&percpu->lock);
7340	return `0`;
7341	}
7342
7343	static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node)
7344	{
7345	struct r5conf conf = hlist_entry_safe(node, struct* r5conf, node);
7346
7347	free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
7348	return `0`;
7349	}
7350
7351	static void raid5_free_percpu(struct r5conf *conf)
7352	{
7353	if (!conf->percpu)
7354	return;
7355
7356	cpuhp_state_remove_instance(state: CPUHP_MD_RAID5_PREPARE, node: &conf->node);
7357	free_percpu(pdata: conf->percpu);
7358	}
7359
7360	static void free_conf(struct r5conf *conf)
7361	{
7362	int i;
7363
7364	log_exit(conf);
7365
7366	shrinker_free(shrinker: conf->shrinker);
7367	free_thread_groups(conf);
7368	shrink_stripes(conf);
7369	raid5_free_percpu(conf);
7370	for (i = `0`; i < conf->pool_size; i++)
7371	if (conf->disks[i].extra_page)
7372	put_page(page: conf->disks[i].extra_page);
7373	kfree(objp: conf->disks);
7374	bioset_exit(&conf->bio_split);
7375	kfree(objp: conf->stripe_hashtbl);
7376	kfree(objp: conf->pending_data);
7377	kfree(objp: conf);
7378	}
7379
7380	static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
7381	{
7382	struct r5conf conf = hlist_entry_safe(node, struct* r5conf, node);
7383	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
7384
7385	if (alloc_scratch_buffer(conf, percpu)) {
7386	pr_warn("%s: failed memory allocation for cpu%u\n",
7387	__func__, cpu);
7388	return -ENOMEM;
7389	}
7390	return `0`;
7391	}
7392
7393	static int raid5_alloc_percpu(struct r5conf *conf)
7394	{
7395	int err = `0`;
7396
7397	conf->percpu = alloc_percpu(struct raid5_percpu);
7398	if (!conf->percpu)
7399	return -ENOMEM;
7400
7401	err = cpuhp_state_add_instance(state: CPUHP_MD_RAID5_PREPARE, node: &conf->node);
7402	if (!err) {
7403	conf->scribble_disks = max(conf->raid_disks,
7404	conf->previous_raid_disks);
7405	conf->scribble_sectors = max(conf->chunk_sectors,
7406	conf->prev_chunk_sectors);
7407	}
7408	return err;
7409	}
7410
7411	static unsigned long raid5_cache_scan(struct shrinker *shrink,
7412	struct shrink_control *sc)
7413	{
7414	struct r5conf *conf = shrink->private_data;
7415	unsigned long ret = SHRINK_STOP;
7416
7417	if (mutex_trylock(&conf->cache_size_mutex)) {
7418	ret= `0`;
7419	while (ret < sc->nr_to_scan &&
7420	conf->max_nr_stripes > conf->min_nr_stripes) {
7421	if (drop_one_stripe(conf) == `0`) {
7422	ret = SHRINK_STOP;
7423	break;
7424	}
7425	ret++;
7426	}
7427	mutex_unlock(lock: &conf->cache_size_mutex);
7428	}
7429	return ret;
7430	}
7431
7432	static unsigned long raid5_cache_count(struct shrinker *shrink,
7433	struct shrink_control *sc)
7434	{
7435	struct r5conf *conf = shrink->private_data;
7436	int max_stripes = READ_ONCE(conf->max_nr_stripes);
7437	int min_stripes = READ_ONCE(conf->min_nr_stripes);
7438
7439	if (max_stripes < min_stripes)
7440	/ unlikely, but not impossible /
7441	return `0`;
7442	return max_stripes - min_stripes;
7443	}
7444
7445	static struct r5conf setup_conf(struct* mddev *mddev)
7446	{
7447	struct r5conf *conf;
7448	int raid_disk, memory, max_disks;
7449	struct md_rdev *rdev;
7450	struct disk_info *disk;
7451	char pers_name[`6`];
7452	int i;
7453	int group_cnt;
7454	struct r5worker_group *new_group;
7455	int ret = -ENOMEM;
7456
7457	if (mddev->new_level != `5`
7458	&& mddev->new_level != `4`
7459	&& mddev->new_level != `6`) {
7460	pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
7461	mdname(mddev), mddev->new_level);
7462	return ERR_PTR(error: -EIO);
7463	}
7464	if ((mddev->new_level == `5`
7465	&& !algorithm_valid_raid5(layout: mddev->new_layout)) \|\|
7466	(mddev->new_level == `6`
7467	&& !algorithm_valid_raid6(layout: mddev->new_layout))) {
7468	pr_warn("md/raid:%s: layout %d not supported\n",
7469	mdname(mddev), mddev->new_layout);
7470	return ERR_PTR(error: -EIO);
7471	}
7472	if (mddev->new_level == `6` && mddev->raid_disks < `4`) {
7473	pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
7474	mdname(mddev), mddev->raid_disks);
7475	return ERR_PTR(error: -EINVAL);
7476	}
7477
7478	if (!mddev->new_chunk_sectors \|\|
7479	(mddev->new_chunk_sectors << `9`) % PAGE_SIZE \|\|
7480	!is_power_of_2(n: mddev->new_chunk_sectors)) {
7481	pr_warn("md/raid:%s: invalid chunk size %d\n",
7482	mdname(mddev), mddev->new_chunk_sectors << `9`);
7483	return ERR_PTR(error: -EINVAL);
7484	}
7485
7486	conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
7487	if (conf == NULL)
7488	goto abort;
7489
7490	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
7491	conf->stripe_size = DEFAULT_STRIPE_SIZE;
7492	conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - `9`;
7493	conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> `9`;
7494	#endif
7495	INIT_LIST_HEAD(list: &conf->free_list);
7496	INIT_LIST_HEAD(list: &conf->pending_list);
7497	conf->pending_data = kcalloc(PENDING_IO_MAX,
7498	sizeof(struct r5pending_data),
7499	GFP_KERNEL);
7500	if (!conf->pending_data)
7501	goto abort;
7502	for (i = `0`; i < PENDING_IO_MAX; i++)
7503	list_add(new: &conf->pending_data[i].sibling, head: &conf->free_list);
7504	/ Don't enable multi-threading by default/
7505	if (!alloc_thread_groups(conf, cnt: `0`, group_cnt: &group_cnt, worker_groups: &new_group)) {
7506	conf->group_cnt = group_cnt;
7507	conf->worker_cnt_per_group = `0`;
7508	conf->worker_groups = new_group;
7509	} else
7510	goto abort;
7511	spin_lock_init(&conf->device_lock);
7512	seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
7513	mutex_init(&conf->cache_size_mutex);
7514
7515	init_waitqueue_head(&conf->wait_for_quiescent);
7516	init_waitqueue_head(&conf->wait_for_stripe);
7517	init_waitqueue_head(&conf->wait_for_reshape);
7518	INIT_LIST_HEAD(list: &conf->handle_list);
7519	INIT_LIST_HEAD(list: &conf->loprio_list);
7520	INIT_LIST_HEAD(list: &conf->hold_list);
7521	INIT_LIST_HEAD(list: &conf->delayed_list);
7522	INIT_LIST_HEAD(list: &conf->bitmap_list);
7523	init_llist_head(list: &conf->released_stripes);
7524	atomic_set(v: &conf->active_stripes, i: `0`);
7525	atomic_set(v: &conf->preread_active_stripes, i: `0`);
7526	atomic_set(v: &conf->active_aligned_reads, i: `0`);
7527	spin_lock_init(&conf->pending_bios_lock);
7528	conf->batch_bio_dispatch = true;
7529	rdev_for_each(rdev, mddev) {
7530	if (test_bit(Journal, &rdev->flags))
7531	continue;
7532	if (bdev_nonrot(bdev: rdev->bdev)) {
7533	conf->batch_bio_dispatch = false;
7534	break;
7535	}
7536	}
7537
7538	conf->bypass_threshold = BYPASS_THRESHOLD;
7539	conf->recovery_disabled = mddev->recovery_disabled - `1`;
7540
7541	conf->raid_disks = mddev->raid_disks;
7542	if (mddev->reshape_position == MaxSector)
7543	conf->previous_raid_disks = mddev->raid_disks;
7544	else
7545	conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
7546	max_disks = max(conf->raid_disks, conf->previous_raid_disks);
7547
7548	conf->disks = kcalloc(max_disks, sizeof(struct disk_info),
7549	GFP_KERNEL);
7550
7551	if (!conf->disks)
7552	goto abort;
7553
7554	for (i = `0`; i < max_disks; i++) {
7555	conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
7556	if (!conf->disks[i].extra_page)
7557	goto abort;
7558	}
7559
7560	ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, `0`, flags: `0`);
7561	if (ret)
7562	goto abort;
7563	conf->mddev = mddev;
7564
7565	ret = -ENOMEM;
7566	conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL);
7567	if (!conf->stripe_hashtbl)
7568	goto abort;
7569
7570	/ We init hash_locks[0] separately to that it can be used*
7571	* as the reference lock in the spin_lock_nest_lock() call
7572	* in lock_all_device_hash_locks_irq in order to convince
7573	* lockdep that we know what we are doing.
7574	*/
7575	spin_lock_init(conf->hash_locks);
7576	for (i = `1`; i < NR_STRIPE_HASH_LOCKS; i++)
7577	spin_lock_init(conf->hash_locks + i);
7578
7579	for (i = `0`; i < NR_STRIPE_HASH_LOCKS; i++)
7580	INIT_LIST_HEAD(list: conf->inactive_list + i);
7581
7582	for (i = `0`; i < NR_STRIPE_HASH_LOCKS; i++)
7583	INIT_LIST_HEAD(list: conf->temp_inactive_list + i);
7584
7585	atomic_set(v: &conf->r5c_cached_full_stripes, i: `0`);
7586	INIT_LIST_HEAD(list: &conf->r5c_full_stripe_list);
7587	atomic_set(v: &conf->r5c_cached_partial_stripes, i: `0`);
7588	INIT_LIST_HEAD(list: &conf->r5c_partial_stripe_list);
7589	atomic_set(v: &conf->r5c_flushing_full_stripes, i: `0`);
7590	atomic_set(v: &conf->r5c_flushing_partial_stripes, i: `0`);
7591
7592	conf->level = mddev->new_level;
7593	conf->chunk_sectors = mddev->new_chunk_sectors;
7594	ret = raid5_alloc_percpu(conf);
7595	if (ret)
7596	goto abort;
7597
7598	pr_debug("raid456: run(%s) called.\n", mdname(mddev));
7599
7600	ret = -EIO;
7601	rdev_for_each(rdev, mddev) {
7602	raid_disk = rdev->raid_disk;
7603	if (raid_disk >= max_disks
7604	\|\| raid_disk < `0` \|\| test_bit(Journal, &rdev->flags))
7605	continue;
7606	disk = conf->disks + raid_disk;
7607
7608	if (test_bit(Replacement, &rdev->flags)) {
7609	if (disk->replacement)
7610	goto abort;
7611	disk->replacement = rdev;
7612	} else {
7613	if (disk->rdev)
7614	goto abort;
7615	disk->rdev = rdev;
7616	}
7617
7618	if (test_bit(In_sync, &rdev->flags)) {
7619	pr_info("md/raid:%s: device %pg operational as raid disk %d\n",
7620	mdname(mddev), rdev->bdev, raid_disk);
7621	} else if (rdev->saved_raid_disk != raid_disk)
7622	/ Cannot rely on bitmap to complete recovery /
7623	conf->fullsync = `1`;
7624	}
7625
7626	conf->level = mddev->new_level;
7627	if (conf->level == `6`) {
7628	conf->max_degraded = `2`;
7629	if (raid6_call.xor_syndrome)
7630	conf->rmw_level = PARITY_ENABLE_RMW;
7631	else
7632	conf->rmw_level = PARITY_DISABLE_RMW;
7633	} else {
7634	conf->max_degraded = `1`;
7635	conf->rmw_level = PARITY_ENABLE_RMW;
7636	}
7637	conf->algorithm = mddev->new_layout;
7638	conf->reshape_progress = mddev->reshape_position;
7639	if (conf->reshape_progress != MaxSector) {
7640	conf->prev_chunk_sectors = mddev->chunk_sectors;
7641	conf->prev_algo = mddev->layout;
7642	} else {
7643	conf->prev_chunk_sectors = conf->chunk_sectors;
7644	conf->prev_algo = conf->algorithm;
7645	}
7646
7647	conf->min_nr_stripes = NR_STRIPES;
7648	if (mddev->reshape_position != MaxSector) {
7649	int stripes = max_t(int,
7650	((mddev->chunk_sectors << `9`) / RAID5_STRIPE_SIZE(conf)) * `4`,
7651	((mddev->new_chunk_sectors << `9`) / RAID5_STRIPE_SIZE(conf)) * `4`);
7652	conf->min_nr_stripes = max(NR_STRIPES, stripes);
7653	if (conf->min_nr_stripes != NR_STRIPES)
7654	pr_info("md/raid:%s: force stripe size %d for reshape\n",
7655	mdname(mddev), conf->min_nr_stripes);
7656	}
7657	memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
7658	max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / `1024`;
7659	atomic_set(v: &conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
7660	if (grow_stripes(conf, num: conf->min_nr_stripes)) {
7661	pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
7662	mdname(mddev), memory);
7663	ret = -ENOMEM;
7664	goto abort;
7665	} else
7666	pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
7667	/*
7668	* Losing a stripe head costs more than the time to refill it,
7669	* it reduces the queue depth and so can hurt throughput.
7670	* So set it rather large, scaled by number of devices.
7671	*/
7672	conf->shrinker = shrinker_alloc(flags: `0`, fmt: "md-raid5:%s", mdname(mddev));
7673	if (!conf->shrinker) {
7674	ret = -ENOMEM;
7675	pr_warn("md/raid:%s: couldn't allocate shrinker.\n",
7676	mdname(mddev));
7677	goto abort;
7678	}
7679
7680	conf->shrinker->seeks = DEFAULT_SEEKS * conf->raid_disks * `4`;
7681	conf->shrinker->scan_objects = raid5_cache_scan;
7682	conf->shrinker->count_objects = raid5_cache_count;
7683	conf->shrinker->batch = `128`;
7684	conf->shrinker->private_data = conf;
7685
7686	shrinker_register(shrinker: conf->shrinker);
7687
7688	sprintf(buf: pers_name, fmt: "raid%d", mddev->new_level);
7689	rcu_assign_pointer(conf->thread,
7690	md_register_thread(raid5d, mddev, pers_name));
7691	if (!conf->thread) {
7692	pr_warn("md/raid:%s: couldn't allocate thread.\n",
7693	mdname(mddev));
7694	ret = -ENOMEM;
7695	goto abort;
7696	}
7697
7698	return conf;
7699
7700	abort:
7701	if (conf)
7702	free_conf(conf);
7703	return ERR_PTR(error: ret);
7704	}
7705
7706	static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
7707	{
7708	switch (algo) {
7709	case ALGORITHM_PARITY_0:
7710	if (raid_disk < max_degraded)
7711	return `1`;
7712	break;
7713	case ALGORITHM_PARITY_N:
7714	if (raid_disk >= raid_disks - max_degraded)
7715	return `1`;
7716	break;
7717	case ALGORITHM_PARITY_0_6:
7718	if (raid_disk == `0` \|\|
7719	raid_disk == raid_disks - `1`)
7720	return `1`;
7721	break;
7722	case ALGORITHM_LEFT_ASYMMETRIC_6:
7723	case ALGORITHM_RIGHT_ASYMMETRIC_6:
7724	case ALGORITHM_LEFT_SYMMETRIC_6:
7725	case ALGORITHM_RIGHT_SYMMETRIC_6:
7726	if (raid_disk == raid_disks - `1`)
7727	return `1`;
7728	}
7729	return `0`;
7730	}
7731
7732	static int raid5_set_limits(struct mddev *mddev)
7733	{
7734	struct r5conf *conf = mddev->private;
7735	struct queue_limits lim;
7736	int data_disks, stripe;
7737	struct md_rdev *rdev;
7738
7739	/*
7740	* The read-ahead size must cover two whole stripes, which is
7741	* 2 * (datadisks) * chunksize where 'n' is the number of raid devices.
7742	*/
7743	data_disks = conf->previous_raid_disks - conf->max_degraded;
7744
7745	/*
7746	* We can only discard a whole stripe. It doesn't make sense to
7747	* discard data disk but write parity disk
7748	*/
7749	stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << `9`));
7750
7751	md_init_stacking_limits(lim: &lim);
7752	lim.logical_block_size = mddev->logical_block_size;
7753	lim.io_min = mddev->chunk_sectors << `9`;
7754	lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded);
7755	lim.features \|= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE;
7756	lim.discard_granularity = stripe;
7757	lim.max_write_zeroes_sectors = `0`;
7758	lim.max_hw_wzeroes_unmap_sectors = `0`;
7759	mddev_stack_rdev_limits(mddev, lim: &lim, flags: `0`);
7760	rdev_for_each(rdev, mddev)
7761	queue_limits_stack_bdev(t: &lim, bdev: rdev->bdev, offset: rdev->new_data_offset,
7762	pfx: mddev->gendisk->disk_name);
7763
7764	/*
7765	* Zeroing is required for discard, otherwise data could be lost.
7766	*
7767	* Consider a scenario: discard a stripe (the stripe could be
7768	* inconsistent if discard_zeroes_data is 0); write one disk of the
7769	* stripe (the stripe could be inconsistent again depending on which
7770	* disks are used to calculate parity); the disk is broken; The stripe
7771	* data of this disk is lost.
7772	*
7773	* We only allow DISCARD if the sysadmin has confirmed that only safe
7774	* devices are in use by setting a module parameter. A better idea
7775	* might be to turn DISCARD into WRITE_ZEROES requests, as that is
7776	* required to be safe.
7777	*/
7778	if (!devices_handle_discard_safely \|\|
7779	lim.max_discard_sectors < (stripe >> `9`) \|\|
7780	lim.discard_granularity < stripe)
7781	lim.max_hw_discard_sectors = `0`;
7782
7783	/*
7784	* Requests require having a bitmap for each stripe.
7785	* Limit the max sectors based on this.
7786	*/
7787	lim.max_hw_sectors = RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf);
7788
7789	/ No restrictions on the number of segments in the request /
7790	lim.max_segments = USHRT_MAX;
7791
7792	return queue_limits_set(q: mddev->gendisk->queue, lim: &lim);
7793	}
7794
7795	static int raid5_run(struct mddev *mddev)
7796	{
7797	struct r5conf *conf;
7798	int dirty_parity_disks = `0`;
7799	struct md_rdev *rdev;
7800	struct md_rdev *journal_dev = NULL;
7801	sector_t reshape_offset = `0`;
7802	int i;
7803	long long min_offset_diff = `0`;
7804	int first = `1`;
7805	int ret = -EIO;
7806
7807	if (mddev->resync_offset != MaxSector)
7808	pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
7809	mdname(mddev));
7810
7811	rdev_for_each(rdev, mddev) {
7812	long long diff;
7813
7814	if (test_bit(Journal, &rdev->flags)) {
7815	journal_dev = rdev;
7816	continue;
7817	}
7818	if (rdev->raid_disk < `0`)
7819	continue;
7820	diff = (rdev->new_data_offset - rdev->data_offset);
7821	if (first) {
7822	min_offset_diff = diff;
7823	first = `0`;
7824	} else if (mddev->reshape_backwards &&
7825	diff < min_offset_diff)
7826	min_offset_diff = diff;
7827	else if (!mddev->reshape_backwards &&
7828	diff > min_offset_diff)
7829	min_offset_diff = diff;
7830	}
7831
7832	if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) \|\| journal_dev) &&
7833	(mddev->bitmap_info.offset \|\| mddev->bitmap_info.file)) {
7834	pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
7835	mdname(mddev));
7836	return -EINVAL;
7837	}
7838
7839	if (mddev->reshape_position != MaxSector) {
7840	/ Check that we can continue the reshape.*
7841	* Difficulties arise if the stripe we would write to
7842	* next is at or after the stripe we would read from next.
7843	* For a reshape that changes the number of devices, this
7844	* is only possible for a very short time, and mdadm makes
7845	* sure that time appears to have past before assembling
7846	* the array. So we fail if that time hasn't passed.
7847	* For a reshape that keeps the number of devices the same
7848	* mdadm must be monitoring the reshape can keeping the
7849	* critical areas read-only and backed up. It will start
7850	* the array in read-only mode, so we check for that.
7851	*/
7852	sector_t here_new, here_old;
7853	int old_disks;
7854	int max_degraded = (mddev->level == `6` ? `2` : `1`);
7855	int chunk_sectors;
7856	int new_data_disks;
7857
7858	if (journal_dev) {
7859	pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
7860	mdname(mddev));
7861	return -EINVAL;
7862	}
7863
7864	if (mddev->new_level != mddev->level) {
7865	pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
7866	mdname(mddev));
7867	return -EINVAL;
7868	}
7869	old_disks = mddev->raid_disks - mddev->delta_disks;
7870	/ reshape_position must be on a new-stripe boundary, and one*
7871	* further up in new geometry must map after here in old
7872	* geometry.
7873	* If the chunk sizes are different, then as we perform reshape
7874	* in units of the largest of the two, reshape_position needs
7875	* be a multiple of the largest chunk size times new data disks.
7876	*/
7877	here_new = mddev->reshape_position;
7878	chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
7879	new_data_disks = mddev->raid_disks - max_degraded;
7880	if (sector_div(here_new, chunk_sectors * new_data_disks)) {
7881	pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
7882	mdname(mddev));
7883	return -EINVAL;
7884	}
7885	reshape_offset = here_new * chunk_sectors;
7886	/ here_new is the stripe we will write to /
7887	here_old = mddev->reshape_position;
7888	sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
7889	/ here_old is the first stripe that we might need to read*
7890	* from */
7891	if (mddev->delta_disks == `0`) {
7892	/ We cannot be sure it is safe to start an in-place*
7893	* reshape. It is only safe if user-space is monitoring
7894	* and taking constant backups.
7895	* mdadm always starts a situation like this in
7896	* readonly mode so it can take control before
7897	* allowing any writes. So just check for that.
7898	*/
7899	if (abs(min_offset_diff) >= mddev->chunk_sectors &&
7900	abs(min_offset_diff) >= mddev->new_chunk_sectors)
7901	/ not really in-place - so OK /;
7902	else if (mddev->ro == `0`) {
7903	pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
7904	mdname(mddev));
7905	return -EINVAL;
7906	}
7907	} else if (mddev->reshape_backwards
7908	? (here_new * chunk_sectors + min_offset_diff <=
7909	here_old * chunk_sectors)
7910	: (here_new * chunk_sectors >=
7911	here_old * chunk_sectors + (-min_offset_diff))) {
7912	/ Reading from the same stripe as writing to - bad /
7913	pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
7914	mdname(mddev));
7915	return -EINVAL;
7916	}
7917	pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
7918	/ OK, we should be able to continue; /
7919	} else {
7920	BUG_ON(mddev->level != mddev->new_level);
7921	BUG_ON(mddev->layout != mddev->new_layout);
7922	BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
7923	BUG_ON(mddev->delta_disks != `0`);
7924	}
7925
7926	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
7927	test_bit(MD_HAS_PPL, &mddev->flags)) {
7928	pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
7929	mdname(mddev));
7930	clear_bit(nr: MD_HAS_PPL, addr: &mddev->flags);
7931	clear_bit(nr: MD_HAS_MULTIPLE_PPLS, addr: &mddev->flags);
7932	}
7933
7934	if (mddev->private == NULL)
7935	conf = setup_conf(mddev);
7936	else
7937	conf = mddev->private;
7938
7939	if (IS_ERR(ptr: conf))
7940	return PTR_ERR(ptr: conf);
7941
7942	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
7943	if (!journal_dev) {
7944	pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
7945	mdname(mddev));
7946	mddev->ro = `1`;
7947	set_disk_ro(disk: mddev->gendisk, read_only: `1`);
7948	} else if (mddev->resync_offset == MaxSector)
7949	set_bit(nr: MD_JOURNAL_CLEAN, addr: &mddev->flags);
7950	}
7951
7952	conf->min_offset_diff = min_offset_diff;
7953	rcu_assign_pointer(mddev->thread, conf->thread);
7954	rcu_assign_pointer(conf->thread, NULL);
7955	mddev->private = conf;
7956
7957	for (i = `0`; i < conf->raid_disks && conf->previous_raid_disks;
7958	i++) {
7959	rdev = conf->disks[i].rdev;
7960	if (!rdev)
7961	continue;
7962	if (conf->disks[i].replacement &&
7963	conf->reshape_progress != MaxSector) {
7964	/ replacements and reshape simply do not mix. /
7965	pr_warn("md: cannot handle concurrent replacement and reshape.\n");
7966	goto abort;
7967	}
7968	if (test_bit(In_sync, &rdev->flags))
7969	continue;
7970	/ This disc is not fully in-sync. However if it*
7971	* just stored parity (beyond the recovery_offset),
7972	* when we don't need to be concerned about the
7973	* array being dirty.
7974	* When reshape goes 'backwards', we never have
7975	* partially completed devices, so we only need
7976	* to worry about reshape going forwards.
7977	*/
7978	/ Hack because v0.91 doesn't store recovery_offset properly. /
7979	if (mddev->major_version == `0` &&
7980	mddev->minor_version > `90`)
7981	rdev->recovery_offset = reshape_offset;
7982
7983	if (rdev->recovery_offset < reshape_offset) {
7984	/ We need to check old and new layout /
7985	if (!only_parity(raid_disk: rdev->raid_disk,
7986	algo: conf->algorithm,
7987	raid_disks: conf->raid_disks,
7988	max_degraded: conf->max_degraded))
7989	continue;
7990	}
7991	if (!only_parity(raid_disk: rdev->raid_disk,
7992	algo: conf->prev_algo,
7993	raid_disks: conf->previous_raid_disks,
7994	max_degraded: conf->max_degraded))
7995	continue;
7996	dirty_parity_disks++;
7997	}
7998
7999	/*
8000	* 0 for a fully functional array, 1 or 2 for a degraded array.
8001	*/
8002	mddev->degraded = raid5_calc_degraded(conf);
8003
8004	if (has_failed(conf)) {
8005	pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
8006	mdname(mddev), mddev->degraded, conf->raid_disks);
8007	goto abort;
8008	}
8009
8010	/ device size must be a multiple of chunk size /
8011	mddev->dev_sectors &= ~((sector_t)mddev->chunk_sectors - `1`);
8012	mddev->resync_max_sectors = mddev->dev_sectors;
8013
8014	if (mddev->degraded > dirty_parity_disks &&
8015	mddev->resync_offset != MaxSector) {
8016	if (test_bit(MD_HAS_PPL, &mddev->flags))
8017	pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
8018	mdname(mddev));
8019	else if (mddev->ok_start_degraded)
8020	pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
8021	mdname(mddev));
8022	else {
8023	pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
8024	mdname(mddev));
8025	goto abort;
8026	}
8027	}
8028
8029	pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
8030	mdname(mddev), conf->level,
8031	mddev->raid_disks-mddev->degraded, mddev->raid_disks,
8032	mddev->new_layout);
8033
8034	print_raid5_conf(conf);
8035
8036	if (conf->reshape_progress != MaxSector) {
8037	conf->reshape_safe = conf->reshape_progress;
8038	atomic_set(v: &conf->reshape_stripes, i: `0`);
8039	clear_bit(nr: MD_RECOVERY_SYNC, addr: &mddev->recovery);
8040	clear_bit(nr: MD_RECOVERY_CHECK, addr: &mddev->recovery);
8041	set_bit(nr: MD_RECOVERY_RESHAPE, addr: &mddev->recovery);
8042	set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery);
8043	}
8044
8045	/ Ok, everything is just fine now /
8046	if (mddev->to_remove == &raid5_attrs_group)
8047	mddev->to_remove = NULL;
8048	else if (mddev->kobj.sd &&
8049	sysfs_create_group(kobj: &mddev->kobj, grp: &raid5_attrs_group))
8050	pr_warn("raid5: failed to create sysfs attributes for %s\n",
8051	mdname(mddev));
8052	md_set_array_sectors(mddev, array_sectors: raid5_size(mddev, sectors: `0`, raid_disks: `0`));
8053
8054	if (!mddev_is_dm(mddev)) {
8055	ret = raid5_set_limits(mddev);
8056	if (ret)
8057	goto abort;
8058	}
8059
8060	if (log_init(conf, journal_dev, ppl: raid5_has_ppl(conf)))
8061	goto abort;
8062
8063	return `0`;
8064	abort:
8065	md_unregister_thread(mddev, threadp: &mddev->thread);
8066	print_raid5_conf(conf);
8067	free_conf(conf);
8068	mddev->private = NULL;
8069	pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
8070	return ret;
8071	}
8072
8073	static void raid5_free(struct mddev mddev, void* *priv)
8074	{
8075	struct r5conf *conf = priv;
8076
8077	free_conf(conf);
8078	mddev->to_remove = &raid5_attrs_group;
8079	}
8080
8081	static void raid5_status(struct seq_file seq, struct* mddev *mddev)
8082	{
8083	struct r5conf *conf = mddev->private;
8084	int i;
8085
8086	lockdep_assert_held(&mddev->lock);
8087
8088	seq_printf(m: seq, fmt: " level %d, %dk chunk, algorithm %d", mddev->level,
8089	conf->chunk_sectors / `2`, mddev->layout);
8090	seq_printf (m: seq, fmt: " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
8091	for (i = `0`; i < conf->raid_disks; i++) {
8092	struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
8093
8094	seq_printf (m: seq, fmt: "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
8095	}
8096	seq_printf (m: seq, fmt: "]");
8097	}
8098
8099	static void print_raid5_conf(struct r5conf *conf)
8100	{
8101	struct md_rdev *rdev;
8102	int i;
8103
8104	pr_debug("RAID conf printout:\n");
8105	if (!conf) {
8106	pr_debug("(conf==NULL)\n");
8107	return;
8108	}
8109	pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
8110	conf->raid_disks,
8111	conf->raid_disks - conf->mddev->degraded);
8112
8113	for (i = `0`; i < conf->raid_disks; i++) {
8114	rdev = conf->disks[i].rdev;
8115	if (rdev)
8116	pr_debug(" disk %d, o:%d, dev:%pg\n",
8117	i, !test_bit(Faulty, &rdev->flags),
8118	rdev->bdev);
8119	}
8120	}
8121
8122	static int raid5_spare_active(struct mddev *mddev)
8123	{
8124	int i;
8125	struct r5conf *conf = mddev->private;
8126	struct md_rdev rdev, replacement;
8127	int count = `0`;
8128	unsigned long flags;
8129
8130	for (i = `0`; i < conf->raid_disks; i++) {
8131	rdev = conf->disks[i].rdev;
8132	replacement = conf->disks[i].replacement;
8133	if (replacement
8134	&& replacement->recovery_offset == MaxSector
8135	&& !test_bit(Faulty, &replacement->flags)
8136	&& !test_and_set_bit(nr: In_sync, addr: &replacement->flags)) {
8137	/ Replacement has just become active. /
8138	if (!rdev
8139	\|\| !test_and_clear_bit(nr: In_sync, addr: &rdev->flags))
8140	count++;
8141	if (rdev) {
8142	/ Replaced device not technically faulty,*
8143	* but we need to be sure it gets removed
8144	* and never re-added.
8145	*/
8146	set_bit(nr: Faulty, addr: &rdev->flags);
8147	sysfs_notify_dirent_safe(
8148	sd: rdev->sysfs_state);
8149	}
8150	sysfs_notify_dirent_safe(sd: replacement->sysfs_state);
8151	} else if (rdev
8152	&& rdev->recovery_offset == MaxSector
8153	&& !test_bit(Faulty, &rdev->flags)
8154	&& !test_and_set_bit(nr: In_sync, addr: &rdev->flags)) {
8155	count++;
8156	sysfs_notify_dirent_safe(sd: rdev->sysfs_state);
8157	}
8158	}
8159	spin_lock_irqsave(&conf->device_lock, flags);
8160	mddev->degraded = raid5_calc_degraded(conf);
8161	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
8162	print_raid5_conf(conf);
8163	return count;
8164	}
8165
8166	static int raid5_remove_disk(struct mddev mddev, struct* md_rdev *rdev)
8167	{
8168	struct r5conf *conf = mddev->private;
8169	int err = `0`;
8170	int number = rdev->raid_disk;
8171	struct md_rdev **rdevp;
8172	struct disk_info *p;
8173	struct md_rdev *tmp;
8174
8175	print_raid5_conf(conf);
8176	if (test_bit(Journal, &rdev->flags) && conf->log) {
8177	/*
8178	* we can't wait pending write here, as this is called in
8179	* raid5d, wait will deadlock.
8180	* neilb: there is no locking about new writes here,
8181	* so this cannot be safe.
8182	*/
8183	if (atomic_read(v: &conf->active_stripes) \|\|
8184	atomic_read(v: &conf->r5c_cached_full_stripes) \|\|
8185	atomic_read(v: &conf->r5c_cached_partial_stripes)) {
8186	return -EBUSY;
8187	}
8188	log_exit(conf);
8189	return `0`;
8190	}
8191	if (unlikely(number >= conf->pool_size))
8192	return `0`;
8193	p = conf->disks + number;
8194	if (rdev == p->rdev)
8195	rdevp = &p->rdev;
8196	else if (rdev == p->replacement)
8197	rdevp = &p->replacement;
8198	else
8199	return `0`;
8200
8201	if (number >= conf->raid_disks &&
8202	conf->reshape_progress == MaxSector)
8203	clear_bit(nr: In_sync, addr: &rdev->flags);
8204
8205	if (test_bit(In_sync, &rdev->flags) \|\|
8206	atomic_read(v: &rdev->nr_pending)) {
8207	err = -EBUSY;
8208	goto abort;
8209	}
8210	/ Only remove non-faulty devices if recovery*
8211	* isn't possible.
8212	*/
8213	if (!test_bit(Faulty, &rdev->flags) &&
8214	mddev->recovery_disabled != conf->recovery_disabled &&
8215	!has_failed(conf) &&
8216	(!p->replacement \|\| p->replacement == rdev) &&
8217	number < conf->raid_disks) {
8218	err = -EBUSY;
8219	goto abort;
8220	}
8221	WRITE_ONCE(*rdevp, NULL);
8222	if (!err) {
8223	err = log_modify(conf, rdev, add: false);
8224	if (err)
8225	goto abort;
8226	}
8227
8228	tmp = p->replacement;
8229	if (tmp) {
8230	/ We must have just cleared 'rdev' /
8231	WRITE_ONCE(p->rdev, tmp);
8232	clear_bit(nr: Replacement, addr: &tmp->flags);
8233	WRITE_ONCE(p->replacement, NULL);
8234
8235	if (!err)
8236	err = log_modify(conf, rdev: tmp, add: true);
8237	}
8238
8239	clear_bit(nr: WantReplacement, addr: &rdev->flags);
8240	abort:
8241
8242	print_raid5_conf(conf);
8243	return err;
8244	}
8245
8246	static int raid5_add_disk(struct mddev mddev, struct* md_rdev *rdev)
8247	{
8248	struct r5conf *conf = mddev->private;
8249	int ret, err = -EEXIST;
8250	int disk;
8251	struct disk_info *p;
8252	struct md_rdev *tmp;
8253	int first = `0`;
8254	int last = conf->raid_disks - `1`;
8255
8256	if (test_bit(Journal, &rdev->flags)) {
8257	if (conf->log)
8258	return -EBUSY;
8259
8260	rdev->raid_disk = `0`;
8261	/*
8262	* The array is in readonly mode if journal is missing, so no
8263	* write requests running. We should be safe
8264	*/
8265	ret = log_init(conf, journal_dev: rdev, ppl: false);
8266	if (ret)
8267	return ret;
8268
8269	ret = r5l_start(log: conf->log);
8270	if (ret)
8271	return ret;
8272
8273	return `0`;
8274	}
8275	if (mddev->recovery_disabled == conf->recovery_disabled)
8276	return -EBUSY;
8277
8278	if (rdev->saved_raid_disk < `0` && has_failed(conf))
8279	/ no point adding a device /
8280	return -EINVAL;
8281
8282	if (rdev->raid_disk >= `0`)
8283	first = last = rdev->raid_disk;
8284
8285	/*
8286	* find the disk ... but prefer rdev->saved_raid_disk
8287	* if possible.
8288	*/
8289	if (rdev->saved_raid_disk >= first &&
8290	rdev->saved_raid_disk <= last &&
8291	conf->disks[rdev->saved_raid_disk].rdev == NULL)
8292	first = rdev->saved_raid_disk;
8293
8294	for (disk = first; disk <= last; disk++) {
8295	p = conf->disks + disk;
8296	if (p->rdev == NULL) {
8297	clear_bit(nr: In_sync, addr: &rdev->flags);
8298	rdev->raid_disk = disk;
8299	if (rdev->saved_raid_disk != disk)
8300	conf->fullsync = `1`;
8301	WRITE_ONCE(p->rdev, rdev);
8302
8303	err = log_modify(conf, rdev, add: true);
8304
8305	goto out;
8306	}
8307	}
8308	for (disk = first; disk <= last; disk++) {
8309	p = conf->disks + disk;
8310	tmp = p->rdev;
8311	if (test_bit(WantReplacement, &tmp->flags) &&
8312	mddev->reshape_position == MaxSector &&
8313	p->replacement == NULL) {
8314	clear_bit(nr: In_sync, addr: &rdev->flags);
8315	set_bit(nr: Replacement, addr: &rdev->flags);
8316	rdev->raid_disk = disk;
8317	err = `0`;
8318	conf->fullsync = `1`;
8319	WRITE_ONCE(p->replacement, rdev);
8320	break;
8321	}
8322	}
8323	out:
8324	print_raid5_conf(conf);
8325	return err;
8326	}
8327
8328	static int raid5_resize(struct mddev *mddev, sector_t sectors)
8329	{
8330	/ no resync is happening, and there is enough space*
8331	* on all devices, so we can resize.
8332	* We need to make sure resync covers any new space.
8333	* If the array is shrinking we should possibly wait until
8334	* any io in the removed space completes, but it hardly seems
8335	* worth it.
8336	*/
8337	sector_t newsize;
8338	struct r5conf *conf = mddev->private;
8339
8340	if (raid5_has_log(conf) \|\| raid5_has_ppl(conf))
8341	return -EINVAL;
8342	sectors &= ~((sector_t)conf->chunk_sectors - `1`);
8343	newsize = raid5_size(mddev, sectors, raid_disks: mddev->raid_disks);
8344	if (mddev->external_size &&
8345	mddev->array_sectors > newsize)
8346	return -EINVAL;
8347
8348	if (md_bitmap_enabled(mddev, flush: false)) {
8349	int ret = mddev->bitmap_ops->resize(mddev, sectors, `0`);
8350
8351	if (ret)
8352	return ret;
8353	}
8354
8355	md_set_array_sectors(mddev, array_sectors: newsize);
8356	if (sectors > mddev->dev_sectors &&
8357	mddev->resync_offset > mddev->dev_sectors) {
8358	mddev->resync_offset = mddev->dev_sectors;
8359	set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery);
8360	}
8361	mddev->dev_sectors = sectors;
8362	mddev->resync_max_sectors = sectors;
8363	return `0`;
8364	}
8365
8366	static int check_stripe_cache(struct mddev *mddev)
8367	{
8368	/ Can only proceed if there are plenty of stripe_heads.*
8369	* We need a minimum of one full stripe,, and for sensible progress
8370	* it is best to have about 4 times that.
8371	* If we require 4 times, then the default 256 4K stripe_heads will
8372	* allow for chunk sizes up to 256K, which is probably OK.
8373	* If the chunk size is greater, user-space should request more
8374	* stripe_heads first.
8375	*/
8376	struct r5conf *conf = mddev->private;
8377	if (((mddev->chunk_sectors << `9`) / RAID5_STRIPE_SIZE(conf)) * `4`
8378	> conf->min_nr_stripes \|\|
8379	((mddev->new_chunk_sectors << `9`) / RAID5_STRIPE_SIZE(conf)) * `4`
8380	> conf->min_nr_stripes) {
8381	pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
8382	mdname(mddev),
8383	((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << `9`)
8384	/ RAID5_STRIPE_SIZE(conf))*`4`);
8385	return `0`;
8386	}
8387	return `1`;
8388	}
8389
8390	static int check_reshape(struct mddev *mddev)
8391	{
8392	struct r5conf *conf = mddev->private;
8393
8394	if (raid5_has_log(conf) \|\| raid5_has_ppl(conf))
8395	return -EINVAL;
8396	if (mddev->delta_disks == `0` &&
8397	mddev->new_layout == mddev->layout &&
8398	mddev->new_chunk_sectors == mddev->chunk_sectors)
8399	return `0`; / nothing to do /
8400	if (has_failed(conf))
8401	return -EINVAL;
8402	if (mddev->delta_disks < `0` && mddev->reshape_position == MaxSector) {
8403	/ We might be able to shrink, but the devices must*
8404	* be made bigger first.
8405	* For raid6, 4 is the minimum size.
8406	* Otherwise 2 is the minimum
8407	*/
8408	int min = `2`;
8409	if (mddev->level == `6`)
8410	min = `4`;
8411	if (mddev->raid_disks + mddev->delta_disks < min)
8412	return -EINVAL;
8413	}
8414
8415	if (!check_stripe_cache(mddev))
8416	return -ENOSPC;
8417
8418	if (mddev->new_chunk_sectors > mddev->chunk_sectors \|\|
8419	mddev->delta_disks > `0`)
8420	if (resize_chunks(conf,
8421	new_disks: conf->previous_raid_disks
8422	+ max(`0`, mddev->delta_disks),
8423	max(mddev->new_chunk_sectors,
8424	mddev->chunk_sectors)
8425	) < `0`)
8426	return -ENOMEM;
8427
8428	if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
8429	return `0`; / never bother to shrink /
8430	return resize_stripes(conf, newsize: (conf->previous_raid_disks
8431	+ mddev->delta_disks));
8432	}
8433
8434	static int raid5_start_reshape(struct mddev *mddev)
8435	{
8436	struct r5conf *conf = mddev->private;
8437	struct md_rdev *rdev;
8438	int spares = `0`;
8439	int i;
8440	unsigned long flags;
8441
8442	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
8443	return -EBUSY;
8444
8445	if (!check_stripe_cache(mddev))
8446	return -ENOSPC;
8447
8448	if (has_failed(conf))
8449	return -EINVAL;
8450
8451	/ raid5 can't handle concurrent reshape and recovery /
8452	if (mddev->resync_offset < MaxSector)
8453	return -EBUSY;
8454	for (i = `0`; i < conf->raid_disks; i++)
8455	if (conf->disks[i].replacement)
8456	return -EBUSY;
8457
8458	rdev_for_each(rdev, mddev) {
8459	if (!test_bit(In_sync, &rdev->flags)
8460	&& !test_bit(Faulty, &rdev->flags))
8461	spares++;
8462	}
8463
8464	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
8465	/ Not enough devices even to make a degraded array*
8466	* of that size
8467	*/
8468	return -EINVAL;
8469
8470	/ Refuse to reduce size of the array. Any reductions in*
8471	* array size must be through explicit setting of array_size
8472	* attribute.
8473	*/
8474	if (raid5_size(mddev, sectors: `0`, raid_disks: conf->raid_disks + mddev->delta_disks)
8475	< mddev->array_sectors) {
8476	pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
8477	mdname(mddev));
8478	return -EINVAL;
8479	}
8480
8481	atomic_set(v: &conf->reshape_stripes, i: `0`);
8482	spin_lock_irq(lock: &conf->device_lock);
8483	write_seqcount_begin(&conf->gen_lock);
8484	conf->previous_raid_disks = conf->raid_disks;
8485	conf->raid_disks += mddev->delta_disks;
8486	conf->prev_chunk_sectors = conf->chunk_sectors;
8487	conf->chunk_sectors = mddev->new_chunk_sectors;
8488	conf->prev_algo = conf->algorithm;
8489	conf->algorithm = mddev->new_layout;
8490	conf->generation++;
8491	/ Code that selects data_offset needs to see the generation update*
8492	* if reshape_progress has been set - so a memory barrier needed.
8493	*/
8494	smp_mb();
8495	if (mddev->reshape_backwards)
8496	conf->reshape_progress = raid5_size(mddev, sectors: `0`, raid_disks: `0`);
8497	else
8498	conf->reshape_progress = `0`;
8499	conf->reshape_safe = conf->reshape_progress;
8500	write_seqcount_end(&conf->gen_lock);
8501	spin_unlock_irq(lock: &conf->device_lock);
8502
8503	/ Now make sure any requests that proceeded on the assumption*
8504	* the reshape wasn't running - like Discard or Read - have
8505	* completed.
8506	*/
8507	raid5_quiesce(mddev, quiesce: true);
8508	raid5_quiesce(mddev, quiesce: false);
8509
8510	/ Add some new drives, as many as will fit.*
8511	* We know there are enough to make the newly sized array work.
8512	* Don't add devices if we are reducing the number of
8513	* devices in the array. This is because it is not possible
8514	* to correctly record the "partially reconstructed" state of
8515	* such devices during the reshape and confusion could result.
8516	*/
8517	if (mddev->delta_disks >= `0`) {
8518	rdev_for_each(rdev, mddev)
8519	if (rdev->raid_disk < `0` &&
8520	!test_bit(Faulty, &rdev->flags)) {
8521	if (raid5_add_disk(mddev, rdev) == `0`) {
8522	if (rdev->raid_disk
8523	>= conf->previous_raid_disks)
8524	set_bit(nr: In_sync, addr: &rdev->flags);
8525	else
8526	rdev->recovery_offset = `0`;
8527
8528	/ Failure here is OK /
8529	sysfs_link_rdev(mddev, rdev);
8530	}
8531	} else if (rdev->raid_disk >= conf->previous_raid_disks
8532	&& !test_bit(Faulty, &rdev->flags)) {
8533	/ This is a spare that was manually added /
8534	set_bit(nr: In_sync, addr: &rdev->flags);
8535	}
8536
8537	/ When a reshape changes the number of devices,*
8538	* ->degraded is measured against the larger of the
8539	* pre and post number of devices.
8540	*/
8541	spin_lock_irqsave(&conf->device_lock, flags);
8542	mddev->degraded = raid5_calc_degraded(conf);
8543	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
8544	}
8545	mddev->raid_disks = conf->raid_disks;
8546	mddev->reshape_position = conf->reshape_progress;
8547	set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags);
8548
8549	clear_bit(nr: MD_RECOVERY_SYNC, addr: &mddev->recovery);
8550	clear_bit(nr: MD_RECOVERY_CHECK, addr: &mddev->recovery);
8551	clear_bit(nr: MD_RECOVERY_DONE, addr: &mddev->recovery);
8552	set_bit(nr: MD_RECOVERY_RESHAPE, addr: &mddev->recovery);
8553	set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery);
8554	conf->reshape_checkpoint = jiffies;
8555	md_new_event();
8556	return `0`;
8557	}
8558
8559	/ This is called from the reshape thread and should make any*
8560	* changes needed in 'conf'
8561	*/
8562	static void end_reshape(struct r5conf *conf)
8563	{
8564
8565	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
8566	struct md_rdev *rdev;
8567
8568	spin_lock_irq(lock: &conf->device_lock);
8569	conf->previous_raid_disks = conf->raid_disks;
8570	md_finish_reshape(mddev: conf->mddev);
8571	smp_wmb();
8572	conf->reshape_progress = MaxSector;
8573	conf->mddev->reshape_position = MaxSector;
8574	rdev_for_each(rdev, conf->mddev)
8575	if (rdev->raid_disk >= `0` &&
8576	!test_bit(Journal, &rdev->flags) &&
8577	!test_bit(In_sync, &rdev->flags))
8578	rdev->recovery_offset = MaxSector;
8579	spin_unlock_irq(lock: &conf->device_lock);
8580	wake_up(&conf->wait_for_reshape);
8581
8582	mddev_update_io_opt(mddev: conf->mddev,
8583	nr_stripes: conf->raid_disks - conf->max_degraded);
8584	}
8585	}
8586
8587	/ This is called from the raid5d thread with mddev_lock held.*
8588	* It makes config changes to the device.
8589	*/
8590	static void raid5_finish_reshape(struct mddev *mddev)
8591	{
8592	struct r5conf *conf = mddev->private;
8593	struct md_rdev *rdev;
8594
8595	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8596
8597	if (mddev->delta_disks <= `0`) {
8598	int d;
8599	spin_lock_irq(lock: &conf->device_lock);
8600	mddev->degraded = raid5_calc_degraded(conf);
8601	spin_unlock_irq(lock: &conf->device_lock);
8602	for (d = conf->raid_disks ;
8603	d < conf->raid_disks - mddev->delta_disks;
8604	d++) {
8605	rdev = conf->disks[d].rdev;
8606	if (rdev)
8607	clear_bit(nr: In_sync, addr: &rdev->flags);
8608	rdev = conf->disks[d].replacement;
8609	if (rdev)
8610	clear_bit(nr: In_sync, addr: &rdev->flags);
8611	}
8612	}
8613	mddev->layout = conf->algorithm;
8614	mddev->chunk_sectors = conf->chunk_sectors;
8615	mddev->reshape_position = MaxSector;
8616	mddev->delta_disks = `0`;
8617	mddev->reshape_backwards = `0`;
8618	}
8619	}
8620
8621	static void raid5_quiesce(struct mddev mddev, int* quiesce)
8622	{
8623	struct r5conf *conf = mddev->private;
8624
8625	if (quiesce) {
8626	/ stop all writes /
8627	lock_all_device_hash_locks_irq(conf);
8628	/ '2' tells resync/reshape to pause so that all*
8629	* active stripes can drain
8630	*/
8631	r5c_flush_cache(conf, INT_MAX);
8632	/ need a memory barrier to make sure read_one_chunk() sees*
8633	* quiesce started and reverts to slow (locked) path.
8634	*/
8635	smp_store_release(&conf->quiesce, `2`);
8636	wait_event_cmd(conf->wait_for_quiescent,
8637	atomic_read(&conf->active_stripes) == `0` &&
8638	atomic_read(&conf->active_aligned_reads) == `0`,
8639	unlock_all_device_hash_locks_irq(conf),
8640	lock_all_device_hash_locks_irq(conf));
8641	conf->quiesce = `1`;
8642	unlock_all_device_hash_locks_irq(conf);
8643	/ allow reshape to continue /
8644	wake_up(&conf->wait_for_reshape);
8645	} else {
8646	/ re-enable writes /
8647	lock_all_device_hash_locks_irq(conf);
8648	conf->quiesce = `0`;
8649	wake_up(&conf->wait_for_quiescent);
8650	wake_up(&conf->wait_for_reshape);
8651	unlock_all_device_hash_locks_irq(conf);
8652	}
8653	log_quiesce(conf, quiesce);
8654	}
8655
8656	static void raid45_takeover_raid0(struct* mddev mddev, int* level)
8657	{
8658	struct r0conf *raid0_conf = mddev->private;
8659	sector_t sectors;
8660
8661	/ for raid0 takeover only one zone is supported /
8662	if (raid0_conf->nr_strip_zones > `1`) {
8663	pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
8664	mdname(mddev));
8665	return ERR_PTR(error: -EINVAL);
8666	}
8667
8668	sectors = raid0_conf->strip_zone[`0`].zone_end;
8669	sector_div(sectors, raid0_conf->strip_zone[`0`].nb_dev);
8670	mddev->dev_sectors = sectors;
8671	mddev->new_level = level;
8672	mddev->new_layout = ALGORITHM_PARITY_N;
8673	mddev->new_chunk_sectors = mddev->chunk_sectors;
8674	mddev->raid_disks += `1`;
8675	mddev->delta_disks = `1`;
8676	/ make sure it will be not marked as dirty /
8677	mddev->resync_offset = MaxSector;
8678
8679	return setup_conf(mddev);
8680	}
8681
8682	static void raid5_takeover_raid1(struct* mddev *mddev)
8683	{
8684	int chunksect;
8685	void *ret;
8686
8687	if (mddev->raid_disks != `2` \|\|
8688	mddev->degraded > `1`)
8689	return ERR_PTR(error: -EINVAL);
8690
8691	/ Should check if there are write-behind devices? /
8692
8693	chunksect = `64``2`; /* 64K by default /
8694
8695	/ The array must be an exact multiple of chunksize /
8696	while (chunksect && (mddev->array_sectors & (chunksect-`1`)))
8697	chunksect >>= `1`;
8698
8699	if ((chunksect<<`9`) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private))
8700	/ array size does not allow a suitable chunk size /
8701	return ERR_PTR(error: -EINVAL);
8702
8703	mddev->new_level = `5`;
8704	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
8705	mddev->new_chunk_sectors = chunksect;
8706
8707	ret = setup_conf(mddev);
8708	if (!IS_ERR(ptr: ret))
8709	mddev_clear_unsupported_flags(mddev,
8710	UNSUPPORTED_MDDEV_FLAGS);
8711	return ret;
8712	}
8713
8714	static void raid5_takeover_raid6(struct* mddev *mddev)
8715	{
8716	int new_layout;
8717
8718	switch (mddev->layout) {
8719	case ALGORITHM_LEFT_ASYMMETRIC_6:
8720	new_layout = ALGORITHM_LEFT_ASYMMETRIC;
8721	break;
8722	case ALGORITHM_RIGHT_ASYMMETRIC_6:
8723	new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
8724	break;
8725	case ALGORITHM_LEFT_SYMMETRIC_6:
8726	new_layout = ALGORITHM_LEFT_SYMMETRIC;
8727	break;
8728	case ALGORITHM_RIGHT_SYMMETRIC_6:
8729	new_layout = ALGORITHM_RIGHT_SYMMETRIC;
8730	break;
8731	case ALGORITHM_PARITY_0_6:
8732	new_layout = ALGORITHM_PARITY_0;
8733	break;
8734	case ALGORITHM_PARITY_N:
8735	new_layout = ALGORITHM_PARITY_N;
8736	break;
8737	default:
8738	return ERR_PTR(error: -EINVAL);
8739	}
8740	mddev->new_level = `5`;
8741	mddev->new_layout = new_layout;
8742	mddev->delta_disks = -`1`;
8743	mddev->raid_disks -= `1`;
8744	return setup_conf(mddev);
8745	}
8746
8747	static int raid5_check_reshape(struct mddev *mddev)
8748	{
8749	/ For a 2-drive array, the layout and chunk size can be changed*
8750	* immediately as not restriping is needed.
8751	* For larger arrays we record the new value - after validation
8752	* to be used by a reshape pass.
8753	*/
8754	struct r5conf *conf = mddev->private;
8755	int new_chunk = mddev->new_chunk_sectors;
8756
8757	if (mddev->new_layout >= `0` && !algorithm_valid_raid5(layout: mddev->new_layout))
8758	return -EINVAL;
8759	if (new_chunk > `0`) {
8760	if (!is_power_of_2(n: new_chunk))
8761	return -EINVAL;
8762	if (new_chunk < (PAGE_SIZE>>`9`))
8763	return -EINVAL;
8764	if (mddev->array_sectors & (new_chunk-`1`))
8765	/ not factor of array size /
8766	return -EINVAL;
8767	}
8768
8769	/ They look valid /
8770
8771	if (mddev->raid_disks == `2`) {
8772	/ can make the change immediately /
8773	if (mddev->new_layout >= `0`) {
8774	conf->algorithm = mddev->new_layout;
8775	mddev->layout = mddev->new_layout;
8776	}
8777	if (new_chunk > `0`) {
8778	conf->chunk_sectors = new_chunk ;
8779	mddev->chunk_sectors = new_chunk;
8780	}
8781	set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags);
8782	md_wakeup_thread(mddev->thread);
8783	}
8784	return check_reshape(mddev);
8785	}
8786
8787	static int raid6_check_reshape(struct mddev *mddev)
8788	{
8789	int new_chunk = mddev->new_chunk_sectors;
8790
8791	if (mddev->new_layout >= `0` && !algorithm_valid_raid6(layout: mddev->new_layout))
8792	return -EINVAL;
8793	if (new_chunk > `0`) {
8794	if (!is_power_of_2(n: new_chunk))
8795	return -EINVAL;
8796	if (new_chunk < (PAGE_SIZE >> `9`))
8797	return -EINVAL;
8798	if (mddev->array_sectors & (new_chunk-`1`))
8799	/ not factor of array size /
8800	return -EINVAL;
8801	}
8802
8803	/ They look valid /
8804	return check_reshape(mddev);
8805	}
8806
8807	static void raid5_takeover(struct* mddev *mddev)
8808	{
8809	/ raid5 can take over:*
8810	* raid0 - if there is only one strip zone - make it a raid4 layout
8811	* raid1 - if there are two drives. We need to know the chunk size
8812	* raid4 - trivial - just use a raid4 layout.
8813	* raid6 - Providing it is a *_6 layout
8814	*/
8815	if (mddev->level == `0`)
8816	return raid45_takeover_raid0(mddev, level: `5`);
8817	if (mddev->level == `1`)
8818	return raid5_takeover_raid1(mddev);
8819	if (mddev->level == `4`) {
8820	mddev->new_layout = ALGORITHM_PARITY_N;
8821	mddev->new_level = `5`;
8822	return setup_conf(mddev);
8823	}
8824	if (mddev->level == `6`)
8825	return raid5_takeover_raid6(mddev);
8826
8827	return ERR_PTR(error: -EINVAL);
8828	}
8829
8830	static void raid4_takeover(struct* mddev *mddev)
8831	{
8832	/ raid4 can take over:*
8833	* raid0 - if there is only one strip zone
8834	* raid5 - if layout is right
8835	*/
8836	if (mddev->level == `0`)
8837	return raid45_takeover_raid0(mddev, level: `4`);
8838	if (mddev->level == `5` &&
8839	mddev->layout == ALGORITHM_PARITY_N) {
8840	mddev->new_layout = `0`;
8841	mddev->new_level = `4`;
8842	return setup_conf(mddev);
8843	}
8844	return ERR_PTR(error: -EINVAL);
8845	}
8846
8847	static struct md_personality raid5_personality;
8848
8849	static void raid6_takeover(struct* mddev *mddev)
8850	{
8851	/ Currently can only take over a raid5. We map the*
8852	* personality to an equivalent raid6 personality
8853	* with the Q block at the end.
8854	*/
8855	int new_layout;
8856
8857	if (mddev->pers != &raid5_personality)
8858	return ERR_PTR(error: -EINVAL);
8859	if (mddev->degraded > `1`)
8860	return ERR_PTR(error: -EINVAL);
8861	if (mddev->raid_disks > `253`)
8862	return ERR_PTR(error: -EINVAL);
8863	if (mddev->raid_disks < `3`)
8864	return ERR_PTR(error: -EINVAL);
8865
8866	switch (mddev->layout) {
8867	case ALGORITHM_LEFT_ASYMMETRIC:
8868	new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
8869	break;
8870	case ALGORITHM_RIGHT_ASYMMETRIC:
8871	new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
8872	break;
8873	case ALGORITHM_LEFT_SYMMETRIC:
8874	new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
8875	break;
8876	case ALGORITHM_RIGHT_SYMMETRIC:
8877	new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
8878	break;
8879	case ALGORITHM_PARITY_0:
8880	new_layout = ALGORITHM_PARITY_0_6;
8881	break;
8882	case ALGORITHM_PARITY_N:
8883	new_layout = ALGORITHM_PARITY_N;
8884	break;
8885	default:
8886	return ERR_PTR(error: -EINVAL);
8887	}
8888	mddev->new_level = `6`;
8889	mddev->new_layout = new_layout;
8890	mddev->delta_disks = `1`;
8891	mddev->raid_disks += `1`;
8892	return setup_conf(mddev);
8893	}
8894
8895	static int raid5_change_consistency_policy(struct mddev mddev, const* char *buf)
8896	{
8897	struct r5conf *conf;
8898	int err;
8899
8900	err = mddev_suspend_and_lock(mddev);
8901	if (err)
8902	return err;
8903	conf = mddev->private;
8904	if (!conf) {
8905	mddev_unlock_and_resume(mddev);
8906	return -ENODEV;
8907	}
8908
8909	if (strncmp(buf, "ppl", `3`) == `0`) {
8910	/ ppl only works with RAID 5 /
8911	if (!raid5_has_ppl(conf) && conf->level == `5`) {
8912	err = log_init(conf, NULL, ppl: true);
8913	if (!err) {
8914	err = resize_stripes(conf, newsize: conf->pool_size);
8915	if (err)
8916	log_exit(conf);
8917	}
8918	} else
8919	err = -EINVAL;
8920	} else if (strncmp(buf, "resync", `6`) == `0`) {
8921	if (raid5_has_ppl(conf)) {
8922	log_exit(conf);
8923	err = resize_stripes(conf, newsize: conf->pool_size);
8924	} else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
8925	r5l_log_disk_error(conf)) {
8926	bool journal_dev_exists = false;
8927	struct md_rdev *rdev;
8928
8929	rdev_for_each(rdev, mddev)
8930	if (test_bit(Journal, &rdev->flags)) {
8931	journal_dev_exists = true;
8932	break;
8933	}
8934
8935	if (!journal_dev_exists)
8936	clear_bit(nr: MD_HAS_JOURNAL, addr: &mddev->flags);
8937	else / need remove journal device first /
8938	err = -EBUSY;
8939	} else
8940	err = -EINVAL;
8941	} else {
8942	err = -EINVAL;
8943	}
8944
8945	if (!err)
8946	md_update_sb(mddev, force: `1`);
8947
8948	mddev_unlock_and_resume(mddev);
8949
8950	return err;
8951	}
8952
8953	static int raid5_start(struct mddev *mddev)
8954	{
8955	struct r5conf *conf = mddev->private;
8956
8957	return r5l_start(log: conf->log);
8958	}
8959
8960	/*
8961	* This is only used for dm-raid456, caller already frozen sync_thread, hence
8962	* if rehsape is still in progress, io that is waiting for reshape can never be
8963	* done now, hence wake up and handle those IO.
8964	*/
8965	static void raid5_prepare_suspend(struct mddev *mddev)
8966	{
8967	struct r5conf *conf = mddev->private;
8968
8969	wake_up(&conf->wait_for_reshape);
8970	}
8971
8972	static struct md_personality raid6_personality =
8973	{
8974	.head = {
8975	.type = MD_PERSONALITY,
8976	.id = ID_RAID6,
8977	.name = "raid6",
8978	.owner = THIS_MODULE,
8979	},
8980
8981	.make_request = raid5_make_request,
8982	.run = raid5_run,
8983	.start = raid5_start,
8984	.free = raid5_free,
8985	.status = raid5_status,
8986	.error_handler = raid5_error,
8987	.hot_add_disk = raid5_add_disk,
8988	.hot_remove_disk= raid5_remove_disk,
8989	.spare_active = raid5_spare_active,
8990	.sync_request = raid5_sync_request,
8991	.resize = raid5_resize,
8992	.size = raid5_size,
8993	.check_reshape = raid6_check_reshape,
8994	.start_reshape = raid5_start_reshape,
8995	.finish_reshape = raid5_finish_reshape,
8996	.quiesce = raid5_quiesce,
8997	.takeover = raid6_takeover,
8998	.change_consistency_policy = raid5_change_consistency_policy,
8999	.prepare_suspend = raid5_prepare_suspend,
9000	.bitmap_sector = raid5_bitmap_sector,
9001	};
9002	static struct md_personality raid5_personality =
9003	{
9004	.head = {
9005	.type = MD_PERSONALITY,
9006	.id = ID_RAID5,
9007	.name = "raid5",
9008	.owner = THIS_MODULE,
9009	},
9010
9011	.make_request = raid5_make_request,
9012	.run = raid5_run,
9013	.start = raid5_start,
9014	.free = raid5_free,
9015	.status = raid5_status,
9016	.error_handler = raid5_error,
9017	.hot_add_disk = raid5_add_disk,
9018	.hot_remove_disk= raid5_remove_disk,
9019	.spare_active = raid5_spare_active,
9020	.sync_request = raid5_sync_request,
9021	.resize = raid5_resize,
9022	.size = raid5_size,
9023	.check_reshape = raid5_check_reshape,
9024	.start_reshape = raid5_start_reshape,
9025	.finish_reshape = raid5_finish_reshape,
9026	.quiesce = raid5_quiesce,
9027	.takeover = raid5_takeover,
9028	.change_consistency_policy = raid5_change_consistency_policy,
9029	.prepare_suspend = raid5_prepare_suspend,
9030	.bitmap_sector = raid5_bitmap_sector,
9031	};
9032
9033	static struct md_personality raid4_personality =
9034	{
9035	.head = {
9036	.type = MD_PERSONALITY,
9037	.id = ID_RAID4,
9038	.name = "raid4",
9039	.owner = THIS_MODULE,
9040	},
9041
9042	.make_request = raid5_make_request,
9043	.run = raid5_run,
9044	.start = raid5_start,
9045	.free = raid5_free,
9046	.status = raid5_status,
9047	.error_handler = raid5_error,
9048	.hot_add_disk = raid5_add_disk,
9049	.hot_remove_disk= raid5_remove_disk,
9050	.spare_active = raid5_spare_active,
9051	.sync_request = raid5_sync_request,
9052	.resize = raid5_resize,
9053	.size = raid5_size,
9054	.check_reshape = raid5_check_reshape,
9055	.start_reshape = raid5_start_reshape,
9056	.finish_reshape = raid5_finish_reshape,
9057	.quiesce = raid5_quiesce,
9058	.takeover = raid4_takeover,
9059	.change_consistency_policy = raid5_change_consistency_policy,
9060	.prepare_suspend = raid5_prepare_suspend,
9061	.bitmap_sector = raid5_bitmap_sector,
9062	};
9063
9064	static int __init raid5_init(void)
9065	{
9066	int ret;
9067
9068	raid5_wq = alloc_workqueue("raid5wq",
9069	WQ_UNBOUND\|WQ_MEM_RECLAIM\|WQ_SYSFS, `0`);
9070	if (!raid5_wq)
9071	return -ENOMEM;
9072
9073	ret = cpuhp_setup_state_multi(state: CPUHP_MD_RAID5_PREPARE,
9074	name: "md/raid5:prepare",
9075	startup: raid456_cpu_up_prepare,
9076	teardown: raid456_cpu_dead);
9077	if (ret)
9078	goto err_destroy_wq;
9079
9080	ret = register_md_submodule(msh: &raid6_personality.head);
9081	if (ret)
9082	goto err_cpuhp_remove;
9083
9084	ret = register_md_submodule(msh: &raid5_personality.head);
9085	if (ret)
9086	goto err_unregister_raid6;
9087
9088	ret = register_md_submodule(msh: &raid4_personality.head);
9089	if (ret)
9090	goto err_unregister_raid5;
9091
9092	return `0`;
9093
9094	err_unregister_raid5:
9095	unregister_md_submodule(msh: &raid5_personality.head);
9096	err_unregister_raid6:
9097	unregister_md_submodule(msh: &raid6_personality.head);
9098	err_cpuhp_remove:
9099	cpuhp_remove_multi_state(state: CPUHP_MD_RAID5_PREPARE);
9100	err_destroy_wq:
9101	destroy_workqueue(wq: raid5_wq);
9102	return ret;
9103	}
9104
9105	static void __exit raid5_exit(void)
9106	{
9107	unregister_md_submodule(msh: &raid6_personality.head);
9108	unregister_md_submodule(msh: &raid5_personality.head);
9109	unregister_md_submodule(msh: &raid4_personality.head);
9110	cpuhp_remove_multi_state(state: CPUHP_MD_RAID5_PREPARE);
9111	destroy_workqueue(wq: raid5_wq);
9112	}
9113
9114	module_init(raid5_init);
9115	module_exit(raid5_exit);
9116	MODULE_LICENSE("GPL");
9117	MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
9118	MODULE_ALIAS("md-personality-4"); / RAID5 /
9119	MODULE_ALIAS("md-raid5");
9120	MODULE_ALIAS("md-raid4");
9121	MODULE_ALIAS("md-level-5");
9122	MODULE_ALIAS("md-level-4");
9123	MODULE_ALIAS("md-personality-8"); / RAID6 /
9124	MODULE_ALIAS("md-raid6");
9125	MODULE_ALIAS("md-level-6");
9126
9127	/ This used to be two separate modules, they were: /
9128	MODULE_ALIAS("raid5");
9129	MODULE_ALIAS("raid6");
9130

/* End of linux/drivers/md/raid5.c */