// SPDX-License-Identifier: GPL-2.0-only
/*
 * Resource Director Technology (RDT)
 * - Monitoring code
 *
 * Copyright (C) 2017 Intel Corporation
 *
 * Author:
 *    Vikas Shivappa <vikas.shivappa@intel.com>
 *
 * This replaces the cqm.c based on perf but we reuse a lot of
 * code and data structures originally from Peter Zijlstra and Matt Fleming.
 *
 * More information about RDT can be found in the Intel(R) x86 Architecture
 * Software Developer Manual June 2016, volume 3, section 17.17.
 */

#define pr_fmt(fmt) "resctrl: " fmt

#include <linux/cpu.h>
#include <linux/resctrl.h>
#include <linux/sizes.h>
#include <linux/slab.h>

#include "internal.h"

#define CREATE_TRACE_POINTS

#include "monitor_trace.h"

/**
 * struct rmid_entry - dirty tracking for all RMID.
 * @closid:	The CLOSID for this entry.
 * @rmid:	The RMID for this entry.
 * @busy:	The number of domains with cached data using this RMID.
 * @list:	Member of the rmid_free_lru list when busy == 0.
 *
 * Depending on the architecture the correct monitor is accessed using
 * both @closid and @rmid, or @rmid only.
 *
 * Take the rdtgroup_mutex when accessing.
 */
struct rmid_entry {
	u32			closid;
	u32			rmid;
	int			busy;
	struct list_head	list;
};

/*
 * @rmid_free_lru - A least recently used list of free RMIDs.
 * These RMIDs are guaranteed to have an occupancy less than the
 * threshold occupancy.
 */
static LIST_HEAD(rmid_free_lru);

/*
 * @closid_num_dirty_rmid - The number of dirty RMID each CLOSID has.
 * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined.
 * Indexed by CLOSID. Protected by rdtgroup_mutex.
 */
static u32 *closid_num_dirty_rmid;

/*
 * @rmid_limbo_count - count of currently unused but (potentially)
 * dirty RMIDs.
 * This counts RMIDs that no one is currently using but that
 * may have an occupancy value > resctrl_rmid_realloc_threshold. User can
 * change the threshold occupancy value.
 */
static unsigned int rmid_limbo_count;

/*
 * @rmid_ptrs - The array of rmid_entry backing the limbo and free lists.
 */
static struct rmid_entry *rmid_ptrs;

/*
 * This is the threshold cache occupancy in bytes at which we will consider an
 * RMID available for re-allocation.
 */
unsigned int resctrl_rmid_realloc_threshold;

/*
 * This is the maximum value for the reallocation threshold, in bytes.
 */
unsigned int resctrl_rmid_realloc_limit;

/*
 * x86 and arm64 differ in their handling of monitoring.
 * x86's RMID are independent numbers, there is only one source of traffic
 * with an RMID value of '1'.
 * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of
 * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID
 * value is no longer unique.
 * To account for this, resctrl uses an index. On x86 this is just the RMID,
 * on arm64 it encodes the CLOSID and RMID. This gives a unique number.
 *
 * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code
 * must accept an attempt to read every index.
 */
static inline struct rmid_entry *__rmid_entry(u32 idx)
{
	struct rmid_entry *entry;
	u32 closid, rmid;

	entry = &rmid_ptrs[idx];
	resctrl_arch_rmid_idx_decode(idx, &closid, &rmid);

	WARN_ON_ONCE(entry->closid != closid);
	WARN_ON_ONCE(entry->rmid != rmid);

	return entry;
}

static void limbo_release_entry(struct rmid_entry *entry)
{
	lockdep_assert_held(&rdtgroup_mutex);

	rmid_limbo_count--;
	list_add_tail(&entry->list, &rmid_free_lru);

	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
		closid_num_dirty_rmid[entry->closid]--;
}

/*
 * Check the RMIDs that are marked as busy for this domain. If the
 * reported LLC occupancy is below the threshold clear the busy bit and
 * decrement the count. If the busy count gets to zero on an RMID, we
 * free the RMID.
 */
void __check_limbo(struct rdt_mon_domain *d, bool force_free)
{
	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
	struct rmid_entry *entry;
	u32 idx, cur_idx = 1;
	void *arch_mon_ctx;
	bool rmid_dirty;
	u64 val = 0;

	arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID);
	if (IS_ERR(arch_mon_ctx)) {
		pr_warn_ratelimited("Failed to allocate monitor context: %ld\n",
				    PTR_ERR(arch_mon_ctx));
		return;
	}

	/*
	 * Skip RMID 0 and start from RMID 1: check all the RMIDs that
	 * are marked as busy for an occupancy < threshold. If the
	 * occupancy is less than the threshold decrement the busy counter
	 * of the RMID and move it to the free list when the counter
	 * reaches 0.
	 */
	for (;;) {
		idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx);
		if (idx >= idx_limit)
			break;

		entry = __rmid_entry(idx);
		if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid,
					   QOS_L3_OCCUP_EVENT_ID, &val,
					   arch_mon_ctx)) {
			rmid_dirty = true;
		} else {
			rmid_dirty = (val >= resctrl_rmid_realloc_threshold);

			/*
			 * x86's CLOSID and RMID are independent numbers, so the entry's
			 * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the
			 * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't
			 * used to select the configuration. It is thus necessary to track both
			 * CLOSID and RMID because there may be dependencies between them
			 * on some architectures.
			 */
			trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val);
		}

		if (force_free || !rmid_dirty) {
			clear_bit(idx, d->rmid_busy_llc);
			if (!--entry->busy)
				limbo_release_entry(entry);
		}
		cur_idx = idx + 1;
	}

	resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx);
}

bool has_busy_rmid(struct rdt_mon_domain *d)
{
	u32 idx_limit = resctrl_arch_system_num_rmid_idx();

	return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit;
}

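/*
 * resctrl_find_free_rmid() - Find a free RMID that can be used with @closid.
 *
 * Return:
 * A free entry from rmid_free_lru on success. ERR_PTR(-EBUSY) if the free
 * list is empty while RMIDs wait on the limbo list, ERR_PTR(-ENOSPC)
 * otherwise.
 */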
static struct rmid_entry *resctrl_find_free_rmid(u32 closid)
{
	struct rmid_entry *itr;
	u32 itr_idx, cmp_idx;

	if (list_empty(&rmid_free_lru))
		return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC);

	list_for_each_entry(itr, &rmid_free_lru, list) {
		/*
		 * Get the index of this free RMID, and the index it would need
		 * to be if it were used with this CLOSID.
		 * If the CLOSID is irrelevant on this architecture, the two
		 * index values are always the same on every entry and thus the
		 * very first entry will be returned.
		 */
		itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid);
		cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid);

		if (itr_idx == cmp_idx)
			return itr;
	}

	return ERR_PTR(-ENOSPC);
}

/**
 * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated
 *                                  RMID are clean, or the CLOSID that has
 *                                  the most clean RMID.
 *
 * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID
 * may not be able to allocate clean RMID. To avoid this the allocator will
 * choose the CLOSID with the most clean RMID.
 *
 * When the CLOSID and RMID are independent numbers, the first free CLOSID will
 * be returned.
 *
 * Return:
 * A free CLOSID on success, -ENOSPC if no CLOSID is free, or -EIO when
 * CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is not enabled.
 */
int resctrl_find_cleanest_closid(void)
{
	u32 cleanest_closid = ~0;
	int i = 0;

	lockdep_assert_held(&rdtgroup_mutex);

	if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
		return -EIO;

	for (i = 0; i < closids_supported(); i++) {
		int num_dirty;

		if (closid_allocated(i))
			continue;

		num_dirty = closid_num_dirty_rmid[i];
		if (num_dirty == 0)
			return i;

		if (cleanest_closid == ~0)
			cleanest_closid = i;

		if (num_dirty < closid_num_dirty_rmid[cleanest_closid])
			cleanest_closid = i;
	}

	if (cleanest_closid == ~0)
		return -ENOSPC;

	return cleanest_closid;
}

/*
 * For MPAM the RMID value is not unique, and has to be considered with
 * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which
 * allows all domains to be managed by a single free list.
 * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler.
 */
int alloc_rmid(u32 closid)
{
	struct rmid_entry *entry;

	lockdep_assert_held(&rdtgroup_mutex);

	entry = resctrl_find_free_rmid(closid);
	if (IS_ERR(entry))
		return PTR_ERR(entry);

	list_del(&entry->list);
	return entry->rmid;
}

static void add_rmid_to_limbo(struct rmid_entry *entry)
{
	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
	struct rdt_mon_domain *d;
	u32 idx;

	lockdep_assert_held(&rdtgroup_mutex);

	/* Walking r->domains, ensure it can't race with cpuhp */
	lockdep_assert_cpus_held();

	idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid);

	entry->busy = 0;
	list_for_each_entry(d, &r->mon_domains, hdr.list) {
		/*
		 * For the first limbo RMID in the domain,
		 * set up the limbo worker.
		 */
		if (!has_busy_rmid(d))
			cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL,
						RESCTRL_PICK_ANY_CPU);
		set_bit(idx, d->rmid_busy_llc);
		entry->busy++;
	}

	rmid_limbo_count++;
	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
		closid_num_dirty_rmid[entry->closid]++;
}

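/*
 * free_rmid() - Return the RMID behind (@closid, @rmid) to the free list,
 * parking it on the limbo list first if the LLC occupancy event is enabled
 * and the RMID may therefore still be dirty.
 */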
void free_rmid(u32 closid, u32 rmid)
{
	u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
	struct rmid_entry *entry;

	lockdep_assert_held(&rdtgroup_mutex);

	/*
	 * Do not allow the default rmid to be free'd. Comparing by index
	 * allows architectures that ignore the closid parameter to avoid an
	 * unnecessary check.
	 */
	if (!resctrl_arch_mon_capable() ||
	    idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
						RESCTRL_RESERVED_RMID))
		return;

	entry = __rmid_entry(idx);

	if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID))
		add_rmid_to_limbo(entry);
	else
		list_add_tail(&entry->list, &rmid_free_lru);
}

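/*
 * get_mbm_state() - Return the mbm_state for the (@closid, @rmid) pair and
 * MBM event @evtid in domain @d, or NULL if @evtid is not an MBM event or
 * its state array is not allocated.
 */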
static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid,
				       u32 rmid, enum resctrl_event_id evtid)
{
	u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
	struct mbm_state *state;

	if (!resctrl_is_mbm_event(evtid))
		return NULL;

	state = d->mbm_states[MBM_STATE_IDX(evtid)];

	return state ? &state[idx] : NULL;
}

/*
 * mbm_cntr_get() - Return the counter ID for the matching @evtid and @rdtgrp.
 *
 * Return:
 * Valid counter ID on success, or -ENOENT on failure.
 */
static int mbm_cntr_get(struct rdt_resource *r, struct rdt_mon_domain *d,
			struct rdtgroup *rdtgrp, enum resctrl_event_id evtid)
{
	int cntr_id;

	if (!r->mon.mbm_cntr_assignable)
		return -ENOENT;

	if (!resctrl_is_mbm_event(evtid))
		return -ENOENT;

	for (cntr_id = 0; cntr_id < r->mon.num_mbm_cntrs; cntr_id++) {
		if (d->cntr_cfg[cntr_id].rdtgrp == rdtgrp &&
		    d->cntr_cfg[cntr_id].evtid == evtid)
			return cntr_id;
	}

	return -ENOENT;
}

/*
 * mbm_cntr_alloc() - Initialize and return a new counter ID in the domain @d.
 * Caller must ensure that the specified event is not assigned already.
 *
 * Return:
 * Valid counter ID on success, or -ENOSPC on failure.
 */
static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_mon_domain *d,
			  struct rdtgroup *rdtgrp, enum resctrl_event_id evtid)
{
	int cntr_id;

	for (cntr_id = 0; cntr_id < r->mon.num_mbm_cntrs; cntr_id++) {
		if (!d->cntr_cfg[cntr_id].rdtgrp) {
			d->cntr_cfg[cntr_id].rdtgrp = rdtgrp;
			d->cntr_cfg[cntr_id].evtid = evtid;
			return cntr_id;
		}
	}

	return -ENOSPC;
}

/*
 * mbm_cntr_free() - Clear the counter ID configuration details in the domain @d.
 */
static void mbm_cntr_free(struct rdt_mon_domain *d, int cntr_id)
{
	memset(&d->cntr_cfg[cntr_id], 0, sizeof(*d->cntr_cfg));
}

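/*
 * __mon_event_count() - Read the event described by @rr for @rdtgrp, via an
 * assigned counter or the RMID, accumulating the result into @rr->val. With
 * @rr->d set, read that single domain; otherwise sum the event over all
 * domains sharing the L3 cache instance described by @rr->ci.
 */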
static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr)
{
	int cpu = smp_processor_id();
	u32 closid = rdtgrp->closid;
	u32 rmid = rdtgrp->mon.rmid;
	struct rdt_mon_domain *d;
	int cntr_id = -ENOENT;
	struct mbm_state *m;
	int err, ret;
	u64 tval = 0;

	if (rr->is_mbm_cntr) {
		cntr_id = mbm_cntr_get(rr->r, rr->d, rdtgrp, rr->evtid);
		if (cntr_id < 0) {
			rr->err = -ENOENT;
			return -EINVAL;
		}
	}

	if (rr->first) {
		if (rr->is_mbm_cntr)
			resctrl_arch_reset_cntr(rr->r, rr->d, closid, rmid, cntr_id, rr->evtid);
		else
			resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid);
		m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
		if (m)
			memset(m, 0, sizeof(struct mbm_state));
		return 0;
	}

	if (rr->d) {
		/* Reading a single domain, must be on a CPU in that domain. */
		if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask))
			return -EINVAL;
		if (rr->is_mbm_cntr)
			rr->err = resctrl_arch_cntr_read(rr->r, rr->d, closid, rmid, cntr_id,
							 rr->evtid, &tval);
		else
			rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid,
							 rr->evtid, &tval, rr->arch_mon_ctx);
		if (rr->err)
			return rr->err;

		rr->val += tval;

		return 0;
	}

	/* Summing domains that share a cache, must be on a CPU for that cache. */
	if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
		return -EINVAL;

	/*
	 * Legacy files must report the sum of an event across all
	 * domains that share the same L3 cache instance.
	 * Report success if a read from any domain succeeds, -EINVAL
	 * (translated to "Unavailable" for user space) if reading from
	 * all domains fails for any reason.
	 */
	ret = -EINVAL;
	list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
		if (d->ci_id != rr->ci->id)
			continue;
		if (rr->is_mbm_cntr)
			err = resctrl_arch_cntr_read(rr->r, d, closid, rmid, cntr_id,
						     rr->evtid, &tval);
		else
			err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
						     rr->evtid, &tval, rr->arch_mon_ctx);
		if (!err) {
			rr->val += tval;
			ret = 0;
		}
	}

	if (ret)
		rr->err = ret;

	return ret;
}

/*
 * mbm_bw_count() - Update bw count from values previously read by
 *		    __mon_event_count().
 * @rdtgrp:	resctrl group associated with the CLOSID and RMID to identify
 *		the cached mbm_state.
 * @rr:		The struct rmid_read populated by __mon_event_count().
 *
 * Supporting function to calculate the memory bandwidth
 * and delta bandwidth in MBps. The chunks value previously read by
 * __mon_event_count() is compared with the chunks value from the previous
 * invocation. This must be called once per second to maintain values in MBps.
 */
static void mbm_bw_count(struct rdtgroup *rdtgrp, struct rmid_read *rr)
{
	u64 cur_bw, bytes, cur_bytes;
	u32 closid = rdtgrp->closid;
	u32 rmid = rdtgrp->mon.rmid;
	struct mbm_state *m;

	m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
	if (WARN_ON_ONCE(!m))
		return;

	cur_bytes = rr->val;
	bytes = cur_bytes - m->prev_bw_bytes;
	m->prev_bw_bytes = cur_bytes;

	cur_bw = bytes / SZ_1M;

	m->prev_bw = cur_bw;
}

/*
 * This is scheduled by mon_event_read() to read the CQM/MBM counters
 * on a domain.
 */
void mon_event_count(void *info)
{
	struct rdtgroup *rdtgrp, *entry;
	struct rmid_read *rr = info;
	struct list_head *head;
	int ret;

	rdtgrp = rr->rgrp;

	ret = __mon_event_count(rdtgrp, rr);

	/*
	 * For Ctrl groups read data from child monitor groups and
	 * add them together. Count events which are read successfully.
	 * Discard the rmid_read's reporting errors.
	 */
	head = &rdtgrp->mon.crdtgrp_list;

	if (rdtgrp->type == RDTCTRL_GROUP) {
		list_for_each_entry(entry, head, mon.crdtgrp_list) {
			if (__mon_event_count(entry, rr) == 0)
				ret = 0;
		}
	}

	/*
	 * __mon_event_count() calls for newly created monitor groups may
	 * report -EINVAL/Unavailable if the monitor hasn't seen any traffic.
	 * Discard error if any of the monitor event reads succeeded.
	 */
	if (ret == 0)
		rr->err = 0;
}

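/* Return the control domain of resource @r that contains @cpu, if any. */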
static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu,
							struct rdt_resource *r)
{
	struct rdt_ctrl_domain *d;

	lockdep_assert_cpus_held();

	list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
		/* Find the domain that contains this CPU */
		if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
			return d;
	}

	return NULL;
}

/*
 * Feedback loop for MBA software controller (mba_sc)
 *
 * mba_sc is a feedback loop where we periodically read MBM counters and
 * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
 * that:
 *
 *   current bandwidth (cur_bw) < user specified bandwidth (user_bw)
 *
 * This uses the MBM counters to measure the bandwidth and MBA throttle
 * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
 * fact that resctrl rdtgroups have both monitoring and control.
 *
 * The frequency of the checks is 1s and we just tag along the MBM overflow
 * timer. Having 1s interval makes the calculation of bandwidth simpler.
 *
 * Although MBA's goal is to restrict the bandwidth to a maximum, there may
 * be a need to increase the bandwidth to avoid unnecessarily restricting
 * the L2 <-> L3 traffic.
 *
 * Since MBA controls the L2 external bandwidth whereas MBM measures the
 * L3 external bandwidth the following sequence could lead to such a
 * situation.
 *
 * Consider an rdtgroup which had high L3 <-> memory traffic in initial
 * phases -> mba_sc kicks in and reduces bandwidth percentage values -> but
 * after some time rdtgroup has mostly L2 <-> L3 traffic.
 *
 * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
 * throttle MSRs already have low percentage values. To avoid
 * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
 */
static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
{
	u32 closid, rmid, cur_msr_val, new_msr_val;
	struct mbm_state *pmbm_data, *cmbm_data;
	struct rdt_ctrl_domain *dom_mba;
	enum resctrl_event_id evt_id;
	struct rdt_resource *r_mba;
	struct list_head *head;
	struct rdtgroup *entry;
	u32 cur_bw, user_bw;

	r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
	evt_id = rgrp->mba_mbps_event;

	closid = rgrp->closid;
	rmid = rgrp->mon.rmid;
	pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id);
	if (WARN_ON_ONCE(!pmbm_data))
		return;

	dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba);
	if (!dom_mba) {
		pr_warn_once("Failure to get domain for MBA update\n");
		return;
	}

	cur_bw = pmbm_data->prev_bw;
	user_bw = dom_mba->mbps_val[closid];

	/* MBA resource doesn't support CDP */
	cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);

	/*
	 * For Ctrl groups read data from child monitor groups.
	 */
	head = &rgrp->mon.crdtgrp_list;
	list_for_each_entry(entry, head, mon.crdtgrp_list) {
		cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id);
		if (WARN_ON_ONCE(!cmbm_data))
			return;
		cur_bw += cmbm_data->prev_bw;
	}

	/*
	 * Scale up/down the bandwidth linearly for the ctrl group. The
	 * bandwidth step is the bandwidth granularity specified by the
	 * hardware.
	 * Always increase throttling if current bandwidth is above the
	 * target set by user.
	 * But avoid thrashing up and down on every poll by checking
	 * whether a decrease in throttling is likely to push the group
	 * back over target. E.g. if currently throttling to 30% of bandwidth
	 * on a system with 10% granularity steps, check whether moving to
	 * 40% would go past the limit by multiplying current bandwidth by
	 * "(30 + 10) / 30".
	 */
	if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
		new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
	} else if (cur_msr_val < MAX_MBA_BW &&
		   (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) {
		new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
	} else {
		return;
	}

	resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
}

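/*
 * mbm_update_one_event() - Read one MBM event for @rdtgrp in domain @d from
 * the overflow handler, so the event is sampled before the hardware counter
 * can wrap and, when the mba_sc software controller is enabled, the cached
 * bandwidth value stays current.
 */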
static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d,
				 struct rdtgroup *rdtgrp, enum resctrl_event_id evtid)
{
	struct rmid_read rr = {0};

	rr.r = r;
	rr.d = d;
	rr.evtid = evtid;
	if (resctrl_arch_mbm_cntr_assign_enabled(r)) {
		rr.is_mbm_cntr = true;
	} else {
		rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
		if (IS_ERR(rr.arch_mon_ctx)) {
			pr_warn_ratelimited("Failed to allocate monitor context: %ld\n",
					    PTR_ERR(rr.arch_mon_ctx));
			return;
		}
	}

	__mon_event_count(rdtgrp, &rr);

	/*
	 * If the software controller is enabled, compute the
	 * bandwidth for this event id.
	 */
	if (is_mba_sc(NULL))
		mbm_bw_count(rdtgrp, &rr);

	if (rr.arch_mon_ctx)
		resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
}

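/*
 * mbm_update() - Read all enabled MBM events for @rdtgrp in domain @d.
 */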
static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
		       struct rdtgroup *rdtgrp)
{
	/*
	 * This is protected from concurrent reads from user as both
	 * the user and overflow handler hold the global mutex.
	 */
	if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
		mbm_update_one_event(r, d, rdtgrp, QOS_L3_MBM_TOTAL_EVENT_ID);

	if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
		mbm_update_one_event(r, d, rdtgrp, QOS_L3_MBM_LOCAL_EVENT_ID);
}

/*
 * Handler to scan the limbo list and move RMIDs whose occupancy is below
 * the threshold occupancy to the free list.
 */
void cqm_handle_limbo(struct work_struct *work)
{
	unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
	struct rdt_mon_domain *d;

	cpus_read_lock();
	mutex_lock(&rdtgroup_mutex);

	d = container_of(work, struct rdt_mon_domain, cqm_limbo.work);

	__check_limbo(d, false);

	if (has_busy_rmid(d)) {
		d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
							   RESCTRL_PICK_ANY_CPU);
		schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo,
					 delay);
	}

	mutex_unlock(&rdtgroup_mutex);
	cpus_read_unlock();
}

/**
 * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this
 *                             domain.
 * @dom:           The domain the limbo handler should run for.
 * @delay_ms:      How far in the future the handler should run.
 * @exclude_cpu:   Which CPU the handler should not run on,
 *		   RESCTRL_PICK_ANY_CPU to pick any CPU.
 */
void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
			     int exclude_cpu)
{
	unsigned long delay = msecs_to_jiffies(delay_ms);
	int cpu;

	cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
	dom->cqm_work_cpu = cpu;

	if (cpu < nr_cpu_ids)
		schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
}

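/*
 * mbm_handle_overflow() - Periodic worker that reads all MBM events in one
 * domain before the hardware counters can overflow, and runs the mba_sc
 * feedback loop when enabled. Re-arms itself while the filesystem remains
 * mounted.
 */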
void mbm_handle_overflow(struct work_struct *work)
{
	unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
	struct rdtgroup *prgrp, *crgrp;
	struct rdt_mon_domain *d;
	struct list_head *head;
	struct rdt_resource *r;

	cpus_read_lock();
	mutex_lock(&rdtgroup_mutex);

	/*
	 * If the filesystem has been unmounted this work no longer needs to
	 * run.
	 */
	if (!resctrl_mounted || !resctrl_arch_mon_capable())
		goto out_unlock;

	r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
	d = container_of(work, struct rdt_mon_domain, mbm_over.work);

	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
		mbm_update(r, d, prgrp);

		head = &prgrp->mon.crdtgrp_list;
		list_for_each_entry(crgrp, head, mon.crdtgrp_list)
			mbm_update(r, d, crgrp);

		if (is_mba_sc(NULL))
			update_mba_bw(prgrp, d);
	}

	/*
	 * Re-check for housekeeping CPUs. This allows the overflow handler to
	 * move off a nohz_full CPU quickly.
	 */
	d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
						   RESCTRL_PICK_ANY_CPU);
	schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay);

out_unlock:
	mutex_unlock(&rdtgroup_mutex);
	cpus_read_unlock();
}

/**
 * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this
 *                                domain.
 * @dom:           The domain the overflow handler should run for.
 * @delay_ms:      How far in the future the handler should run.
 * @exclude_cpu:   Which CPU the handler should not run on,
 *		   RESCTRL_PICK_ANY_CPU to pick any CPU.
 */
void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
				int exclude_cpu)
{
	unsigned long delay = msecs_to_jiffies(delay_ms);
	int cpu;

	/*
	 * When a domain comes online there is no guarantee the filesystem is
	 * mounted. If not, there is no need to catch counter overflow.
	 */
	if (!resctrl_mounted || !resctrl_arch_mon_capable())
		return;
	cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
	dom->mbm_work_cpu = cpu;

	if (cpu < nr_cpu_ids)
		schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
}

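/*
 * dom_data_init() - Allocate rmid_ptrs[] (and closid_num_dirty_rmid[] when
 * RMIDs depend on the CLOSID) and seed the free list with every entry except
 * the reserved CLOSID/RMID pair used by the default group.
 */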
static int dom_data_init(struct rdt_resource *r)
{
	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
	u32 num_closid = resctrl_arch_get_num_closid(r);
	struct rmid_entry *entry = NULL;
	int err = 0, i;
	u32 idx;

	mutex_lock(&rdtgroup_mutex);
	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
		u32 *tmp;

		/*
		 * If the architecture hasn't provided a sanitised value here,
		 * this may result in larger arrays than necessary. Resctrl will
		 * use a smaller system wide value based on the resources in
		 * use.
		 */
		tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL);
		if (!tmp) {
			err = -ENOMEM;
			goto out_unlock;
		}

		closid_num_dirty_rmid = tmp;
	}

	rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL);
	if (!rmid_ptrs) {
		if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
			kfree(closid_num_dirty_rmid);
			closid_num_dirty_rmid = NULL;
		}
		err = -ENOMEM;
		goto out_unlock;
	}

	for (i = 0; i < idx_limit; i++) {
		entry = &rmid_ptrs[i];
		INIT_LIST_HEAD(&entry->list);

		resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid);
		list_add_tail(&entry->list, &rmid_free_lru);
	}

	/*
	 * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and
	 * are always allocated. These are used for the rdtgroup_default
	 * control group, which will be set up later in resctrl_init().
	 */
	idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
					   RESCTRL_RESERVED_RMID);
	entry = __rmid_entry(idx);
	list_del(&entry->list);

out_unlock:
	mutex_unlock(&rdtgroup_mutex);

	return err;
}

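/* dom_data_exit() - Free the allocations made by dom_data_init(). */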
static void dom_data_exit(struct rdt_resource *r)
{
	mutex_lock(&rdtgroup_mutex);

	if (!r->mon_capable)
		goto out_unlock;

	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
		kfree(closid_num_dirty_rmid);
		closid_num_dirty_rmid = NULL;
	}

	kfree(rmid_ptrs);
	rmid_ptrs = NULL;

out_unlock:
	mutex_unlock(&rdtgroup_mutex);
}

/*
 * All available events. Architecture code marks the ones that
 * are supported by a system using resctrl_enable_mon_event()
 * to set .enabled.
 */
struct mon_evt mon_event_all[QOS_NUM_EVENTS] = {
	[QOS_L3_OCCUP_EVENT_ID] = {
		.name	= "llc_occupancy",
		.evtid	= QOS_L3_OCCUP_EVENT_ID,
		.rid	= RDT_RESOURCE_L3,
	},
	[QOS_L3_MBM_TOTAL_EVENT_ID] = {
		.name	= "mbm_total_bytes",
		.evtid	= QOS_L3_MBM_TOTAL_EVENT_ID,
		.rid	= RDT_RESOURCE_L3,
	},
	[QOS_L3_MBM_LOCAL_EVENT_ID] = {
		.name	= "mbm_local_bytes",
		.evtid	= QOS_L3_MBM_LOCAL_EVENT_ID,
		.rid	= RDT_RESOURCE_L3,
	},
};

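/*
 * resctrl_enable_mon_event() - Mark @eventid in mon_event_all[] as supported
 * by this system. Called by the architecture code for each supported event.
 */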
void resctrl_enable_mon_event(enum resctrl_event_id eventid)
{
	if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS))
		return;
	if (mon_event_all[eventid].enabled) {
		pr_warn("Duplicate enable for event %d\n", eventid);
		return;
	}

	mon_event_all[eventid].enabled = true;
}

bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid)
{
	return eventid >= QOS_FIRST_EVENT && eventid < QOS_NUM_EVENTS &&
	       mon_event_all[eventid].enabled;
}

u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id evtid)
{
	return mon_event_all[evtid].evt_cfg;
}

/**
 * struct mbm_transaction - Memory transaction an MBM event can be configured with.
 * @name:	Name of memory transaction (read, write ...).
 * @val:	The bit (eg. READS_TO_LOCAL_MEM or READS_TO_REMOTE_MEM) used to
 *		represent the memory transaction within an event's configuration.
 */
struct mbm_transaction {
	char	name[32];
	u32	val;
};

/* Decoded values for each type of memory transaction. */
static struct mbm_transaction mbm_transactions[NUM_MBM_TRANSACTIONS] = {
	{"local_reads", READS_TO_LOCAL_MEM},
	{"remote_reads", READS_TO_REMOTE_MEM},
	{"local_non_temporal_writes", NON_TEMP_WRITE_TO_LOCAL_MEM},
	{"remote_non_temporal_writes", NON_TEMP_WRITE_TO_REMOTE_MEM},
	{"local_reads_slow_memory", READS_TO_LOCAL_S_MEM},
	{"remote_reads_slow_memory", READS_TO_REMOTE_S_MEM},
	{"dirty_victim_writes_all", DIRTY_VICTIMS_TO_ALL_MEM},
};

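/*
 * event_filter_show() - Show the memory transactions an MBM event is
 * configured to count, as a comma separated list. Backs the per-event
 * "event_filter" resctrl file.
 */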
int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v)
{
	struct mon_evt *mevt = rdt_kn_parent_priv(of->kn);
	struct rdt_resource *r;
	bool sep = false;
	int ret = 0, i;

	mutex_lock(&rdtgroup_mutex);
	rdt_last_cmd_clear();

	r = resctrl_arch_get_resource(mevt->rid);
	if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
		rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n");
		ret = -EINVAL;
		goto out_unlock;
	}

	for (i = 0; i < NUM_MBM_TRANSACTIONS; i++) {
		if (mevt->evt_cfg & mbm_transactions[i].val) {
			if (sep)
				seq_putc(seq, ',');
			seq_printf(seq, "%s", mbm_transactions[i].name);
			sep = true;
		}
	}
	seq_putc(seq, '\n');

out_unlock:
	mutex_unlock(&rdtgroup_mutex);

	return ret;
}

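/*
 * The "mbm_assign_on_mkdir" interface: controls whether counters are
 * automatically assigned to MBM events when a new monitor group is created.
 * Only meaningful while mbm_event counter assignment mode is enabled.
 */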
int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of, struct seq_file *s,
				     void *v)
{
	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
	int ret = 0;

	mutex_lock(&rdtgroup_mutex);
	rdt_last_cmd_clear();

	if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
		rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n");
		ret = -EINVAL;
		goto out_unlock;
	}

	seq_printf(s, "%u\n", r->mon.mbm_assign_on_mkdir);

out_unlock:
	mutex_unlock(&rdtgroup_mutex);

	return ret;
}

ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf,
					  size_t nbytes, loff_t off)
{
	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
	bool value;
	int ret;

	ret = kstrtobool(buf, &value);
	if (ret)
		return ret;

	mutex_lock(&rdtgroup_mutex);
	rdt_last_cmd_clear();

	if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
		rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n");
		ret = -EINVAL;
		goto out_unlock;
	}

	r->mon.mbm_assign_on_mkdir = value;

out_unlock:
	mutex_unlock(&rdtgroup_mutex);

	return ret ?: nbytes;
}

/*
 * mbm_cntr_free_all() - Clear all the counter ID configuration details in the
 * domain @d. Called when mbm_assign_mode is changed.
 */
static void mbm_cntr_free_all(struct rdt_resource *r, struct rdt_mon_domain *d)
{
	memset(d->cntr_cfg, 0, sizeof(*d->cntr_cfg) * r->mon.num_mbm_cntrs);
}

/*
 * resctrl_reset_rmid_all() - Reset all non-architecture states for all the
 * supported RMIDs.
 */
static void resctrl_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
{
	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
	enum resctrl_event_id evt;
	int idx;

	for_each_mbm_event_id(evt) {
		if (!resctrl_is_mon_event_enabled(evt))
			continue;
		idx = MBM_STATE_IDX(evt);
		memset(d->mbm_states[idx], 0, sizeof(*d->mbm_states[0]) * idx_limit);
	}
}

/*
 * rdtgroup_assign_cntr() - Assign/unassign the counter ID for the event, RMID
 * pair in the domain.
 *
 * Assign the counter if @assign is true else unassign the counter. Reset the
 * associated non-architectural state.
 */
static void rdtgroup_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
				 enum resctrl_event_id evtid, u32 rmid, u32 closid,
				 u32 cntr_id, bool assign)
{
	struct mbm_state *m;

	resctrl_arch_config_cntr(r, d, evtid, rmid, closid, cntr_id, assign);

	m = get_mbm_state(d, closid, rmid, evtid);
	if (m)
		memset(m, 0, sizeof(*m));
}

/*
 * rdtgroup_alloc_assign_cntr() - Allocate a counter ID and assign it to the event
 * pointed to by @mevt and the resctrl group @rdtgrp within the domain @d.
 *
 * Return:
 * 0 on success, < 0 on failure.
 */
static int rdtgroup_alloc_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
				      struct rdtgroup *rdtgrp, struct mon_evt *mevt)
{
	int cntr_id;

	/* No action required if the counter is assigned already. */
	cntr_id = mbm_cntr_get(r, d, rdtgrp, mevt->evtid);
	if (cntr_id >= 0)
		return 0;

	cntr_id = mbm_cntr_alloc(r, d, rdtgrp, mevt->evtid);
	if (cntr_id < 0) {
		rdt_last_cmd_printf("Failed to allocate counter for %s in domain %d\n",
				    mevt->name, d->hdr.id);
		return cntr_id;
	}

	rdtgroup_assign_cntr(r, d, mevt->evtid, rdtgrp->mon.rmid, rdtgrp->closid, cntr_id, true);

	return 0;
}

/*
 * rdtgroup_assign_cntr_event() - Assign a hardware counter for the event in
 * @mevt to the resctrl group @rdtgrp. Assign counters to all domains if @d is
 * NULL; otherwise, assign the counter to the specified domain @d.
 *
 * If all counters in a domain are already in use, rdtgroup_alloc_assign_cntr()
 * will fail. The assignment process will abort at the first failure encountered
 * during domain traversal, which may result in the event being only partially
 * assigned.
 *
 * Return:
 * 0 on success, < 0 on failure.
 */
static int rdtgroup_assign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
				      struct mon_evt *mevt)
{
	struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid);
	int ret = 0;

	if (!d) {
		list_for_each_entry(d, &r->mon_domains, hdr.list) {
			ret = rdtgroup_alloc_assign_cntr(r, d, rdtgrp, mevt);
			if (ret)
				return ret;
		}
	} else {
		ret = rdtgroup_alloc_assign_cntr(r, d, rdtgrp, mevt);
	}

	return ret;
}

/*
 * rdtgroup_assign_cntrs() - Assign counters to MBM events. Called when
 * a new group is created.
 *
 * Each group can accommodate two counters per domain: one for the total
 * event and one for the local event. Assignments may fail due to the limited
 * number of counters. However, it is not necessary to fail the group creation
 * and thus no failure is returned. Users have the option to modify the
 * counter assignments after the group has been created.
 */
void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp)
{
	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);

	if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r) ||
	    !r->mon.mbm_assign_on_mkdir)
		return;

	if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
		rdtgroup_assign_cntr_event(NULL, rdtgrp,
					   &mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID]);

	if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
		rdtgroup_assign_cntr_event(NULL, rdtgrp,
					   &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]);
}

/*
 * rdtgroup_free_unassign_cntr() - Unassign and reset the counter ID configuration
 * for the event pointed to by @mevt within the domain @d and resctrl group @rdtgrp.
 */
static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
					struct rdtgroup *rdtgrp, struct mon_evt *mevt)
{
	int cntr_id;

	cntr_id = mbm_cntr_get(r, d, rdtgrp, mevt->evtid);

	/* If there is no cntr_id assigned, nothing to do */
	if (cntr_id < 0)
		return;

	rdtgroup_assign_cntr(r, d, mevt->evtid, rdtgrp->mon.rmid, rdtgrp->closid, cntr_id, false);

	mbm_cntr_free(d, cntr_id);
}

/*
 * rdtgroup_unassign_cntr_event() - Unassign a hardware counter associated with
 * the event structure @mevt from the domain @d and the group @rdtgrp. Unassign
 * the counters from all the domains if @d is NULL else unassign from @d.
 */
static void rdtgroup_unassign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
					 struct mon_evt *mevt)
{
	struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid);

	if (!d) {
		list_for_each_entry(d, &r->mon_domains, hdr.list)
			rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt);
	} else {
		rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt);
	}
}

/*
 * rdtgroup_unassign_cntrs() - Unassign the counters associated with MBM events.
 * Called when a group is deleted.
 */
void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp)
{
	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);

	if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r))
		return;

	if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
		rdtgroup_unassign_cntr_event(NULL, rdtgrp,
					     &mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID]);

	if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
		rdtgroup_unassign_cntr_event(NULL, rdtgrp,
					     &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]);
}

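/*
 * resctrl_parse_mem_transactions() - Parse a comma separated list of memory
 * transaction names from @tok, OR-ing the corresponding configuration bits
 * into @val. Returns -EINVAL for an unrecognized transaction name.
 */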
static int resctrl_parse_mem_transactions(char *tok, u32 *val)
{
	u32 temp_val = 0;
	char *evt_str;
	bool found;
	int i;

next_config:
	if (!tok || tok[0] == '\0') {
		*val = temp_val;
		return 0;
	}

	/* Start processing the strings for each memory transaction type */
	evt_str = strim(strsep(&tok, ","));
	found = false;
	for (i = 0; i < NUM_MBM_TRANSACTIONS; i++) {
		if (!strcmp(mbm_transactions[i].name, evt_str)) {
			temp_val |= mbm_transactions[i].val;
			found = true;
			break;
		}
	}

	if (!found) {
		rdt_last_cmd_printf("Invalid memory transaction type %s\n", evt_str);
		return -EINVAL;
	}

	goto next_config;
}

/*
 * rdtgroup_update_cntr_event() - Update the counter assignments for the event
 * in a group.
 * @r:		Resource to which the update needs to be done.
 * @rdtgrp:	Resctrl group.
 * @evtid:	MBM monitor event.
 */
static void rdtgroup_update_cntr_event(struct rdt_resource *r, struct rdtgroup *rdtgrp,
				       enum resctrl_event_id evtid)
{
	struct rdt_mon_domain *d;
	int cntr_id;

	list_for_each_entry(d, &r->mon_domains, hdr.list) {
		cntr_id = mbm_cntr_get(r, d, rdtgrp, evtid);
		if (cntr_id >= 0)
			rdtgroup_assign_cntr(r, d, evtid, rdtgrp->mon.rmid,
					     rdtgrp->closid, cntr_id, true);
	}
}

/*
 * resctrl_update_cntr_allrdtgrp() - Update the counter assignments for the
 * event for all the groups.
 * @mevt:	MBM monitor event.
 */
static void resctrl_update_cntr_allrdtgrp(struct mon_evt *mevt)
{
	struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid);
	struct rdtgroup *prgrp, *crgrp;

	/*
	 * Find all the groups where the event is assigned and update the
	 * configuration of existing assignments.
	 */
	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
		rdtgroup_update_cntr_event(r, prgrp, mevt->evtid);

		list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
			rdtgroup_update_cntr_event(r, crgrp, mevt->evtid);
	}
}

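/*
 * event_filter_write() - Update the memory transactions an MBM event counts
 * and re-program all counters currently assigned for that event.
 */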
ssize_t event_filter_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
			   loff_t off)
{
	struct mon_evt *mevt = rdt_kn_parent_priv(of->kn);
	struct rdt_resource *r;
	u32 evt_cfg = 0;
	int ret = 0;

	/* Valid input requires a trailing newline */
	if (nbytes == 0 || buf[nbytes - 1] != '\n')
		return -EINVAL;

	buf[nbytes - 1] = '\0';

	cpus_read_lock();
	mutex_lock(&rdtgroup_mutex);

	rdt_last_cmd_clear();

	r = resctrl_arch_get_resource(mevt->rid);
	if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
		rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n");
		ret = -EINVAL;
		goto out_unlock;
	}

	ret = resctrl_parse_mem_transactions(buf, &evt_cfg);
	if (!ret && mevt->evt_cfg != evt_cfg) {
		mevt->evt_cfg = evt_cfg;
		resctrl_update_cntr_allrdtgrp(mevt);
	}

out_unlock:
	mutex_unlock(&rdtgroup_mutex);
	cpus_read_unlock();

	return ret ?: nbytes;
}

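/*
 * resctrl_mbm_assign_mode_show() - Show the available counter assignment
 * modes, with the active mode in brackets.
 */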
int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of,
				 struct seq_file *s, void *v)
{
	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
	bool enabled;

	mutex_lock(&rdtgroup_mutex);
	enabled = resctrl_arch_mbm_cntr_assign_enabled(r);

	if (r->mon.mbm_cntr_assignable) {
		if (enabled)
			seq_puts(s, "[mbm_event]\n");
		else
			seq_puts(s, "[default]\n");

		if (!IS_ENABLED(CONFIG_RESCTRL_ASSIGN_FIXED)) {
			if (enabled)
				seq_puts(s, "default\n");
			else
				seq_puts(s, "mbm_event\n");
		}
	} else {
		seq_puts(s, "[default]\n");
	}

	mutex_unlock(&rdtgroup_mutex);

	return 0;
}

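/*
 * resctrl_mbm_assign_mode_write() - Switch between the "default" and
 * "mbm_event" counter assignment modes. A mode change releases all
 * assignable counters and resets the non-architectural MBM state.
 */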
ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf,
				      size_t nbytes, loff_t off)
{
	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
	struct rdt_mon_domain *d;
	int ret = 0;
	bool enable;

	/* Valid input requires a trailing newline */
	if (nbytes == 0 || buf[nbytes - 1] != '\n')
		return -EINVAL;

	buf[nbytes - 1] = '\0';

	cpus_read_lock();
	mutex_lock(&rdtgroup_mutex);

	rdt_last_cmd_clear();

	if (!strcmp(buf, "default")) {
		enable = false;
	} else if (!strcmp(buf, "mbm_event")) {
		if (r->mon.mbm_cntr_assignable) {
			enable = true;
		} else {
			ret = -EINVAL;
			rdt_last_cmd_puts("mbm_event mode is not supported\n");
			goto out_unlock;
		}
	} else {
		ret = -EINVAL;
		rdt_last_cmd_puts("Unsupported assign mode\n");
		goto out_unlock;
	}

	if (enable != resctrl_arch_mbm_cntr_assign_enabled(r)) {
		ret = resctrl_arch_mbm_cntr_assign_set(r, enable);
		if (ret)
			goto out_unlock;

		/* Update the visibility of BMEC related files */
		resctrl_bmec_files_show(r, NULL, !enable);

		/*
		 * Initialize the default memory transaction values for
		 * total and local events.
		 */
		if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
			mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask;
		if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
			mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask &
									   (READS_TO_LOCAL_MEM |
									    READS_TO_LOCAL_S_MEM |
									    NON_TEMP_WRITE_TO_LOCAL_MEM);
		/* Enable auto assignment when switching to "mbm_event" mode */
		if (enable)
			r->mon.mbm_assign_on_mkdir = true;
		/*
		 * Reset all the non-architectural RMID state and assignable counters.
		 */
		list_for_each_entry(d, &r->mon_domains, hdr.list) {
			mbm_cntr_free_all(r, d);
			resctrl_reset_rmid_all(r, d);
		}
	}

out_unlock:
	mutex_unlock(&rdtgroup_mutex);
	cpus_read_unlock();

	return ret ?: nbytes;
}

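/*
 * resctrl_num_mbm_cntrs_show() - Show the total number of assignable
 * counters, per monitor domain, as "<domain>=<count>" pairs.
 */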
int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of,
			       struct seq_file *s, void *v)
{
	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
	struct rdt_mon_domain *dom;
	bool sep = false;

	cpus_read_lock();
	mutex_lock(&rdtgroup_mutex);

	list_for_each_entry(dom, &r->mon_domains, hdr.list) {
		if (sep)
			seq_putc(s, ';');

		seq_printf(s, "%d=%d", dom->hdr.id, r->mon.num_mbm_cntrs);
		sep = true;
	}
	seq_putc(s, '\n');

	mutex_unlock(&rdtgroup_mutex);
	cpus_read_unlock();
	return 0;
}

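/*
 * resctrl_available_mbm_cntrs_show() - Show the number of currently
 * unassigned counters, per monitor domain, as "<domain>=<count>" pairs.
 */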
int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of,
				     struct seq_file *s, void *v)
{
	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
	struct rdt_mon_domain *dom;
	bool sep = false;
	u32 cntrs, i;
	int ret = 0;

	cpus_read_lock();
	mutex_lock(&rdtgroup_mutex);

	rdt_last_cmd_clear();

	if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
		rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n");
		ret = -EINVAL;
		goto out_unlock;
	}

	list_for_each_entry(dom, &r->mon_domains, hdr.list) {
		if (sep)
			seq_putc(s, ';');

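		/* A counter with no owning rdtgroup is available for assignment. */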
		cntrs = 0;
		for (i = 0; i < r->mon.num_mbm_cntrs; i++) {
			if (!dom->cntr_cfg[i].rdtgrp)
				cntrs++;
		}

		seq_printf(s, "%d=%u", dom->hdr.id, cntrs);
		sep = true;
	}
	seq_putc(s, '\n');

out_unlock:
	mutex_unlock(&rdtgroup_mutex);
	cpus_read_unlock();

	return ret;
}

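/*
 * mbm_L3_assignments_show() - Show the counter assignment state of each
 * enabled MBM event for the resource group, one event per line. A domain is
 * reported as "<domain id>=e" when a counter is assigned there, and as
 * "<domain id>=_" when it is not.
 */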
int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v)
{
	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
	struct rdt_mon_domain *d;
	struct rdtgroup *rdtgrp;
	struct mon_evt *mevt;
	int ret = 0;
	bool sep;

	rdtgrp = rdtgroup_kn_lock_live(of->kn);
	if (!rdtgrp) {
		ret = -ENOENT;
		goto out_unlock;
	}

	rdt_last_cmd_clear();
	if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
		rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n");
		ret = -EINVAL;
		goto out_unlock;
	}

	for_each_mon_event(mevt) {
		if (mevt->rid != r->rid || !mevt->enabled || !resctrl_is_mbm_event(mevt->evtid))
			continue;

		sep = false;
		seq_printf(s, "%s:", mevt->name);
		list_for_each_entry(d, &r->mon_domains, hdr.list) {
			if (sep)
				seq_putc(s, ';');

			if (mbm_cntr_get(r, d, rdtgrp, mevt->evtid) < 0)
				seq_printf(s, "%d=_", d->hdr.id);
			else
				seq_printf(s, "%d=e", d->hdr.id);

			sep = true;
		}
		seq_putc(s, '\n');
	}

out_unlock:
	rdtgroup_kn_unlock(of->kn);

	return ret;
}

/*
 * mbm_get_mon_event_by_name() - Return the mon_evt entry for the matching
 * event name.
 */
static struct mon_evt *mbm_get_mon_event_by_name(struct rdt_resource *r, char *name)
{
	struct mon_evt *mevt;

	for_each_mon_event(mevt) {
		if (mevt->rid == r->rid && mevt->enabled &&
		    resctrl_is_mbm_event(mevt->evtid) &&
		    !strcmp(mevt->name, name))
			return mevt;
	}

	return NULL;
}

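/*
 * rdtgroup_modify_assign_state() - Apply a single-character assignment state:
 * 'e' assigns a counter for @mevt to @rdtgrp, '_' unassigns it. A NULL @d
 * applies the change to all monitoring domains.
 */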
static int rdtgroup_modify_assign_state(char *assign, struct rdt_mon_domain *d,
					struct rdtgroup *rdtgrp, struct mon_evt *mevt)
{
	int ret = 0;

	if (!assign || strlen(assign) != 1)
		return -EINVAL;

	switch (*assign) {
	case 'e':
		ret = rdtgroup_assign_cntr_event(d, rdtgrp, mevt);
		break;
	case '_':
		rdtgroup_unassign_cntr_event(d, rdtgrp, mevt);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

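/*
 * resctrl_parse_mbm_assignment() - Parse and apply the assignment requests
 * for one event. @tok holds "<domain id>=<state>" pairs separated by ';';
 * a domain id of '*' applies the state to all domains.
 */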
static int resctrl_parse_mbm_assignment(struct rdt_resource *r, struct rdtgroup *rdtgrp,
					char *event, char *tok)
{
	struct rdt_mon_domain *d;
	unsigned long dom_id = 0;
	char *dom_str, *id_str;
	struct mon_evt *mevt;
	int ret;

	mevt = mbm_get_mon_event_by_name(r, event);
	if (!mevt) {
		rdt_last_cmd_printf("Invalid event %s\n", event);
		return -ENOENT;
	}

next:
	if (!tok || tok[0] == '\0')
		return 0;

	/* Start processing the strings for each domain */
	dom_str = strim(strsep(&tok, ";"));

	id_str = strsep(&dom_str, "=");

	/* Check for domain id '*' which means all domains */
	if (id_str && *id_str == '*') {
		ret = rdtgroup_modify_assign_state(dom_str, NULL, rdtgrp, mevt);
		if (ret)
			rdt_last_cmd_printf("Assign operation '%s:*=%s' failed\n",
					    event, dom_str);
		return ret;
	} else if (!id_str || kstrtoul(id_str, 10, &dom_id)) {
		rdt_last_cmd_puts("Missing domain id\n");
		return -EINVAL;
	}

	/* Verify if the dom_id is valid */
	list_for_each_entry(d, &r->mon_domains, hdr.list) {
		if (d->hdr.id == dom_id) {
			ret = rdtgroup_modify_assign_state(dom_str, d, rdtgrp, mevt);
			if (ret) {
				rdt_last_cmd_printf("Assign operation '%s:%ld=%s' failed\n",
						    event, dom_id, dom_str);
				return ret;
			}
			goto next;
		}
	}

	rdt_last_cmd_printf("Invalid domain id %ld\n", dom_id);
	return -EINVAL;
}

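/*
 * mbm_L3_assignments_write() - Update the counter assignments for the
 * resource group. Each line of @buf holds one request of the form
 * "<event>:<domain id>=<state>[;<domain id>=<state>...]".
 */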
ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf,
				 size_t nbytes, loff_t off)
{
	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
	struct rdtgroup *rdtgrp;
	char *token, *event;
	int ret = 0;

	/* Valid input requires a trailing newline */
	if (nbytes == 0 || buf[nbytes - 1] != '\n')
		return -EINVAL;

	buf[nbytes - 1] = '\0';

	rdtgrp = rdtgroup_kn_lock_live(of->kn);
	if (!rdtgrp) {
		rdtgroup_kn_unlock(of->kn);
		return -ENOENT;
	}
	rdt_last_cmd_clear();

	if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
		rdt_last_cmd_puts("mbm_event mode is not enabled\n");
		rdtgroup_kn_unlock(of->kn);
		return -EINVAL;
	}

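	/*
	 * For example (domain ids are illustrative), writing
	 * "mbm_total_bytes:0=e;1=_\n" requests a counter for mbm_total_bytes
	 * in domain 0 and releases any counter assigned in domain 1.
	 */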
	while ((token = strsep(&buf, "\n")) != NULL) {
		/*
		 * Each write command has the format:
		 * "<Event>:<Domain ID>=<Assignment state>"
		 * Extract the event name first.
		 */
		event = strsep(&token, ":");

		ret = resctrl_parse_mbm_assignment(r, rdtgrp, event, token);
		if (ret)
			break;
	}

	rdtgroup_kn_unlock(of->kn);

	return ret ?: nbytes;
}

/**
 * resctrl_mon_resource_init() - Initialise global monitoring structures.
 *
 * Allocate and initialise global monitor resources that do not belong to a
 * specific domain, i.e. the rmid_ptrs[] used for the limbo and free lists.
 * Called once during boot after the struct rdt_resource's have been configured
 * but before the filesystem is mounted.
 * Resctrl's cpuhp callbacks may be called before this point to bring a domain
 * online.
 *
 * Returns 0 for success, or -ENOMEM.
 */
int resctrl_mon_resource_init(void)
{
	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
	int ret;

	if (!r->mon_capable)
		return 0;

	ret = dom_data_init(r);
	if (ret)
		return ret;

	if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) {
		mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].configurable = true;
		resctrl_file_fflags_init("mbm_total_bytes_config",
					 RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
	}
	if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) {
		mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].configurable = true;
		resctrl_file_fflags_init("mbm_local_bytes_config",
					 RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
	}

	if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
		mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID;
	else if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
		mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;

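	/*
	 * Expose the counter assignment interface files only when the
	 * hardware provides assignable MBM counters.
	 */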
	if (r->mon.mbm_cntr_assignable) {
		if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
			mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask;
		if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
			mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask &
									   (READS_TO_LOCAL_MEM |
									    READS_TO_LOCAL_S_MEM |
									    NON_TEMP_WRITE_TO_LOCAL_MEM);
		r->mon.mbm_assign_on_mkdir = true;
		resctrl_file_fflags_init("num_mbm_cntrs",
					 RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
		resctrl_file_fflags_init("available_mbm_cntrs",
					 RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
		resctrl_file_fflags_init("event_filter", RFTYPE_ASSIGN_CONFIG);
		resctrl_file_fflags_init("mbm_assign_on_mkdir", RFTYPE_MON_INFO |
					 RFTYPE_RES_CACHE);
		resctrl_file_fflags_init("mbm_L3_assignments", RFTYPE_MON_BASE);
	}

	return 0;
}

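/*
 * resctrl_mon_resource_exit() - Free the global monitoring structures
 * allocated by resctrl_mon_resource_init().
 */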
void resctrl_mon_resource_exit(void)
{
	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);

	dom_data_exit(r);
}