1// SPDX-License-Identifier: GPL-2.0
2#include <trace/syscall.h>
3#include <trace/events/syscalls.h>
4#include <linux/kernel_stat.h>
5#include <linux/syscalls.h>
6#include <linux/slab.h>
7#include <linux/kernel.h>
8#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
9#include <linux/ftrace.h>
10#include <linux/perf_event.h>
11#include <linux/xarray.h>
12#include <asm/syscall.h>
13
14#include "trace_output.h"
15#include "trace.h"
16
/* Serializes tracepoint (un)registration and the fault-buffer refcount */
static DEFINE_MUTEX(syscall_trace_lock);

/* TRACE_REG_* callbacks for the syscall enter/exit trace event classes */
static int syscall_enter_register(struct trace_event_call *event,
				  enum trace_reg type, void *data);
static int syscall_exit_register(struct trace_event_call *event,
				 enum trace_reg type, void *data);
23
24static struct list_head *
25syscall_get_enter_fields(struct trace_event_call *call)
26{
27 struct syscall_metadata *entry = call->data;
28
29 return &entry->enter_fields;
30}
31
/* Bounds of the linker section holding the syscall metadata pointers */
extern struct syscall_metadata *__start_syscalls_metadata[];
extern struct syscall_metadata *__stop_syscalls_metadata[];

/* Archs with sparse syscall numbers map nr -> metadata via this xarray */
static DEFINE_XARRAY(syscalls_metadata_sparse);
/* Dense archs use a flat table indexed directly by syscall number */
static struct syscall_metadata **syscalls_metadata;
37
#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
/*
 * Compare a kallsyms symbol against a syscall name, skipping the first
 * three characters ("sys") of both.  Archs that use syscall wrappers may
 * alias the symbol as ".SyS" or ".sys" instead of "sys", so including
 * the prefix in the comparison would produce spurious mismatches.
 */
static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
{
	return strcmp(&sym[3], &name[3]) == 0;
}
#endif
50
#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
/*
 * Some architectures that allow for 32bit applications
 * to run on a 64bit kernel, do not map the syscalls for
 * the 32bit tasks the same as they do for 64bit tasks.
 *
 * *cough*x86*cough*
 *
 * In such a case, instead of reporting the wrong syscalls,
 * simply ignore them.
 *
 * For an arch to ignore the compat syscalls it needs to
 * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as
 * define the function arch_trace_is_compat_syscall() to let
 * the tracing system know that it should ignore it.
 */
static int
trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
{
	/* Compat syscall numbers would map to the wrong metadata; skip */
	if (unlikely(arch_trace_is_compat_syscall(regs)))
		return -1;

	return syscall_get_nr(task, regs);
}
#else
/* No compat mapping issue on this arch: report the number as-is */
static inline int
trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
{
	return syscall_get_nr(task, regs);
}
#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */
82
/*
 * Map a syscall entry address to its metadata by resolving the address
 * to a symbol name and scanning the syscalls metadata section for a
 * matching entry.  Returns NULL for unknown or unimplemented syscalls.
 */
static __init struct syscall_metadata *
find_syscall_meta(unsigned long syscall)
{
	struct syscall_metadata **start;
	struct syscall_metadata **stop;
	char str[KSYM_SYMBOL_LEN];


	start = __start_syscalls_metadata;
	stop = __stop_syscalls_metadata;
	/* Resolve the handler address to its symbol name */
	kallsyms_lookup(syscall, NULL, NULL, NULL, str);

	/* Unimplemented syscalls all resolve to sys_ni_syscall */
	if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
		return NULL;

	for ( ; start < stop; start++) {
		if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
			return *start;
	}
	return NULL;
}
104
/*
 * Look up the metadata for syscall number @nr.  Sparse-number archs go
 * through the xarray; dense archs index the flat table.  Returns NULL
 * for out-of-range numbers or before the table is initialized.
 */
static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
	if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR))
		return xa_load(&syscalls_metadata_sparse, (unsigned long)nr);

	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
		return NULL;

	return syscalls_metadata[nr];
}
115
116const char *get_syscall_name(int syscall)
117{
118 struct syscall_metadata *entry;
119
120 entry = syscall_nr_to_meta(nr: syscall);
121 if (!entry)
122 return NULL;
123
124 return entry->name;
125}
126
/* Added to user strings or arrays when max limit is reached */
#define EXTRA "..."

/*
 * Locate the user space data recorded for one dynamic field of a
 * syscall enter event.
 *
 * Each faulted-in user argument has a 4-byte meta field stored right
 * after the static syscall arguments.  Its value encodes
 * (len << 16 | offset), where offset is from the start of the event.
 *
 * @offset_p: in/out - byte offset of the meta field to read relative to
 *            the end of the static args; advanced by 4 on return.
 * @len_p:    out - length of the recorded user data.
 * @ptr_p:    out - pointer to the user data inside the event.
 */
static void get_dynamic_len_ptr(struct syscall_trace_enter *trace,
				struct syscall_metadata *entry,
				int *offset_p, int *len_p, unsigned char **ptr_p)
{
	unsigned char *ptr;
	int offset = *offset_p;
	int val;

	/* This arg points to a user space string */
	ptr = (void *)trace->args + sizeof(long) * entry->nb_args + offset;
	val = *(int *)ptr;

	/* The value is a dynamic string (len << 16 | offset) */
	ptr = (void *)trace + (val & 0xffff);
	*len_p = val >> 16;
	offset += 4;

	*ptr_p = ptr;
	*offset_p = offset;
}
150
/*
 * Specialized output for sys_enter_openat: decodes the flags argument
 * symbolically and prints the mode in octal.  The mode argument is
 * skipped when neither O_CREAT nor O_TMPFILE is set, since the kernel
 * ignores mode in that case.
 */
static enum print_line_t
sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadata *entry,
		       struct trace_seq *s, struct trace_event *event)
{
	unsigned char *ptr;
	int offset = 0;
	int bits, len;
	bool done = false;
	/*
	 * Flag table for trace_print_flags_seq(); NULL terminated.
	 * O_TMPFILE is listed first as it overlaps O_DIRECTORY bits.
	 */
	static const struct trace_print_flags __flags[] =
	{
		{ O_TMPFILE, "O_TMPFILE" },
		{ O_WRONLY, "O_WRONLY" },
		{ O_RDWR, "O_RDWR" },
		{ O_CREAT, "O_CREAT" },
		{ O_EXCL, "O_EXCL" },
		{ O_NOCTTY, "O_NOCTTY" },
		{ O_TRUNC, "O_TRUNC" },
		{ O_APPEND, "O_APPEND" },
		{ O_NONBLOCK, "O_NONBLOCK" },
		{ O_DSYNC, "O_DSYNC" },
		{ O_DIRECT, "O_DIRECT" },
		{ O_LARGEFILE, "O_LARGEFILE" },
		{ O_DIRECTORY, "O_DIRECTORY" },
		{ O_NOFOLLOW, "O_NOFOLLOW" },
		{ O_NOATIME, "O_NOATIME" },
		{ O_CLOEXEC, "O_CLOEXEC" },
		{ -1, NULL }
	};

	trace_seq_printf(s, "%s(", entry->name);

	for (int i = 0; !done && i < entry->nb_args; i++) {

		if (trace_seq_has_overflowed(s))
			goto end;

		if (i)
			trace_seq_puts(s, ", ");

		switch (i) {
		case 2: /* flags argument */
			bits = trace->args[2];

			trace_seq_puts(s, "flags: ");

			/* No need to show mode when not creating the file */
			if (!(bits & (O_CREAT|O_TMPFILE)))
				done = true;

			/* O_RDONLY is zero, so it has no bit of its own */
			if (!(bits & O_ACCMODE)) {
				if (!bits) {
					trace_seq_puts(s, "O_RDONLY");
					continue;
				}
				trace_seq_puts(s, "O_RDONLY|");
			}

			trace_print_flags_seq(s, "|", bits, __flags);
			/*
			 * trace_print_flags_seq() adds a '\0' to the
			 * buffer, but this needs to append more to the seq.
			 */
			if (!trace_seq_has_overflowed(s))
				trace_seq_pop(s);

			continue;
		case 3: /* mode argument, shown in octal */
			trace_seq_printf(s, "%s: 0%03o", entry->args[i],
					 (unsigned int)trace->args[i]);
			continue;
		}

		trace_seq_printf(s, "%s: %lu", entry->args[i],
				 trace->args[i]);

		/* Append the recorded user space data (the filename) */
		if (!(BIT(i) & entry->user_mask))
			continue;

		get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);
		trace_seq_printf(s, " \"%.*s\"", len, ptr);
	}

	trace_seq_putc(s, ')');
end:
	trace_seq_putc(s, '\n');

	return trace_handle_return(s);
}
239
/*
 * Output handler for syscall enter events.  Prints each argument by
 * name, then appends any recorded user space data: strings verbatim,
 * sized buffers as a hex dump plus (when any byte is printable) a
 * printable-character rendering.
 */
static enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
		    struct trace_event *event)
{
	struct trace_array *tr = iter->tr;
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, syscall, val, len;
	unsigned char *ptr;
	int offset = 0;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry)
		goto end;

	/* The entry type must match the metadata's enter event */
	if (entry->enter_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		goto end;
	}

	/* Some syscalls get specialized output unless verbose is set */
	switch (entry->syscall_nr) {
	case __NR_openat:
		if (!tr || !(tr->trace_flags & TRACE_ITER(VERBOSE)))
			return sys_enter_openat_print(trace, entry, s, event);
		break;
	default:
		break;
	}

	trace_seq_printf(s, "%s(", entry->name);

	for (i = 0; i < entry->nb_args; i++) {
		bool printable = false;
		char *str;

		if (trace_seq_has_overflowed(s))
			goto end;

		if (i)
			trace_seq_puts(s, ", ");

		/* parameter types */
		if (tr && tr->trace_flags & TRACE_ITER(VERBOSE))
			trace_seq_printf(s, "%s ", entry->types[i]);

		/* parameter values: small ones in decimal, the rest in hex */
		if (trace->args[i] < 10)
			trace_seq_printf(s, "%s: %lu", entry->args[i],
					 trace->args[i]);
		else
			trace_seq_printf(s, "%s: 0x%lx", entry->args[i],
					 trace->args[i]);

		/* Only args flagged in user_mask have recorded data */
		if (!(BIT(i) & entry->user_mask))
			continue;

		get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);

		/* Strings (or sizeless args) are recorded NUL-terminated */
		if (entry->user_arg_size < 0 || entry->user_arg_is_str) {
			trace_seq_printf(s, " \"%.*s\"", len, ptr);
			continue;
		}

		/* The requested size lives in another syscall argument */
		val = trace->args[entry->user_arg_size];

		str = ptr;
		trace_seq_puts(s, " (");
		for (int x = 0; x < len; x++, ptr++) {
			if (isascii(*ptr) && isprint(*ptr))
				printable = true;
			if (x)
				trace_seq_putc(s, ':');
			trace_seq_printf(s, "%02x", *ptr);
		}
		/* Recorded less than the syscall asked for: mark truncation */
		if (len < val)
			trace_seq_printf(s, ", %s", EXTRA);

		trace_seq_putc(s, ')');

		/* If nothing is printable, don't bother printing anything */
		if (!printable)
			continue;

		trace_seq_puts(s, " \"");
		for (int x = 0; x < len; x++) {
			if (isascii(str[x]) && isprint(str[x]))
				trace_seq_putc(s, str[x]);
			else
				trace_seq_putc(s, '.');
		}
		if (len < val)
			trace_seq_printf(s, "\"%s", EXTRA);
		else
			trace_seq_putc(s, '"');
	}

	trace_seq_putc(s, ')');
end:
	trace_seq_putc(s, '\n');

	return trace_handle_return(s);
}
347
348static enum print_line_t
349print_syscall_exit(struct trace_iterator *iter, int flags,
350 struct trace_event *event)
351{
352 struct trace_seq *s = &iter->seq;
353 struct trace_entry *ent = iter->ent;
354 struct syscall_trace_exit *trace;
355 int syscall;
356 struct syscall_metadata *entry;
357
358 trace = (typeof(trace))ent;
359 syscall = trace->nr;
360 entry = syscall_nr_to_meta(nr: syscall);
361
362 if (!entry) {
363 trace_seq_putc(s, c: '\n');
364 goto out;
365 }
366
367 if (entry->exit_event->event.type != ent->type) {
368 WARN_ON_ONCE(1);
369 return TRACE_TYPE_UNHANDLED;
370 }
371
372 trace_seq_printf(s, fmt: "%s -> 0x%lx\n", entry->name,
373 trace->ret);
374
375 out:
376 return trace_handle_return(s);
377}
378
/* Static initializer for a syscall trace event field descriptor */
#define SYSCALL_FIELD(_type, _name) { \
	.type = #_type, .name = #_name, \
	.size = sizeof(_type), .align = __alignof__(_type), \
	.is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }
383
/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)

/*
 * Build the hand-crafted print_fmt for sys_enter_openat, mirroring the
 * runtime output of sys_enter_openat_print(): filename as a string,
 * flags decoded via __print_flags() (with the zero-valued O_RDONLY
 * handled explicitly), and mode in octal.  With len == 0 only the
 * needed length is computed.  Returns the format length.
 */
static int __init
sys_enter_openat_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	int pos = 0;

	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"\"dfd: 0x%%08lx, filename: 0x%%08lx \\\"%%s\\\", flags: %%s%%s, mode: 0%%03o\",");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" ((unsigned long)(REC->dfd)),");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" ((unsigned long)(REC->filename)),");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" __get_str(__filename_val),");
	/* O_RDONLY is zero: prepend it by hand when no access bits are set */
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" (REC->flags & ~3) && !(REC->flags & 3) ? \"O_RDONLY|\" : \"\", ");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" REC->flags ? __print_flags(REC->flags, \"|\", ");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_WRONLY\" }, ", O_WRONLY);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_RDWR\" }, ", O_RDWR);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_CREAT\" }, ", O_CREAT);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_EXCL\" }, ", O_EXCL);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NOCTTY\" }, ", O_NOCTTY);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_TRUNC\" }, ", O_TRUNC);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_APPEND\" }, ", O_APPEND);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NONBLOCK\" }, ", O_NONBLOCK);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_DSYNC\" }, ", O_DSYNC);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_DIRECT\" }, ", O_DIRECT);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_LARGEFILE\" }, ", O_LARGEFILE);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_DIRECTORY\" }, ", O_DIRECTORY);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NOFOLLOW\" }, ", O_NOFOLLOW);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NOATIME\" }, ", O_NOATIME);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_CLOEXEC\" }) : \"O_RDONLY\", ", O_CLOEXEC);

	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" ((unsigned long)(REC->mode))");
	return pos;
}
439
/*
 * Build the print_fmt string for a syscall enter event into @buf.
 * With len == 0 (and buf == NULL) only the needed length is computed.
 * Returns the length of the format, excluding the trailing NUL.
 */
static int __init
__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	bool is_string = entry->user_arg_is_str;
	int i;
	int pos = 0;

	/* Some syscalls get a hand-crafted format instead */
	switch (entry->syscall_nr) {
	case __NR_openat:
		return sys_enter_openat_print_fmt(entry, buf, len);
	default:
		break;
	}

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
	for (i = 0; i < entry->nb_args; i++) {
		if (i)
			pos += snprintf(buf + pos, LEN_OR_ZERO, ", ");
		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx",
				entry->args[i], sizeof(unsigned long));

		if (!(BIT(i) & entry->user_mask))
			continue;

		/* Add the format for the user space string or array */
		if (entry->user_arg_size < 0 || is_string)
			pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\"");
		else
			pos += snprintf(buf + pos, LEN_OR_ZERO, " (%%s)");
	}
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO,
				", ((unsigned long)(REC->%s))", entry->args[i]);
		if (!(BIT(i) & entry->user_mask))
			continue;
		/* The user space data for arg has name __<arg>_val */
		if (entry->user_arg_size < 0 || is_string) {
			pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)",
					entry->args[i]);
		} else {
			pos += snprintf(buf + pos, LEN_OR_ZERO, ", __print_dynamic_array(__%s_val, 1)",
					entry->args[i]);
		}
	}

#undef LEN_OR_ZERO

	/* return the length of print_fmt */
	return pos;
}
492
493static int __init set_syscall_print_fmt(struct trace_event_call *call)
494{
495 char *print_fmt;
496 int len;
497 struct syscall_metadata *entry = call->data;
498
499 if (entry->enter_event != call) {
500 call->print_fmt = "\"0x%lx\", REC->ret";
501 return 0;
502 }
503
504 /* First: called with 0 length to calculate the needed length */
505 len = __set_enter_print_fmt(entry, NULL, len: 0);
506
507 print_fmt = kmalloc(len + 1, GFP_KERNEL);
508 if (!print_fmt)
509 return -ENOMEM;
510
511 /* Second: actually write the @print_fmt */
512 __set_enter_print_fmt(entry, buf: print_fmt, len: len + 1);
513 call->print_fmt = print_fmt;
514
515 return 0;
516}
517
518static void __init free_syscall_print_fmt(struct trace_event_call *call)
519{
520 struct syscall_metadata *entry = call->data;
521
522 if (entry->enter_event == call)
523 kfree(objp: call->print_fmt);
524}
525
/*
 * Register the filter/format fields of a syscall enter event: one
 * unsigned long per syscall argument, plus a "__data_loc char[]" field
 * named "__<arg>_val" for every argument whose user space data is
 * recorded (per the metadata's user_mask).
 */
static int __init syscall_enter_define_fields(struct trace_event_call *call)
{
	struct syscall_trace_enter trace;
	struct syscall_metadata *meta = call->data;
	unsigned long mask;
	char *arg;
	int offset = offsetof(typeof(trace), args);
	int ret = 0;
	int len;
	int i;

	/* One field per static syscall argument */
	for (i = 0; i < meta->nb_args; i++) {
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);
		if (ret)
			break;
		offset += sizeof(unsigned long);
	}

	if (ret || !meta->user_mask)
		return ret;

	mask = meta->user_mask;

	while (mask) {
		int idx = ffs(mask) - 1;
		mask &= ~BIT(idx);

		/*
		 * User space data is faulted into a temporary buffer and then
		 * added as a dynamic string or array to the end of the event.
		 * The user space data name for the arg pointer is
		 * "__<arg>_val".
		 */
		len = strlen(meta->args[idx]) + sizeof("___val");
		/*
		 * NOTE(review): @arg is not freed on success — presumably
		 * the field keeps referencing it for the event's lifetime;
		 * confirm trace_define_field() does not copy the name.
		 */
		arg = kmalloc(len, GFP_KERNEL);
		if (WARN_ON_ONCE(!arg)) {
			/* Degrade gracefully: stop recording user data */
			meta->user_mask = 0;
			return -ENOMEM;
		}

		snprintf(arg, len, "__%s_val", meta->args[idx]);

		ret = trace_define_field(call, "__data_loc char[]",
					 arg, offset, sizeof(int), 0,
					 FILTER_OTHER);
		if (ret) {
			kfree(arg);
			break;
		}
		/* 4 bytes of meta data (offset | len << 16) per user arg */
		offset += 4;
	}
	return ret;
}
582
583/*
584 * Create a per CPU temporary buffer to copy user space pointers into.
585 *
586 * SYSCALL_FAULT_USER_MAX is the amount to copy from user space.
587 * (defined in kernel/trace/trace.h)
588
589 * SYSCALL_FAULT_ARG_SZ is the amount to copy from user space plus the
590 * nul terminating byte and possibly appended EXTRA (4 bytes).
591 *
592 * SYSCALL_FAULT_BUF_SZ holds the size of the per CPU buffer to use
593 * to copy memory from user space addresses into that will hold
594 * 3 args as only 3 args are allowed to be copied from system calls.
595 */
596#define SYSCALL_FAULT_ARG_SZ (SYSCALL_FAULT_USER_MAX + 1 + 4)
597#define SYSCALL_FAULT_MAX_CNT 3
598#define SYSCALL_FAULT_BUF_SZ (SYSCALL_FAULT_ARG_SZ * SYSCALL_FAULT_MAX_CNT)
599
600/* Use the tracing per CPU buffer infrastructure to copy from user space */
601struct syscall_user_buffer {
602 struct trace_user_buf_info buf;
603 struct rcu_head rcu;
604};
605
606static struct syscall_user_buffer *syscall_buffer;
607
608static int syscall_fault_buffer_enable(void)
609{
610 struct syscall_user_buffer *sbuf;
611 int ret;
612
613 lockdep_assert_held(&syscall_trace_lock);
614
615 if (syscall_buffer) {
616 trace_user_fault_get(tinfo: &syscall_buffer->buf);
617 return 0;
618 }
619
620 sbuf = kmalloc(sizeof(*sbuf), GFP_KERNEL);
621 if (!sbuf)
622 return -ENOMEM;
623
624 ret = trace_user_fault_init(tinfo: &sbuf->buf, SYSCALL_FAULT_BUF_SZ);
625 if (ret < 0) {
626 kfree(objp: sbuf);
627 return ret;
628 }
629
630 WRITE_ONCE(syscall_buffer, sbuf);
631
632 return 0;
633}
634
635static void rcu_free_syscall_buffer(struct rcu_head *rcu)
636{
637 struct syscall_user_buffer *sbuf =
638 container_of(rcu, struct syscall_user_buffer, rcu);
639
640 trace_user_fault_destroy(tinfo: &sbuf->buf);
641 kfree(objp: sbuf);
642}
643
644
/*
 * Drop a reference on the user-copy buffer.  When the last reference is
 * dropped the buffer is unpublished and freed after an RCU tasks trace
 * grace period, so in-flight syscall probes can finish using it.
 * Must be called with syscall_trace_lock held (paired with a prior
 * successful syscall_fault_buffer_enable(), so syscall_buffer is set).
 */
static void syscall_fault_buffer_disable(void)
{
	struct syscall_user_buffer *sbuf = syscall_buffer;

	lockdep_assert_held(&syscall_trace_lock);

	/* Non-zero return means other users still hold references */
	if (trace_user_fault_put(&sbuf->buf))
		return;

	WRITE_ONCE(syscall_buffer, NULL);
	call_rcu_tasks_trace(&sbuf->rcu, rcu_free_syscall_buffer);
}
657
/* Per-call bookkeeping for copying user space syscall arguments */
struct syscall_args {
	char *ptr_array[SYSCALL_FAULT_MAX_CNT];	/* user pointers to read */
	int read[SYSCALL_FAULT_MAX_CNT];	/* bytes read per arg (negative on fault) */
	int uargs;				/* number of valid entries above */
};
663
664static int syscall_copy_user(char *buf, const char __user *ptr,
665 size_t size, void *data)
666{
667 struct syscall_args *args = data;
668 int ret;
669
670 for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
671 ptr = (char __user *)args->ptr_array[i];
672 ret = strncpy_from_user(dst: buf, src: ptr, count: size);
673 args->read[i] = ret;
674 }
675 return 0;
676}
677
678static int syscall_copy_user_array(char *buf, const char __user *ptr,
679 size_t size, void *data)
680{
681 struct syscall_args *args = data;
682 int ret;
683
684 for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
685 ptr = (char __user *)args->ptr_array[i];
686 ret = __copy_from_user(to: buf, from: ptr, n: size);
687 args->read[i] = ret ? -1 : size;
688 }
689 return 0;
690}
691
/*
 * Fault in the user space data referenced by the syscall arguments
 * flagged in @sys_data->user_mask.
 *
 * @buf_size:  per-instance limit on how much to record (0 = nothing).
 * @args:      the syscall's arguments.
 * @data_size: out - bytes recorded per user arg; slots beyond the last
 *             user arg are set to -1.  Slots for faulted reads are left
 *             untouched (callers pre-initialize them to zero).
 *
 * Returns the filled per CPU buffer (one SYSCALL_FAULT_ARG_SZ slot per
 * user arg) or NULL if nothing was recorded.
 */
static char *sys_fault_user(unsigned int buf_size,
			    struct syscall_metadata *sys_data,
			    struct syscall_user_buffer *sbuf,
			    unsigned long *args,
			    unsigned int data_size[SYSCALL_FAULT_MAX_CNT])
{
	trace_user_buf_copy syscall_copy = syscall_copy_user;
	unsigned long mask = sys_data->user_mask;
	unsigned long size = SYSCALL_FAULT_ARG_SZ - 1;
	struct syscall_args sargs;
	bool array = false;
	char *buffer;
	char *buf;
	int ret;
	int i = 0;

	/* The extra is appended to the user data in the buffer */
	BUILD_BUG_ON(SYSCALL_FAULT_USER_MAX + sizeof(EXTRA) >=
		     SYSCALL_FAULT_ARG_SZ);

	/*
	 * If this system call event has a size argument, use
	 * it to define how much of user space memory to read,
	 * and read it as an array and not a string.
	 */
	if (sys_data->user_arg_size >= 0) {
		array = true;
		size = args[sys_data->user_arg_size];
		if (size > SYSCALL_FAULT_ARG_SZ - 1)
			size = SYSCALL_FAULT_ARG_SZ - 1;
		syscall_copy = syscall_copy_user_array;
	}

	/* Collect the user pointers named by user_mask */
	while (mask) {
		int idx = ffs(mask) - 1;
		mask &= ~BIT(idx);

		if (WARN_ON_ONCE(i == SYSCALL_FAULT_MAX_CNT))
			break;

		/* Get the pointer to user space memory to read */
		sargs.ptr_array[i++] = (char *)args[idx];
	}

	sargs.uargs = i;

	/* Clear the values that are not used */
	for (; i < SYSCALL_FAULT_MAX_CNT; i++) {
		data_size[i] = -1; /* Denotes no pointer */
	}

	/* A zero size means do not even try */
	if (!buf_size)
		return NULL;

	buffer = trace_user_fault_read(&sbuf->buf, NULL, size,
				       syscall_copy, &sargs);
	if (!buffer)
		return NULL;

	/* Post-process each slot: terminate, sanitize, and size-limit */
	buf = buffer;
	for (i = 0; i < sargs.uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {

		ret = sargs.read[i];
		if (ret < 0)
			continue;
		buf[ret] = '\0';

		/* For strings, replace any non-printable characters with '.' */
		if (!array) {
			for (int x = 0; x < ret; x++) {
				if (!isprint(buf[x]))
					buf[x] = '.';
			}

			size = min(buf_size, SYSCALL_FAULT_USER_MAX);

			/*
			 * If the text was truncated due to our max limit,
			 * add "..." to the string.
			 */
			if (ret > size) {
				strscpy(buf + size, EXTRA, sizeof(EXTRA));
				ret = size + sizeof(EXTRA);
			} else {
				/* recorded size includes the terminating NUL */
				buf[ret++] = '\0';
			}
		} else {
			ret = min((unsigned int)ret, buf_size);
		}
		data_size[i] = ret;
	}

	return buffer;
}
787
/*
 * Fault in the user space data for a syscall and compute the extra
 * ring buffer space it needs (@size is incremented in place).
 * Returns 0 on success, -1 when tracing is shutting down (no fault
 * buffer available).
 */
static int
syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args,
		 char **buffer, int *size, int *user_sizes, int *uargs,
		 int buf_size)
{
	struct syscall_user_buffer *sbuf;
	int i;

	/* If the syscall_buffer is NULL, tracing is being shutdown */
	sbuf = READ_ONCE(syscall_buffer);
	if (!sbuf)
		return -1;

	*buffer = sys_fault_user(buf_size, sys_data, sbuf, args, user_sizes);
	/*
	 * user_size is the amount of data to append.
	 * Need to add 4 for the meta field that points to
	 * the user memory at the end of the event and also
	 * stores its size.
	 */
	for (i = 0; i < SYSCALL_FAULT_MAX_CNT; i++) {
		if (user_sizes[i] < 0)
			break;
		*size += user_sizes[i] + 4;
	}
	/* Save the number of user read arguments of this syscall */
	*uargs = i;
	return 0;
}
817
/*
 * Copy the faulted-in user space data from @buffer into the reserved
 * ring buffer event.  For each recorded arg, a 4-byte meta field right
 * after the static args encodes (size << 16 | offset) where offset is
 * from the start of the event; the data itself follows the meta fields.
 */
static void syscall_put_data(struct syscall_metadata *sys_data,
			     struct syscall_trace_enter *entry,
			     char *buffer, int size, int *user_sizes, int uargs)
{
	char *buf = buffer;
	void *ptr;
	int val;

	/*
	 * Set the pointer to point to the meta data of the event
	 * that has information about the stored user space memory.
	 */
	ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;

	/*
	 * The meta data will store the offset of the user data from
	 * the beginning of the event. That is after the static arguments
	 * and the meta data fields.
	 */
	val = (ptr - (void *)entry) + 4 * uargs;

	for (int i = 0; i < uargs; i++) {

		/* Each data blob starts where the previous one ended */
		if (i)
			val += user_sizes[i - 1];

		/* Store the offset and the size into the meta data */
		*(int *)ptr = val | (user_sizes[i] << 16);

		/* Skip the meta data */
		ptr += 4;
	}

	/* Now append the data blobs themselves, one buffer slot per arg */
	for (int i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
		/* Nothing to do if the user space was empty or faulted */
		if (!user_sizes[i])
			continue;

		memcpy(ptr, buf, user_sizes[i]);
		ptr += user_sizes[i];
	}
}
860
/*
 * sys_enter tracepoint probe: records a syscall enter event for the
 * trace_array passed in @data, including any user space data the
 * syscall metadata flags for recording.
 */
static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
	struct trace_array *tr = data;
	struct trace_event_file *trace_file;
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct trace_event_buffer fbuffer;
	unsigned long args[6];
	char *user_ptr;
	int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
	int syscall_nr;
	int size = 0;
	int uargs = 0;
	bool mayfault;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;

	/* NULL when this event is not enabled on @tr */
	trace_file = READ_ONCE(tr->enter_syscall_files[syscall_nr]);
	if (!trace_file)
		return;

	if (trace_trigger_soft_disabled(trace_file))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* Check if this syscall event faults in user space memory */
	mayfault = sys_data->user_mask != 0;

	guard(preempt_notrace)();

	syscall_get_arguments(current, regs, args);

	if (mayfault) {
		/* Fails only when tracing is being shut down */
		if (syscall_get_data(sys_data, args, &user_ptr,
				     &size, user_sizes, &uargs, tr->syscall_buf_sz) < 0)
			return;
	}

	/* Event size: header + static args + user data (already in @size) */
	size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

	entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
	if (!entry)
		return;

	entry = ring_buffer_event_data(fbuffer.event);
	entry->nr = syscall_nr;

	memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);

	if (mayfault)
		syscall_put_data(sys_data, entry, user_ptr, size, user_sizes, uargs);

	trace_event_buffer_commit(&fbuffer);
}
926
/*
 * sys_exit tracepoint probe: records a syscall exit event (syscall
 * number and return value) for the trace_array passed in @data.
 */
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
{
	struct trace_array *tr = data;
	struct trace_event_file *trace_file;
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct trace_event_buffer fbuffer;
	int syscall_nr;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();
	guard(preempt_notrace)();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;

	/* NULL when this event is not enabled on @tr */
	trace_file = READ_ONCE(tr->exit_syscall_files[syscall_nr]);
	if (!trace_file)
		return;

	if (trace_trigger_soft_disabled(trace_file))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry));
	if (!entry)
		return;

	entry = ring_buffer_event_data(fbuffer.event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

	trace_event_buffer_commit(&fbuffer);
}
968
/*
 * Enable the syscall enter event for @file's trace array.  Takes a
 * reference on the fault buffer when the event records user space
 * data, and registers the shared probe on first use.
 * Returns 0 on success or a negative errno.
 */
static int reg_event_syscall_enter(struct trace_event_file *file,
				   struct trace_event_call *call)
{
	struct syscall_metadata *sys_data = call->data;
	struct trace_array *tr = file->tr;
	int ret = 0;
	int num;

	num = sys_data->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return -ENOSYS;
	guard(mutex)(&syscall_trace_lock);
	if (sys_data->user_mask) {
		ret = syscall_fault_buffer_enable();
		if (ret < 0)
			return ret;
	}
	if (!tr->sys_refcount_enter) {
		ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
		if (ret < 0) {
			/* Drop the fault-buffer reference taken above */
			if (sys_data->user_mask)
				syscall_fault_buffer_disable();
			return ret;
		}
	}
	WRITE_ONCE(tr->enter_syscall_files[num], file);
	tr->sys_refcount_enter++;
	return 0;
}
998
/*
 * Disable the syscall enter event for @file's trace array, dropping
 * the fault-buffer reference (if any) and unregistering the shared
 * probe when the last enter event on @tr goes away.
 */
static void unreg_event_syscall_enter(struct trace_event_file *file,
				      struct trace_event_call *call)
{
	struct syscall_metadata *sys_data = call->data;
	struct trace_array *tr = file->tr;
	int num;

	num = sys_data->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return;
	guard(mutex)(&syscall_trace_lock);
	tr->sys_refcount_enter--;
	WRITE_ONCE(tr->enter_syscall_files[num], NULL);
	if (!tr->sys_refcount_enter)
		unregister_trace_sys_enter(ftrace_syscall_enter, tr);
	if (sys_data->user_mask)
		syscall_fault_buffer_disable();
}
1017
1018static int reg_event_syscall_exit(struct trace_event_file *file,
1019 struct trace_event_call *call)
1020{
1021 struct trace_array *tr = file->tr;
1022 int ret = 0;
1023 int num;
1024
1025 num = ((struct syscall_metadata *)call->data)->syscall_nr;
1026 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
1027 return -ENOSYS;
1028 mutex_lock(&syscall_trace_lock);
1029 if (!tr->sys_refcount_exit)
1030 ret = register_trace_sys_exit(probe: ftrace_syscall_exit, data: tr);
1031 if (!ret) {
1032 WRITE_ONCE(tr->exit_syscall_files[num], file);
1033 tr->sys_refcount_exit++;
1034 }
1035 mutex_unlock(lock: &syscall_trace_lock);
1036 return ret;
1037}
1038
1039static void unreg_event_syscall_exit(struct trace_event_file *file,
1040 struct trace_event_call *call)
1041{
1042 struct trace_array *tr = file->tr;
1043 int num;
1044
1045 num = ((struct syscall_metadata *)call->data)->syscall_nr;
1046 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
1047 return;
1048 mutex_lock(&syscall_trace_lock);
1049 tr->sys_refcount_exit--;
1050 WRITE_ONCE(tr->exit_syscall_files[num], NULL);
1051 if (!tr->sys_refcount_exit)
1052 unregister_trace_sys_exit(probe: ftrace_syscall_exit, data: tr);
1053 mutex_unlock(lock: &syscall_trace_lock);
1054}
1055
/*
 * For system calls that reference user space memory that can
 * be recorded into the event, set the system call meta data's user_mask
 * to the "args" index that points to the user space memory to retrieve.
 *
 * Each set bit N in user_mask marks args[N] as a user space pointer
 * whose contents may be copied into the event. When a single pointer
 * has an associated length argument, user_arg_size holds that
 * argument's index (and user_arg_is_str marks it as a string copy).
 */
static void check_faultable_syscall(struct trace_event_call *call, int nr)
{
	struct syscall_metadata *sys_data = call->data;
	unsigned long mask;

	/* Only work on entry */
	if (sys_data->enter_event != call)
		return;

	/* -1 is the sentinel for "no size argument" */
	sys_data->user_arg_size = -1;

	switch (nr) {
	/* user arg 1 with size arg at 2 */
	case __NR_write:
#ifdef __NR_mq_timedsend
	case __NR_mq_timedsend:
#endif
	case __NR_pwrite64:
		sys_data->user_mask = BIT(1);
		sys_data->user_arg_size = 2;
		break;
	/* user arg 0 with size arg at 1 as string */
	case __NR_setdomainname:
	case __NR_sethostname:
		sys_data->user_mask = BIT(0);
		sys_data->user_arg_size = 1;
		sys_data->user_arg_is_str = 1;
		break;
#ifdef __NR_kexec_file_load
	/* user arg 4 with size arg at 3 as string */
	case __NR_kexec_file_load:
		sys_data->user_mask = BIT(4);
		sys_data->user_arg_size = 3;
		sys_data->user_arg_is_str = 1;
		break;
#endif
	/* user arg at position 0 */
#ifdef __NR_access
	case __NR_access:
#endif
	case __NR_acct:
	case __NR_chdir:
#ifdef __NR_chown
	case __NR_chown:
#endif
#ifdef __NR_chmod
	case __NR_chmod:
#endif
	case __NR_chroot:
#ifdef __NR_creat
	case __NR_creat:
#endif
	case __NR_delete_module:
	case __NR_execve:
	case __NR_fsopen:
#ifdef __NR_lchown
	case __NR_lchown:
#endif
#ifdef __NR_open
	case __NR_open:
#endif
	case __NR_memfd_create:
#ifdef __NR_mkdir
	case __NR_mkdir:
#endif
#ifdef __NR_mknod
	case __NR_mknod:
#endif
	case __NR_mq_open:
	case __NR_mq_unlink:
#ifdef __NR_readlink
	case __NR_readlink:
#endif
#ifdef __NR_rmdir
	case __NR_rmdir:
#endif
	case __NR_shmdt:
#ifdef __NR_statfs
	case __NR_statfs:
#endif
	case __NR_swapon:
	case __NR_swapoff:
#ifdef __NR_truncate
	case __NR_truncate:
#endif
#ifdef __NR_unlink
	case __NR_unlink:
#endif
	case __NR_umount2:
#ifdef __NR_utime
	case __NR_utime:
#endif
#ifdef __NR_utimes
	case __NR_utimes:
#endif
		sys_data->user_mask = BIT(0);
		break;
	/* user arg at position 1 */
	case __NR_execveat:
	case __NR_faccessat:
	case __NR_faccessat2:
	case __NR_finit_module:
	case __NR_fchmodat:
	case __NR_fchmodat2:
	case __NR_fchownat:
	case __NR_fgetxattr:
	case __NR_flistxattr:
	case __NR_fsetxattr:
	case __NR_fspick:
	case __NR_fremovexattr:
#ifdef __NR_futimesat
	case __NR_futimesat:
#endif
	case __NR_inotify_add_watch:
	case __NR_mkdirat:
	case __NR_mknodat:
	case __NR_mount_setattr:
	case __NR_name_to_handle_at:
#ifdef __NR_newfstatat
	case __NR_newfstatat:
#endif
	case __NR_openat:
	case __NR_openat2:
	case __NR_open_tree:
	case __NR_open_tree_attr:
	case __NR_readlinkat:
	case __NR_quotactl:
	case __NR_syslog:
	case __NR_statx:
	case __NR_unlinkat:
#ifdef __NR_utimensat
	case __NR_utimensat:
#endif
		sys_data->user_mask = BIT(1);
		break;
	/* user arg at position 2 */
	case __NR_init_module:
	case __NR_fsconfig:
		sys_data->user_mask = BIT(2);
		break;
	/* user arg at position 4 */
	case __NR_fanotify_mark:
		sys_data->user_mask = BIT(4);
		break;
	/* 2 user args, 0 and 1 */
	case __NR_add_key:
	case __NR_getxattr:
	case __NR_lgetxattr:
	case __NR_lremovexattr:
#ifdef __NR_link
	case __NR_link:
#endif
	case __NR_listxattr:
	case __NR_llistxattr:
	case __NR_lsetxattr:
	case __NR_pivot_root:
	case __NR_removexattr:
#ifdef __NR_rename
	case __NR_rename:
#endif
	case __NR_request_key:
	case __NR_setxattr:
#ifdef __NR_symlink
	case __NR_symlink:
#endif
		sys_data->user_mask = BIT(0) | BIT(1);
		break;
	/* 2 user args, 0 and 2 */
	case __NR_symlinkat:
		sys_data->user_mask = BIT(0) | BIT(2);
		break;
	/* 2 user args, 1 and 3 */
	case __NR_getxattrat:
	case __NR_linkat:
	case __NR_listxattrat:
	case __NR_move_mount:
#ifdef __NR_renameat
	case __NR_renameat:
#endif
	case __NR_renameat2:
	case __NR_removexattrat:
	case __NR_setxattrat:
		sys_data->user_mask = BIT(1) | BIT(3);
		break;
	case __NR_mount: /* Just dev_name and dir_name, TODO add type */
		sys_data->user_mask = BIT(0) | BIT(1) | BIT(2);
		break;
	default:
		/* Syscall does not record user space memory */
		sys_data->user_mask = 0;
		return;
	}

	if (sys_data->user_arg_size < 0)
		return;

	/*
	 * The user_arg_size can only be used when the system call
	 * is reading only a single address from user space.
	 * (mask & (mask - 1)) is non-zero iff more than one bit is set.
	 */
	mask = sys_data->user_mask;
	if (WARN_ON(mask & (mask - 1)))
		sys_data->user_arg_size = -1;
}
1264
1265static int __init init_syscall_trace(struct trace_event_call *call)
1266{
1267 int id;
1268 int num;
1269
1270 num = ((struct syscall_metadata *)call->data)->syscall_nr;
1271 if (num < 0 || num >= NR_syscalls) {
1272 pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
1273 ((struct syscall_metadata *)call->data)->name);
1274 return -ENOSYS;
1275 }
1276
1277 check_faultable_syscall(call, nr: num);
1278
1279 if (set_syscall_print_fmt(call) < 0)
1280 return -ENOMEM;
1281
1282 id = trace_event_raw_init(call);
1283
1284 if (id < 0) {
1285 free_syscall_print_fmt(call);
1286 return id;
1287 }
1288
1289 return id;
1290}
1291
/*
 * Fields of a syscall entry event: the syscall number, plus per-syscall
 * argument fields defined at runtime by syscall_enter_define_fields().
 */
static struct trace_event_fields __refdata syscall_enter_fields_array[] = {
	SYSCALL_FIELD(int, __syscall_nr),
	{ .type = TRACE_FUNCTION_TYPE,
	  .define_fields = syscall_enter_define_fields },
	{}
};
1298
/* Output formatting callbacks for syscall entry events */
struct trace_event_functions enter_syscall_print_funcs = {
	.trace = print_syscall_enter,
};
1302
/* Output formatting callbacks for syscall exit events */
struct trace_event_functions exit_syscall_print_funcs = {
	.trace = print_syscall_exit,
};
1306
/* Shared event class for all "syscalls:sys_enter_*" events */
struct trace_event_class __refdata event_class_syscall_enter = {
	.system = "syscalls",
	.reg = syscall_enter_register,
	.fields_array = syscall_enter_fields_array,
	.get_fields = syscall_get_enter_fields,
	.raw_init = init_syscall_trace,
};
1314
/*
 * Shared event class for all "syscalls:sys_exit_*" events.
 * Exit events have a fixed layout (nr + return value), so the field
 * array can be a compound literal and the fields list is static.
 */
struct trace_event_class __refdata event_class_syscall_exit = {
	.system = "syscalls",
	.reg = syscall_exit_register,
	.fields_array = (struct trace_event_fields[]){
		SYSCALL_FIELD(int, __syscall_nr),
		SYSCALL_FIELD(long, ret),
		{}
	},
	.fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
	.raw_init = init_syscall_trace,
};
1326
/*
 * Default lookup of the kernel address of syscall @nr. Weak so that
 * architectures with a non-trivial syscall table layout can override it.
 */
unsigned long __init __weak arch_syscall_addr(int nr)
{
	return (unsigned long)sys_call_table[nr];
}
1331
1332void __init init_ftrace_syscalls(void)
1333{
1334 struct syscall_metadata *meta;
1335 unsigned long addr;
1336 int i;
1337 void *ret;
1338
1339 if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
1340 syscalls_metadata = kcalloc(NR_syscalls,
1341 sizeof(*syscalls_metadata),
1342 GFP_KERNEL);
1343 if (!syscalls_metadata) {
1344 WARN_ON(1);
1345 return;
1346 }
1347 }
1348
1349 for (i = 0; i < NR_syscalls; i++) {
1350 addr = arch_syscall_addr(nr: i);
1351 meta = find_syscall_meta(syscall: addr);
1352 if (!meta)
1353 continue;
1354
1355 meta->syscall_nr = i;
1356
1357 if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
1358 syscalls_metadata[i] = meta;
1359 } else {
1360 ret = xa_store(&syscalls_metadata_sparse, index: i, entry: meta,
1361 GFP_KERNEL);
1362 WARN(xa_is_err(ret),
1363 "Syscall memory allocation failed\n");
1364 }
1365
1366 }
1367}
1368
1369#ifdef CONFIG_PERF_EVENTS
1370
/* Per-syscall bitmaps of which syscalls have perf events attached */
static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
/* Number of perf users of the sys_enter/sys_exit tracepoints (under syscall_trace_lock) */
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
1375
1376static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs,
1377 struct syscall_metadata *sys_data,
1378 struct syscall_trace_enter *rec)
1379{
1380 struct syscall_tp_t {
1381 struct trace_entry ent;
1382 int syscall_nr;
1383 unsigned long args[SYSCALL_DEFINE_MAXARGS];
1384 } __aligned(8) param;
1385 int i;
1386
1387 BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
1388
1389 /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
1390 perf_fetch_caller_regs(regs);
1391 *(struct pt_regs **)&param = regs;
1392 param.syscall_nr = rec->nr;
1393 for (i = 0; i < sys_data->nb_args; i++)
1394 param.args[i] = rec->args[i];
1395 return trace_call_bpf(call, ctx: &param);
1396}
1397
/*
 * Tracepoint probe attached to sys_enter for perf. Records the syscall
 * number and arguments into the perf trace buffer, optionally appending
 * data copied from user space for syscalls flagged by user_mask, and
 * hands the record to attached BPF programs and/or perf events.
 */
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_enter *rec;
	struct pt_regs *fake_regs;
	struct hlist_head *head;
	unsigned long args[6];
	bool valid_prog_array;
	bool mayfault;
	char *user_ptr;
	int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
	int buf_size = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT;
	int syscall_nr;
	int rctx;
	int size = 0;
	int uargs = 0;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();
	guard(preempt_notrace)();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;
	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	syscall_get_arguments(current, regs, args);

	/* Check if this syscall event faults in user space memory */
	mayfault = sys_data->user_mask != 0;

	if (mayfault) {
		/*
		 * Copy the flagged user space data up front; on failure
		 * the event is dropped. 'size' is updated to include the
		 * extra bytes to reserve, 'uargs' the number of copied args.
		 * NOTE(review): runs under preempt_notrace — presumably
		 * syscall_get_data() uses nofault copies or per-cpu fault
		 * buffers; confirm against its definition.
		 */
		if (syscall_get_data(sys_data, args, &user_ptr,
				     &size, user_sizes, &uargs, buf_size) < 0)
			return;
	}

	head = this_cpu_ptr(sys_data->enter_event->perf_events);
	valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
	if (!valid_prog_array && hlist_empty(head))
		return;

	/* get the size after alignment with the u32 buffer size field */
	size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
	size = ALIGN(size + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
	if (!rec)
		return;

	rec->nr = syscall_nr;
	memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);

	/* Append the previously copied user space data to the record */
	if (mayfault)
		syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs);

	/* If BPF consumed the event, or nobody else listens, bail out */
	if ((valid_prog_array &&
	     !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
	    hlist_empty(head)) {
		perf_swevent_put_recursion_context(rctx);
		return;
	}

	perf_trace_buf_submit(rec, size, rctx,
			      sys_data->enter_event->event.type, 1, regs,
			      head, NULL);
}
1474
1475static int perf_sysenter_enable(struct trace_event_call *call)
1476{
1477 struct syscall_metadata *sys_data = call->data;
1478 int num;
1479 int ret;
1480
1481 num = sys_data->syscall_nr;
1482
1483 guard(mutex)(T: &syscall_trace_lock);
1484 if (sys_data->user_mask) {
1485 ret = syscall_fault_buffer_enable();
1486 if (ret < 0)
1487 return ret;
1488 }
1489 if (!sys_perf_refcount_enter) {
1490 ret = register_trace_sys_enter(probe: perf_syscall_enter, NULL);
1491 if (ret) {
1492 pr_info("event trace: Could not activate syscall entry trace point");
1493 if (sys_data->user_mask)
1494 syscall_fault_buffer_disable();
1495 return ret;
1496 }
1497 }
1498 set_bit(nr: num, addr: enabled_perf_enter_syscalls);
1499 sys_perf_refcount_enter++;
1500 return 0;
1501}
1502
1503static void perf_sysenter_disable(struct trace_event_call *call)
1504{
1505 struct syscall_metadata *sys_data = call->data;
1506 int num;
1507
1508 num = sys_data->syscall_nr;
1509
1510 guard(mutex)(T: &syscall_trace_lock);
1511 sys_perf_refcount_enter--;
1512 clear_bit(nr: num, addr: enabled_perf_enter_syscalls);
1513 if (!sys_perf_refcount_enter)
1514 unregister_trace_sys_enter(probe: perf_syscall_enter, NULL);
1515 if (sys_data->user_mask)
1516 syscall_fault_buffer_disable();
1517}
1518
1519static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
1520 struct syscall_trace_exit *rec)
1521{
1522 struct syscall_tp_t {
1523 struct trace_entry ent;
1524 int syscall_nr;
1525 unsigned long ret;
1526 } __aligned(8) param;
1527
1528 /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
1529 perf_fetch_caller_regs(regs);
1530 *(struct pt_regs **)&param = regs;
1531 param.syscall_nr = rec->nr;
1532 param.ret = rec->ret;
1533 return trace_call_bpf(call, ctx: &param);
1534}
1535
/*
 * Tracepoint probe attached to sys_exit for perf. Records the syscall
 * number and return value, then hands the record to attached BPF
 * programs and/or perf events.
 */
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_exit *rec;
	struct pt_regs *fake_regs;
	struct hlist_head *head;
	bool valid_prog_array;
	int syscall_nr;
	int rctx;
	int size;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();
	guard(preempt_notrace)();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;
	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* Nothing to do if neither BPF nor any perf event is attached */
	head = this_cpu_ptr(sys_data->exit_event->perf_events);
	valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
	if (!valid_prog_array && hlist_empty(head))
		return;

	/* We can probably do that at build time */
	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
	if (!rec)
		return;

	rec->nr = syscall_nr;
	rec->ret = syscall_get_return_value(current, regs);

	/* If BPF consumed the event, or nobody else listens, bail out */
	if ((valid_prog_array &&
	     !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
	    hlist_empty(head)) {
		perf_swevent_put_recursion_context(rctx);
		return;
	}

	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
			      1, regs, head, NULL);
}
1590
1591static int perf_sysexit_enable(struct trace_event_call *call)
1592{
1593 int num;
1594
1595 num = ((struct syscall_metadata *)call->data)->syscall_nr;
1596
1597 guard(mutex)(T: &syscall_trace_lock);
1598 if (!sys_perf_refcount_exit) {
1599 int ret = register_trace_sys_exit(probe: perf_syscall_exit, NULL);
1600 if (ret) {
1601 pr_info("event trace: Could not activate syscall exit trace point");
1602 return ret;
1603 }
1604 }
1605 set_bit(nr: num, addr: enabled_perf_exit_syscalls);
1606 sys_perf_refcount_exit++;
1607 return 0;
1608}
1609
1610static void perf_sysexit_disable(struct trace_event_call *call)
1611{
1612 int num;
1613
1614 num = ((struct syscall_metadata *)call->data)->syscall_nr;
1615
1616 guard(mutex)(T: &syscall_trace_lock);
1617 sys_perf_refcount_exit--;
1618 clear_bit(nr: num, addr: enabled_perf_exit_syscalls);
1619 if (!sys_perf_refcount_exit)
1620 unregister_trace_sys_exit(probe: perf_syscall_exit, NULL);
1621}
1622
1623#endif /* CONFIG_PERF_EVENTS */
1624
1625static int syscall_enter_register(struct trace_event_call *event,
1626 enum trace_reg type, void *data)
1627{
1628 struct trace_event_file *file = data;
1629
1630 switch (type) {
1631 case TRACE_REG_REGISTER:
1632 return reg_event_syscall_enter(file, call: event);
1633 case TRACE_REG_UNREGISTER:
1634 unreg_event_syscall_enter(file, call: event);
1635 return 0;
1636
1637#ifdef CONFIG_PERF_EVENTS
1638 case TRACE_REG_PERF_REGISTER:
1639 return perf_sysenter_enable(call: event);
1640 case TRACE_REG_PERF_UNREGISTER:
1641 perf_sysenter_disable(call: event);
1642 return 0;
1643 case TRACE_REG_PERF_OPEN:
1644 case TRACE_REG_PERF_CLOSE:
1645 case TRACE_REG_PERF_ADD:
1646 case TRACE_REG_PERF_DEL:
1647 return 0;
1648#endif
1649 }
1650 return 0;
1651}
1652
1653static int syscall_exit_register(struct trace_event_call *event,
1654 enum trace_reg type, void *data)
1655{
1656 struct trace_event_file *file = data;
1657
1658 switch (type) {
1659 case TRACE_REG_REGISTER:
1660 return reg_event_syscall_exit(file, call: event);
1661 case TRACE_REG_UNREGISTER:
1662 unreg_event_syscall_exit(file, call: event);
1663 return 0;
1664
1665#ifdef CONFIG_PERF_EVENTS
1666 case TRACE_REG_PERF_REGISTER:
1667 return perf_sysexit_enable(call: event);
1668 case TRACE_REG_PERF_UNREGISTER:
1669 perf_sysexit_disable(call: event);
1670 return 0;
1671 case TRACE_REG_PERF_OPEN:
1672 case TRACE_REG_PERF_CLOSE:
1673 case TRACE_REG_PERF_ADD:
1674 case TRACE_REG_PERF_DEL:
1675 return 0;
1676#endif
1677 }
1678 return 0;
1679}
1680

/* Source: linux/kernel/trace/trace_syscalls.c */