Skip to content

Commit 0dea116

Browse files
kiryltorvalds
authored and committed
cgroup: implement eventfd-based generic API for notifications
This patchset introduces an eventfd-based API for notifications in cgroups and implements memory notifications on top of it. It uses statistics in the memory controller to track memory usage. Output of time(1) on building kernel on tmpfs: Root cgroup before changes: make -j2 506.37 user 60.93s system 193% cpu 4:52.77 total Non-root cgroup before changes: make -j2 507.14 user 62.66s system 193% cpu 4:54.74 total Root cgroup after changes (0 thresholds): make -j2 507.13 user 62.20s system 193% cpu 4:53.55 total Non-root cgroup after changes (0 thresholds): make -j2 507.70 user 64.20s system 193% cpu 4:55.70 total Root cgroup after changes (1 threshold, never crossed): make -j2 506.97 user 62.20s system 193% cpu 4:53.90 total Non-root cgroup after changes (1 threshold, never crossed): make -j2 507.55 user 64.08s system 193% cpu 4:55.63 total This patch: Introduce the write-only file "cgroup.event_control" in every cgroup. To register a new notification handler you need to: - create an eventfd; - open a control file to be monitored. Callbacks register_event() and unregister_event() must be defined for the control file; - write "<event_fd> <control_fd> <args>" to cgroup.event_control. Interpretation of args is defined by the control file implementation; eventfd will be woken up by the control file implementation or when the cgroup is removed. To unregister a notification handler just close the eventfd. If you need notification functionality for a control file you have to implement callbacks register_event() and unregister_event() in the struct cftype. [kamezawa.hiroyu@jp.fujitsu.com: Kconfig fix] Signed-off-by: Kirill A. 
Shutemov <kirill@shutemov.name> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Paul Menage <menage@google.com> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Pavel Emelyanov <xemul@openvz.org> Cc: Dan Malek <dan@embeddedalley.com> Cc: Vladislav Buzov <vbuzov@embeddedalley.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Cc: Alexander Shishkin <virtuoso@slind.org> Cc: Davide Libenzi <davidel@xmailserver.org> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 483c30b commit 0dea116

4 files changed

Lines changed: 272 additions & 1 deletion

File tree

Documentation/cgroups/cgroups.txt

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ CONTENTS:
2323
2.1 Basic Usage
2424
2.2 Attaching processes
2525
2.3 Mounting hierarchies by name
26+
2.4 Notification API
2627
3. Kernel API
2728
3.1 Overview
2829
3.2 Synchronization
@@ -435,6 +436,25 @@ you give a subsystem a name.
435436
The name of the subsystem appears as part of the hierarchy description
436437
in /proc/mounts and /proc/<pid>/cgroups.
437438

439+
2.4 Notification API
440+
--------------------
441+
442+
There is a mechanism which allows userspace to get notifications about the changing
443+
status of a cgroup.
444+
445+
To register a new notification handler you need to:
446+
- create a file descriptor for event notification using eventfd(2);
447+
- open a control file to be monitored (e.g. memory.usage_in_bytes);
448+
- write "<event_fd> <control_fd> <args>" to cgroup.event_control.
449+
Interpretation of args is defined by control file implementation;
450+
451+
eventfd will be woken up by control file implementation or when the
452+
cgroup is removed.
453+
454+
To unregister a notification handler, just close the eventfd.
455+
456+
NOTE: Support of notifications should be implemented for the control
457+
file. See documentation for the subsystem.
438458

439459
3. Kernel API
440460
=============

include/linux/cgroup.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,10 @@ struct cgroup {
235235

236236
/* For RCU-protected deletion */
237237
struct rcu_head rcu_head;
238+
239+
/* List of events which userspace wants to receive */
240+
struct list_head event_list;
241+
spinlock_t event_list_lock;
238242
};
239243

240244
/*
@@ -378,6 +382,26 @@ struct cftype {
378382
int (*trigger)(struct cgroup *cgrp, unsigned int event);
379383

380384
int (*release)(struct inode *inode, struct file *file);
385+
386+
/*
387+
* register_event() callback will be used to add new userspace
388+
* waiter for changes related to the cftype. Implement it if
389+
* you want to provide this functionality. Use eventfd_signal()
390+
* on eventfd to send notification to userspace.
391+
*/
392+
int (*register_event)(struct cgroup *cgrp, struct cftype *cft,
393+
struct eventfd_ctx *eventfd, const char *args);
394+
/*
395+
* unregister_event() callback will be called when userspace
396+
* closes the eventfd or on cgroup removal.
397+
* This callback must be implemented if you want to provide
398+
* notification functionality.
399+
*
400+
* Be careful. It can be called after destroy(), so you have
401+
* to keep all necessary data until all events are removed.
402+
*/
403+
int (*unregister_event)(struct cgroup *cgrp, struct cftype *cft,
404+
struct eventfd_ctx *eventfd);
381405
};
382406

383407
struct cgroup_scanner {

init/Kconfig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,7 @@ config HAVE_UNSTABLE_SCHED_CLOCK
463463

464464
menuconfig CGROUPS
465465
boolean "Control Group support"
466+
depends on EVENTFD
466467
help
467468
This option adds support for grouping sets of processes together, for
468469
use with process control subsystems such as Cpusets, CFS, memory

kernel/cgroup.c

Lines changed: 227 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@
44
* Based originally on the cpuset system, extracted by Paul Menage
55
* Copyright (C) 2006 Google, Inc
66
*
7+
* Notifications support
8+
* Copyright (C) 2009 Nokia Corporation
9+
* Author: Kirill A. Shutemov
10+
*
711
* Copyright notices from the original cpuset code:
812
* --------------------------------------------------
913
* Copyright (C) 2003 BULL SA.
@@ -53,6 +57,8 @@
5357
#include <linux/pid_namespace.h>
5458
#include <linux/idr.h>
5559
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
60+
#include <linux/eventfd.h>
61+
#include <linux/poll.h>
5662

5763
#include <asm/atomic.h>
5864

@@ -152,6 +158,35 @@ struct css_id {
152158
unsigned short stack[0]; /* Array of Length (depth+1) */
153159
};
154160

161+
/*
162+
* cgroup_event represents events which userspace want to recieve.
163+
*/
164+
struct cgroup_event {
165+
/*
166+
* Cgroup which the event belongs to.
167+
*/
168+
struct cgroup *cgrp;
169+
/*
170+
* Control file which the event associated.
171+
*/
172+
struct cftype *cft;
173+
/*
174+
* eventfd to signal userspace about the event.
175+
*/
176+
struct eventfd_ctx *eventfd;
177+
/*
178+
* Each of these stored in a list by the cgroup.
179+
*/
180+
struct list_head list;
181+
/*
182+
* All fields below needed to unregister event when
183+
* userspace closes eventfd.
184+
*/
185+
poll_table pt;
186+
wait_queue_head_t *wqh;
187+
wait_queue_t wait;
188+
struct work_struct remove;
189+
};
155190

156191
/* The list of hierarchy roots */
157192

@@ -760,14 +795,28 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
760795
static int cgroup_call_pre_destroy(struct cgroup *cgrp)
761796
{
762797
struct cgroup_subsys *ss;
798+
struct cgroup_event *event, *tmp;
763799
int ret = 0;
764800

765801
for_each_subsys(cgrp->root, ss)
766802
if (ss->pre_destroy) {
767803
ret = ss->pre_destroy(ss, cgrp);
768804
if (ret)
769-
break;
805+
goto out;
770806
}
807+
808+
/*
809+
* Unregister events and notify userspace.
810+
*/
811+
spin_lock(&cgrp->event_list_lock);
812+
list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
813+
list_del(&event->list);
814+
eventfd_signal(event->eventfd, 1);
815+
schedule_work(&event->remove);
816+
}
817+
spin_unlock(&cgrp->event_list_lock);
818+
819+
out:
771820
return ret;
772821
}
773822

@@ -1239,6 +1288,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
12391288
INIT_LIST_HEAD(&cgrp->release_list);
12401289
INIT_LIST_HEAD(&cgrp->pidlists);
12411290
mutex_init(&cgrp->pidlist_mutex);
1291+
INIT_LIST_HEAD(&cgrp->event_list);
1292+
spin_lock_init(&cgrp->event_list_lock);
12421293
}
12431294

12441295
static void init_cgroup_root(struct cgroupfs_root *root)
@@ -2077,6 +2128,16 @@ static const struct inode_operations cgroup_dir_inode_operations = {
20772128
.rename = cgroup_rename,
20782129
};
20792130

2131+
/*
2132+
* Check if a file is a control file
2133+
*/
2134+
static inline struct cftype *__file_cft(struct file *file)
2135+
{
2136+
if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
2137+
return ERR_PTR(-EINVAL);
2138+
return __d_cft(file->f_dentry);
2139+
}
2140+
20802141
static int cgroup_create_file(struct dentry *dentry, mode_t mode,
20812142
struct super_block *sb)
20822143
{
@@ -2930,6 +2991,166 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
29302991
return 0;
29312992
}
29322993

2994+
/*
2995+
* Unregister event and free resources.
2996+
*
2997+
* Gets called from workqueue.
2998+
*/
2999+
static void cgroup_event_remove(struct work_struct *work)
3000+
{
3001+
struct cgroup_event *event = container_of(work, struct cgroup_event,
3002+
remove);
3003+
struct cgroup *cgrp = event->cgrp;
3004+
3005+
/* TODO: check return code */
3006+
event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3007+
3008+
eventfd_ctx_put(event->eventfd);
3009+
remove_wait_queue(event->wqh, &event->wait);
3010+
kfree(event);
3011+
}
3012+
3013+
/*
3014+
* Gets called on POLLHUP on eventfd when user closes it.
3015+
*
3016+
* Called with wqh->lock held and interrupts disabled.
3017+
*/
3018+
static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3019+
int sync, void *key)
3020+
{
3021+
struct cgroup_event *event = container_of(wait,
3022+
struct cgroup_event, wait);
3023+
struct cgroup *cgrp = event->cgrp;
3024+
unsigned long flags = (unsigned long)key;
3025+
3026+
if (flags & POLLHUP) {
3027+
spin_lock(&cgrp->event_list_lock);
3028+
list_del(&event->list);
3029+
spin_unlock(&cgrp->event_list_lock);
3030+
/*
3031+
* We are in atomic context, but cgroup_event_remove() may
3032+
* sleep, so we have to call it in workqueue.
3033+
*/
3034+
schedule_work(&event->remove);
3035+
}
3036+
3037+
return 0;
3038+
}
3039+
3040+
static void cgroup_event_ptable_queue_proc(struct file *file,
3041+
wait_queue_head_t *wqh, poll_table *pt)
3042+
{
3043+
struct cgroup_event *event = container_of(pt,
3044+
struct cgroup_event, pt);
3045+
3046+
event->wqh = wqh;
3047+
add_wait_queue(wqh, &event->wait);
3048+
}
3049+
3050+
/*
3051+
* Parse input and register new cgroup event handler.
3052+
*
3053+
* Input must be in format '<event_fd> <control_fd> <args>'.
3054+
* Interpretation of args is defined by control file implementation.
3055+
*/
3056+
static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3057+
const char *buffer)
3058+
{
3059+
struct cgroup_event *event = NULL;
3060+
unsigned int efd, cfd;
3061+
struct file *efile = NULL;
3062+
struct file *cfile = NULL;
3063+
char *endp;
3064+
int ret;
3065+
3066+
efd = simple_strtoul(buffer, &endp, 10);
3067+
if (*endp != ' ')
3068+
return -EINVAL;
3069+
buffer = endp + 1;
3070+
3071+
cfd = simple_strtoul(buffer, &endp, 10);
3072+
if ((*endp != ' ') && (*endp != '\0'))
3073+
return -EINVAL;
3074+
buffer = endp + 1;
3075+
3076+
event = kzalloc(sizeof(*event), GFP_KERNEL);
3077+
if (!event)
3078+
return -ENOMEM;
3079+
event->cgrp = cgrp;
3080+
INIT_LIST_HEAD(&event->list);
3081+
init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3082+
init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
3083+
INIT_WORK(&event->remove, cgroup_event_remove);
3084+
3085+
efile = eventfd_fget(efd);
3086+
if (IS_ERR(efile)) {
3087+
ret = PTR_ERR(efile);
3088+
goto fail;
3089+
}
3090+
3091+
event->eventfd = eventfd_ctx_fileget(efile);
3092+
if (IS_ERR(event->eventfd)) {
3093+
ret = PTR_ERR(event->eventfd);
3094+
goto fail;
3095+
}
3096+
3097+
cfile = fget(cfd);
3098+
if (!cfile) {
3099+
ret = -EBADF;
3100+
goto fail;
3101+
}
3102+
3103+
/* the process need read permission on control file */
3104+
ret = file_permission(cfile, MAY_READ);
3105+
if (ret < 0)
3106+
goto fail;
3107+
3108+
event->cft = __file_cft(cfile);
3109+
if (IS_ERR(event->cft)) {
3110+
ret = PTR_ERR(event->cft);
3111+
goto fail;
3112+
}
3113+
3114+
if (!event->cft->register_event || !event->cft->unregister_event) {
3115+
ret = -EINVAL;
3116+
goto fail;
3117+
}
3118+
3119+
ret = event->cft->register_event(cgrp, event->cft,
3120+
event->eventfd, buffer);
3121+
if (ret)
3122+
goto fail;
3123+
3124+
if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
3125+
event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3126+
ret = 0;
3127+
goto fail;
3128+
}
3129+
3130+
spin_lock(&cgrp->event_list_lock);
3131+
list_add(&event->list, &cgrp->event_list);
3132+
spin_unlock(&cgrp->event_list_lock);
3133+
3134+
fput(cfile);
3135+
fput(efile);
3136+
3137+
return 0;
3138+
3139+
fail:
3140+
if (cfile)
3141+
fput(cfile);
3142+
3143+
if (event && event->eventfd && !IS_ERR(event->eventfd))
3144+
eventfd_ctx_put(event->eventfd);
3145+
3146+
if (!IS_ERR_OR_NULL(efile))
3147+
fput(efile);
3148+
3149+
kfree(event);
3150+
3151+
return ret;
3152+
}
3153+
29333154
/*
29343155
* for the common functions, 'private' gives the type of file
29353156
*/
@@ -2955,6 +3176,11 @@ static struct cftype files[] = {
29553176
.read_u64 = cgroup_read_notify_on_release,
29563177
.write_u64 = cgroup_write_notify_on_release,
29573178
},
3179+
{
3180+
.name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3181+
.write_string = cgroup_write_event_control,
3182+
.mode = S_IWUGO,
3183+
},
29583184
};
29593185

29603186
static struct cftype cft_release_agent = {

0 commit comments

Comments
 (0)