Skip to content

Commit bea8c15

Browse files
Hugh Dickinstorvalds
authored andcommitted
memcg: fix hotplugged memory zone oops
When MEMCG is configured on (even when it's disabled by boot option), when adding or removing a page to/from its lru list, the zone pointer used for stats updates is nowadays taken from the struct lruvec. (On many configurations, calculating zone from page is slower.) But we have no code to update all the lruvecs (per zone, per memcg) when a memory node is hotadded. Here's an extract from the oops which results when running numactl to bind a program to a newly onlined node: BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60 IP: __mod_zone_page_state+0x9/0x60 Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0) Call Trace: __pagevec_lru_add_fn+0xdf/0x140 pagevec_lru_move_fn+0xb1/0x100 __pagevec_lru_add+0x1c/0x30 lru_add_drain_cpu+0xa3/0x130 lru_add_drain+0x2f/0x40 ... The natural solution might be to use a memcg callback whenever memory is hotadded; but that solution has not been scoped out, and it happens that we do have an easy location at which to update lruvec->zone. The lruvec pointer is discovered either by mem_cgroup_zone_lruvec() or by mem_cgroup_page_lruvec(), and both of those do know the right zone. So check and set lruvec->zone in those; and remove the inadequate attempt to set lruvec->zone from lruvec_init(), which is called before NODE_DATA(node) has been allocated in such cases. Ah, there was one exceptionr. For no particularly good reason, mem_cgroup_force_empty_list() has its own code for deciding lruvec. Change it to use the standard mem_cgroup_zone_lruvec() and mem_cgroup_get_lru_size() too. In fact it was already safe against such an oops (the lru lists in danger could only be empty), but we're better proofed against future changes this way. I've marked this for stable (3.6) since we introduced the problem in 3.5 (now closed to stable); but I have no idea if this is the only fix needed to get memory hotadd working with memcg in 3.6, and received no answer when I enquired twice before. Reported-by: Tang Chen <tangchen@cn.fujitsu.com> Signed-off-by: Hugh Dickins <hughd@google.com> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Konstantin Khlebnikov <khlebnikov@openvz.org> Cc: Wen Congyang <wency@cn.fujitsu.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 18f6942 commit bea8c15

4 files changed

Lines changed: 38 additions & 18 deletions

File tree

include/linux/mmzone.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -752,7 +752,7 @@ extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
752752
unsigned long size,
753753
enum memmap_context context);
754754

755-
extern void lruvec_init(struct lruvec *lruvec, struct zone *zone);
755+
extern void lruvec_init(struct lruvec *lruvec);
756756

757757
static inline struct zone *lruvec_zone(struct lruvec *lruvec)
758758
{

mm/memcontrol.c

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1055,12 +1055,24 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
10551055
struct mem_cgroup *memcg)
10561056
{
10571057
struct mem_cgroup_per_zone *mz;
1058+
struct lruvec *lruvec;
10581059

1059-
if (mem_cgroup_disabled())
1060-
return &zone->lruvec;
1060+
if (mem_cgroup_disabled()) {
1061+
lruvec = &zone->lruvec;
1062+
goto out;
1063+
}
10611064

10621065
mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1063-
return &mz->lruvec;
1066+
lruvec = &mz->lruvec;
1067+
out:
1068+
/*
1069+
* Since a node can be onlined after the mem_cgroup was created,
1070+
* we have to be prepared to initialize lruvec->zone here;
1071+
* and if offlined then reonlined, we need to reinitialize it.
1072+
*/
1073+
if (unlikely(lruvec->zone != zone))
1074+
lruvec->zone = zone;
1075+
return lruvec;
10641076
}
10651077

10661078
/*
@@ -1087,9 +1099,12 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
10871099
struct mem_cgroup_per_zone *mz;
10881100
struct mem_cgroup *memcg;
10891101
struct page_cgroup *pc;
1102+
struct lruvec *lruvec;
10901103

1091-
if (mem_cgroup_disabled())
1092-
return &zone->lruvec;
1104+
if (mem_cgroup_disabled()) {
1105+
lruvec = &zone->lruvec;
1106+
goto out;
1107+
}
10931108

10941109
pc = lookup_page_cgroup(page);
10951110
memcg = pc->mem_cgroup;
@@ -1107,7 +1122,16 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
11071122
pc->mem_cgroup = memcg = root_mem_cgroup;
11081123

11091124
mz = page_cgroup_zoneinfo(memcg, page);
1110-
return &mz->lruvec;
1125+
lruvec = &mz->lruvec;
1126+
out:
1127+
/*
1128+
* Since a node can be onlined after the mem_cgroup was created,
1129+
* we have to be prepared to initialize lruvec->zone here;
1130+
* and if offlined then reonlined, we need to reinitialize it.
1131+
*/
1132+
if (unlikely(lruvec->zone != zone))
1133+
lruvec->zone = zone;
1134+
return lruvec;
11111135
}
11121136

11131137
/**
@@ -3697,17 +3721,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
36973721
static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
36983722
int node, int zid, enum lru_list lru)
36993723
{
3700-
struct mem_cgroup_per_zone *mz;
3724+
struct lruvec *lruvec;
37013725
unsigned long flags, loop;
37023726
struct list_head *list;
37033727
struct page *busy;
37043728
struct zone *zone;
37053729

37063730
zone = &NODE_DATA(node)->node_zones[zid];
3707-
mz = mem_cgroup_zoneinfo(memcg, node, zid);
3708-
list = &mz->lruvec.lists[lru];
3731+
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
3732+
list = &lruvec->lists[lru];
37093733

3710-
loop = mz->lru_size[lru];
3734+
loop = mem_cgroup_get_lru_size(lruvec, lru);
37113735
/* give some margin against EBUSY etc...*/
37123736
loop += 256;
37133737
busy = NULL;
@@ -4745,7 +4769,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
47454769

47464770
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
47474771
mz = &pn->zoneinfo[zone];
4748-
lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
4772+
lruvec_init(&mz->lruvec);
47494773
mz->usage_in_excess = 0;
47504774
mz->on_tree = false;
47514775
mz->memcg = memcg;

mm/mmzone.c

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -87,16 +87,12 @@ int memmap_valid_within(unsigned long pfn,
8787
}
8888
#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
8989

90-
void lruvec_init(struct lruvec *lruvec, struct zone *zone)
90+
void lruvec_init(struct lruvec *lruvec)
9191
{
9292
enum lru_list lru;
9393

9494
memset(lruvec, 0, sizeof(struct lruvec));
9595

9696
for_each_lru(lru)
9797
INIT_LIST_HEAD(&lruvec->lists[lru]);
98-
99-
#ifdef CONFIG_MEMCG
100-
lruvec->zone = zone;
101-
#endif
10298
}

mm/page_alloc.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4505,7 +4505,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
45054505
zone->zone_pgdat = pgdat;
45064506

45074507
zone_pcp_init(zone);
4508-
lruvec_init(&zone->lruvec, zone);
4508+
lruvec_init(&zone->lruvec);
45094509
if (!size)
45104510
continue;
45114511

0 commit comments

Comments
 (0)