| 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | /* |
| 3 | * Copyright(c) 2014 Intel Mobile Communications GmbH |
| 4 | * Copyright(c) 2015 Intel Deutschland GmbH |
| 5 | * |
| 6 | * Author: Johannes Berg <johannes@sipsolutions.net> |
| 7 | */ |
| 8 | #include <linux/module.h> |
| 9 | #include <linux/device.h> |
| 10 | #include <linux/devcoredump.h> |
| 11 | #include <linux/list.h> |
| 12 | #include <linux/slab.h> |
| 13 | #include <linux/fs.h> |
| 14 | #include <linux/workqueue.h> |
| 15 | |
| 16 | static struct class devcd_class; |
| 17 | |
| 18 | /* global disable flag, for security purposes */ |
| 19 | static bool devcd_disabled; |
| 20 | |
| 21 | struct devcd_entry { |
| 22 | struct device devcd_dev; |
| 23 | void *data; |
| 24 | size_t datalen; |
| 25 | /* |
| 26 | * There are 2 races for which mutex is required. |
| 27 | * |
| 28 | * The first race is between device creation and userspace writing to |
| 29 | * schedule immediately destruction. |
| 30 | * |
| 31 | * This race is handled by arming the timer before device creation, but |
| 32 | * when device creation fails the timer still exists. |
| 33 | * |
| 34 | * To solve this, hold the mutex during device_add(), and set |
| 35 | * init_completed on success before releasing the mutex. |
| 36 | * |
| 37 | * That way the timer will never fire until device_add() is called, |
| 38 | * it will do nothing if init_completed is not set. The timer is also |
| 39 | * cancelled in that case. |
| 40 | * |
| 41 | * The second race involves multiple parallel invocations of devcd_free(), |
| 42 | * add a deleted flag so only 1 can call the destructor. |
| 43 | */ |
| 44 | struct mutex mutex; |
| 45 | bool init_completed, deleted; |
| 46 | struct module *owner; |
| 47 | ssize_t (*read)(char *buffer, loff_t offset, size_t count, |
| 48 | void *data, size_t datalen); |
| 49 | void (*free)(void *data); |
| 50 | /* |
| 51 | * If nothing interferes and device_add() was returns success, |
| 52 | * del_wk will destroy the device after the timer fires. |
| 53 | * |
| 54 | * Multiple userspace processes can interfere in the working of the timer: |
| 55 | * - Writing to the coredump will reschedule the timer to run immediately, |
| 56 | * if still armed. |
| 57 | * |
| 58 | * This is handled by using "if (cancel_delayed_work()) { |
| 59 | * schedule_delayed_work() }", to prevent re-arming after having |
| 60 | * been previously fired. |
| 61 | * - Writing to /sys/class/devcoredump/disabled will destroy the |
| 62 | * coredump synchronously. |
| 63 | * This is handled by using disable_delayed_work_sync(), and then |
| 64 | * checking if deleted flag is set with &devcd->mutex held. |
| 65 | */ |
| 66 | struct delayed_work del_wk; |
| 67 | struct device *failing_dev; |
| 68 | }; |
| 69 | |
| 70 | static struct devcd_entry *dev_to_devcd(struct device *dev) |
| 71 | { |
| 72 | return container_of(dev, struct devcd_entry, devcd_dev); |
| 73 | } |
| 74 | |
| 75 | static void devcd_dev_release(struct device *dev) |
| 76 | { |
| 77 | struct devcd_entry *devcd = dev_to_devcd(dev); |
| 78 | |
| 79 | devcd->free(devcd->data); |
| 80 | module_put(module: devcd->owner); |
| 81 | |
| 82 | /* |
| 83 | * this seems racy, but I don't see a notifier or such on |
| 84 | * a struct device to know when it goes away? |
| 85 | */ |
| 86 | if (devcd->failing_dev->kobj.sd) |
| 87 | sysfs_delete_link(dir: &devcd->failing_dev->kobj, targ: &dev->kobj, |
| 88 | name: "devcoredump" ); |
| 89 | |
| 90 | put_device(dev: devcd->failing_dev); |
| 91 | kfree(objp: devcd); |
| 92 | } |
| 93 | |
| 94 | static void __devcd_del(struct devcd_entry *devcd) |
| 95 | { |
| 96 | devcd->deleted = true; |
| 97 | device_del(dev: &devcd->devcd_dev); |
| 98 | put_device(dev: &devcd->devcd_dev); |
| 99 | } |
| 100 | |
| 101 | static void devcd_del(struct work_struct *wk) |
| 102 | { |
| 103 | struct devcd_entry *devcd; |
| 104 | bool init_completed; |
| 105 | |
| 106 | devcd = container_of(wk, struct devcd_entry, del_wk.work); |
| 107 | |
| 108 | /* devcd->mutex serializes against dev_coredumpm_timeout */ |
| 109 | mutex_lock(&devcd->mutex); |
| 110 | init_completed = devcd->init_completed; |
| 111 | mutex_unlock(lock: &devcd->mutex); |
| 112 | |
| 113 | if (init_completed) |
| 114 | __devcd_del(devcd); |
| 115 | } |
| 116 | |
| 117 | static ssize_t devcd_data_read(struct file *filp, struct kobject *kobj, |
| 118 | const struct bin_attribute *bin_attr, |
| 119 | char *buffer, loff_t offset, size_t count) |
| 120 | { |
| 121 | struct device *dev = kobj_to_dev(kobj); |
| 122 | struct devcd_entry *devcd = dev_to_devcd(dev); |
| 123 | |
| 124 | return devcd->read(buffer, offset, count, devcd->data, devcd->datalen); |
| 125 | } |
| 126 | |
| 127 | static ssize_t devcd_data_write(struct file *filp, struct kobject *kobj, |
| 128 | const struct bin_attribute *bin_attr, |
| 129 | char *buffer, loff_t offset, size_t count) |
| 130 | { |
| 131 | struct device *dev = kobj_to_dev(kobj); |
| 132 | struct devcd_entry *devcd = dev_to_devcd(dev); |
| 133 | |
| 134 | /* |
| 135 | * Although it's tempting to use mod_delayed work here, |
| 136 | * that will cause a reschedule if the timer already fired. |
| 137 | */ |
| 138 | if (cancel_delayed_work(dwork: &devcd->del_wk)) |
| 139 | schedule_delayed_work(dwork: &devcd->del_wk, delay: 0); |
| 140 | |
| 141 | return count; |
| 142 | } |
| 143 | |
| 144 | static const struct bin_attribute devcd_attr_data = |
| 145 | __BIN_ATTR(data, 0600, devcd_data_read, devcd_data_write, 0); |
| 146 | |
| 147 | static const struct bin_attribute *const devcd_dev_bin_attrs[] = { |
| 148 | &devcd_attr_data, NULL, |
| 149 | }; |
| 150 | |
| 151 | static const struct attribute_group devcd_dev_group = { |
| 152 | .bin_attrs = devcd_dev_bin_attrs, |
| 153 | }; |
| 154 | |
| 155 | static const struct attribute_group *devcd_dev_groups[] = { |
| 156 | &devcd_dev_group, NULL, |
| 157 | }; |
| 158 | |
| 159 | static int devcd_free(struct device *dev, void *data) |
| 160 | { |
| 161 | struct devcd_entry *devcd = dev_to_devcd(dev); |
| 162 | |
| 163 | /* |
| 164 | * To prevent a race with devcd_data_write(), disable work and |
| 165 | * complete manually instead. |
| 166 | * |
| 167 | * We cannot rely on the return value of |
| 168 | * disable_delayed_work_sync() here, because it might be in the |
| 169 | * middle of a cancel_delayed_work + schedule_delayed_work pair. |
| 170 | * |
| 171 | * devcd->mutex here guards against multiple parallel invocations |
| 172 | * of devcd_free(). |
| 173 | */ |
| 174 | disable_delayed_work_sync(dwork: &devcd->del_wk); |
| 175 | mutex_lock(&devcd->mutex); |
| 176 | if (!devcd->deleted) |
| 177 | __devcd_del(devcd); |
| 178 | mutex_unlock(lock: &devcd->mutex); |
| 179 | return 0; |
| 180 | } |
| 181 | |
| 182 | static ssize_t disabled_show(const struct class *class, const struct class_attribute *attr, |
| 183 | char *buf) |
| 184 | { |
| 185 | return sysfs_emit(buf, fmt: "%d\n" , devcd_disabled); |
| 186 | } |
| 187 | |
/*
 *
 *	disabled_store()                              worker()
 *	 class_for_each_device(&devcd_class,
 *		NULL, NULL, devcd_free)
 *	   ...
 *	   ...
 *	   while ((dev = class_dev_iter_next(&iter))
 *	                                               devcd_del()
 *	                                                 device_del()
 *	                                                   put_device() <- last reference
 *	     error = fn(dev, data)                         devcd_dev_release()
 *	       devcd_free(dev, data)                         kfree(devcd)
 *
 *
 * In the above diagram, it looks as if disabled_store() races with a
 * devcd_del() running in parallel, and would cause a memory abort once the
 * worker drops the last reference with put_device(). However, this does not
 * happen: fn(dev, data) runs with its own reference to the device, taken via
 * the klist_node, so the worker's put_device() is not the last reference.
 * The situation shown above therefore cannot occur.
 */
| 209 | |
| 210 | static ssize_t disabled_store(const struct class *class, const struct class_attribute *attr, |
| 211 | const char *buf, size_t count) |
| 212 | { |
| 213 | long tmp = simple_strtol(buf, NULL, 10); |
| 214 | |
| 215 | /* |
| 216 | * This essentially makes the attribute write-once, since you can't |
| 217 | * go back to not having it disabled. This is intentional, it serves |
| 218 | * as a system lockdown feature. |
| 219 | */ |
| 220 | if (tmp != 1) |
| 221 | return -EINVAL; |
| 222 | |
| 223 | devcd_disabled = true; |
| 224 | |
| 225 | class_for_each_device(class: &devcd_class, NULL, NULL, fn: devcd_free); |
| 226 | |
| 227 | return count; |
| 228 | } |
| 229 | static CLASS_ATTR_RW(disabled); |
| 230 | |
| 231 | static struct attribute *devcd_class_attrs[] = { |
| 232 | &class_attr_disabled.attr, |
| 233 | NULL, |
| 234 | }; |
| 235 | ATTRIBUTE_GROUPS(devcd_class); |
| 236 | |
| 237 | static struct class devcd_class = { |
| 238 | .name = "devcoredump" , |
| 239 | .dev_release = devcd_dev_release, |
| 240 | .dev_groups = devcd_dev_groups, |
| 241 | .class_groups = devcd_class_groups, |
| 242 | }; |
| 243 | |
| 244 | static ssize_t devcd_readv(char *buffer, loff_t offset, size_t count, |
| 245 | void *data, size_t datalen) |
| 246 | { |
| 247 | return memory_read_from_buffer(to: buffer, count, ppos: &offset, from: data, available: datalen); |
| 248 | } |
| 249 | |
/* Free callback used by dev_coredumpv(): releases the dump data with vfree(). */
static void devcd_freev(void *data)
{
	vfree(data);
}
| 254 | |
| 255 | /** |
| 256 | * dev_coredumpv - create device coredump with vmalloc data |
| 257 | * @dev: the struct device for the crashed device |
| 258 | * @data: vmalloc data containing the device coredump |
| 259 | * @datalen: length of the data |
| 260 | * @gfp: allocation flags |
| 261 | * |
| 262 | * This function takes ownership of the vmalloc'ed data and will free |
| 263 | * it when it is no longer used. See dev_coredumpm() for more information. |
| 264 | */ |
| 265 | void dev_coredumpv(struct device *dev, void *data, size_t datalen, |
| 266 | gfp_t gfp) |
| 267 | { |
| 268 | dev_coredumpm(dev, NULL, data, datalen, gfp, read: devcd_readv, free: devcd_freev); |
| 269 | } |
| 270 | EXPORT_SYMBOL_GPL(dev_coredumpv); |
| 271 | |
| 272 | static int devcd_match_failing(struct device *dev, const void *failing) |
| 273 | { |
| 274 | struct devcd_entry *devcd = dev_to_devcd(dev); |
| 275 | |
| 276 | return devcd->failing_dev == failing; |
| 277 | } |
| 278 | |
/**
 * devcd_free_sgtable - free all the memory of the given scatterlist table
 * (i.e. both pages and scatterlist instances)
 * @data: pointer to sg_table to free
 *
 * NOTE: if two tables were allocated with devcd_alloc_sgtable and then
 * chained using the sg_chain function, this function should be called only
 * once, on the chained table.
 */
static void devcd_free_sgtable(void *data)
{
	struct scatterlist *table = data;

	_devcd_free_sgtable(table);
}
| 291 | |
| 292 | /** |
| 293 | * devcd_read_from_sgtable - copy data from sg_table to a given buffer |
| 294 | * and return the number of bytes read |
| 295 | * @buffer: the buffer to copy the data to it |
| 296 | * @buf_len: the length of the buffer |
| 297 | * @data: the scatterlist table to copy from |
| 298 | * @offset: start copy from @offset@ bytes from the head of the data |
| 299 | * in the given scatterlist |
| 300 | * @data_len: the length of the data in the sg_table |
| 301 | * |
| 302 | * Returns: the number of bytes copied |
| 303 | */ |
| 304 | static ssize_t devcd_read_from_sgtable(char *buffer, loff_t offset, |
| 305 | size_t buf_len, void *data, |
| 306 | size_t data_len) |
| 307 | { |
| 308 | struct scatterlist *table = data; |
| 309 | |
| 310 | if (offset > data_len) |
| 311 | return -EINVAL; |
| 312 | |
| 313 | if (offset + buf_len > data_len) |
| 314 | buf_len = data_len - offset; |
| 315 | return sg_pcopy_to_buffer(sgl: table, nents: sg_nents(sg: table), buf: buffer, buflen: buf_len, |
| 316 | skip: offset); |
| 317 | } |
| 318 | |
| 319 | /** |
| 320 | * dev_coredump_put - remove device coredump |
| 321 | * @dev: the struct device for the crashed device |
| 322 | * |
| 323 | * dev_coredump_put() removes coredump, if exists, for a given device from |
| 324 | * the file system and free its associated data otherwise, does nothing. |
| 325 | * |
| 326 | * It is useful for modules that do not want to keep coredump |
| 327 | * available after its unload. |
| 328 | */ |
| 329 | void dev_coredump_put(struct device *dev) |
| 330 | { |
| 331 | struct device *existing; |
| 332 | |
| 333 | existing = class_find_device(class: &devcd_class, NULL, data: dev, |
| 334 | match: devcd_match_failing); |
| 335 | if (existing) { |
| 336 | devcd_free(dev: existing, NULL); |
| 337 | put_device(dev: existing); |
| 338 | } |
| 339 | } |
| 340 | EXPORT_SYMBOL_GPL(dev_coredump_put); |
| 341 | |
| 342 | /** |
| 343 | * dev_coredumpm_timeout - create device coredump with read/free methods with a |
| 344 | * custom timeout. |
| 345 | * @dev: the struct device for the crashed device |
| 346 | * @owner: the module that contains the read/free functions, use %THIS_MODULE |
| 347 | * @data: data cookie for the @read/@free functions |
| 348 | * @datalen: length of the data |
| 349 | * @gfp: allocation flags |
| 350 | * @read: function to read from the given buffer |
| 351 | * @free: function to free the given buffer |
| 352 | * @timeout: time in jiffies to remove coredump |
| 353 | * |
| 354 | * Creates a new device coredump for the given device. If a previous one hasn't |
| 355 | * been read yet, the new coredump is discarded. The data lifetime is determined |
| 356 | * by the device coredump framework and when it is no longer needed the @free |
| 357 | * function will be called to free the data. |
| 358 | */ |
| 359 | void dev_coredumpm_timeout(struct device *dev, struct module *owner, |
| 360 | void *data, size_t datalen, gfp_t gfp, |
| 361 | ssize_t (*read)(char *buffer, loff_t offset, |
| 362 | size_t count, void *data, |
| 363 | size_t datalen), |
| 364 | void (*free)(void *data), |
| 365 | unsigned long timeout) |
| 366 | { |
| 367 | static atomic_t devcd_count = ATOMIC_INIT(0); |
| 368 | struct devcd_entry *devcd; |
| 369 | struct device *existing; |
| 370 | |
| 371 | if (devcd_disabled) |
| 372 | goto free; |
| 373 | |
| 374 | existing = class_find_device(class: &devcd_class, NULL, data: dev, |
| 375 | match: devcd_match_failing); |
| 376 | if (existing) { |
| 377 | put_device(dev: existing); |
| 378 | goto free; |
| 379 | } |
| 380 | |
| 381 | if (!try_module_get(module: owner)) |
| 382 | goto free; |
| 383 | |
| 384 | devcd = kzalloc(sizeof(*devcd), gfp); |
| 385 | if (!devcd) |
| 386 | goto put_module; |
| 387 | |
| 388 | devcd->owner = owner; |
| 389 | devcd->data = data; |
| 390 | devcd->datalen = datalen; |
| 391 | devcd->read = read; |
| 392 | devcd->free = free; |
| 393 | devcd->failing_dev = get_device(dev); |
| 394 | devcd->deleted = false; |
| 395 | |
| 396 | mutex_init(&devcd->mutex); |
| 397 | device_initialize(dev: &devcd->devcd_dev); |
| 398 | |
| 399 | dev_set_name(dev: &devcd->devcd_dev, name: "devcd%d" , |
| 400 | atomic_inc_return(v: &devcd_count)); |
| 401 | devcd->devcd_dev.class = &devcd_class; |
| 402 | |
| 403 | dev_set_uevent_suppress(dev: &devcd->devcd_dev, val: true); |
| 404 | |
| 405 | /* devcd->mutex prevents devcd_del() completing until init finishes */ |
| 406 | mutex_lock(&devcd->mutex); |
| 407 | devcd->init_completed = false; |
| 408 | INIT_DELAYED_WORK(&devcd->del_wk, devcd_del); |
| 409 | schedule_delayed_work(dwork: &devcd->del_wk, delay: timeout); |
| 410 | |
| 411 | if (device_add(dev: &devcd->devcd_dev)) |
| 412 | goto put_device; |
| 413 | |
| 414 | /* |
| 415 | * These should normally not fail, but there is no problem |
| 416 | * continuing without the links, so just warn instead of |
| 417 | * failing. |
| 418 | */ |
| 419 | if (sysfs_create_link(kobj: &devcd->devcd_dev.kobj, target: &dev->kobj, |
| 420 | name: "failing_device" ) || |
| 421 | sysfs_create_link(kobj: &dev->kobj, target: &devcd->devcd_dev.kobj, |
| 422 | name: "devcoredump" )) |
| 423 | dev_warn(dev, "devcoredump create_link failed\n" ); |
| 424 | |
| 425 | dev_set_uevent_suppress(dev: &devcd->devcd_dev, val: false); |
| 426 | kobject_uevent(kobj: &devcd->devcd_dev.kobj, action: KOBJ_ADD); |
| 427 | |
| 428 | /* |
| 429 | * Safe to run devcd_del() now that we are done with devcd_dev. |
| 430 | * Alternatively we could have taken a ref on devcd_dev before |
| 431 | * dropping the lock. |
| 432 | */ |
| 433 | devcd->init_completed = true; |
| 434 | mutex_unlock(lock: &devcd->mutex); |
| 435 | return; |
| 436 | put_device: |
| 437 | mutex_unlock(lock: &devcd->mutex); |
| 438 | cancel_delayed_work_sync(dwork: &devcd->del_wk); |
| 439 | put_device(dev: &devcd->devcd_dev); |
| 440 | |
| 441 | put_module: |
| 442 | module_put(module: owner); |
| 443 | free: |
| 444 | free(data); |
| 445 | } |
| 446 | EXPORT_SYMBOL_GPL(dev_coredumpm_timeout); |
| 447 | |
| 448 | /** |
| 449 | * dev_coredumpsg - create device coredump that uses scatterlist as data |
| 450 | * parameter |
| 451 | * @dev: the struct device for the crashed device |
| 452 | * @table: the dump data |
| 453 | * @datalen: length of the data |
| 454 | * @gfp: allocation flags |
| 455 | * |
| 456 | * Creates a new device coredump for the given device. If a previous one hasn't |
| 457 | * been read yet, the new coredump is discarded. The data lifetime is determined |
| 458 | * by the device coredump framework and when it is no longer needed |
| 459 | * it will free the data. |
| 460 | */ |
| 461 | void dev_coredumpsg(struct device *dev, struct scatterlist *table, |
| 462 | size_t datalen, gfp_t gfp) |
| 463 | { |
| 464 | dev_coredumpm(dev, NULL, data: table, datalen, gfp, read: devcd_read_from_sgtable, |
| 465 | free: devcd_free_sgtable); |
| 466 | } |
| 467 | EXPORT_SYMBOL_GPL(dev_coredumpsg); |
| 468 | |
| 469 | static int __init devcoredump_init(void) |
| 470 | { |
| 471 | return class_register(class: &devcd_class); |
| 472 | } |
| 473 | __initcall(devcoredump_init); |
| 474 | |
| 475 | static void __exit devcoredump_exit(void) |
| 476 | { |
| 477 | class_for_each_device(class: &devcd_class, NULL, NULL, fn: devcd_free); |
| 478 | class_unregister(class: &devcd_class); |
| 479 | } |
| 480 | __exitcall(devcoredump_exit); |
| 481 | |