device.c source code [linux/drivers/accel/habanalabs/common/device.c]

1	// SPDX-License-Identifier: GPL-2.0
2
3	/*
4	* Copyright 2016-2022 HabanaLabs, Ltd.
5	* All Rights Reserved.
6	*/
7
8	#define pr_fmt(fmt) "habanalabs: " fmt
9
10	#include <uapi/drm/habanalabs_accel.h>
11	#include "habanalabs.h"
12
13	#include <linux/pci.h>
14	#include <linux/hwmon.h>
15	#include <linux/vmalloc.h>
16
17	#include <drm/drm_accel.h>
18	#include <drm/drm_drv.h>
19
20	#include <trace/events/habanalabs.h>
21
#define HL_RESET_DELAY_USEC			10000	/* 10ms */

/* NOTE(review): presumably the device-release watchdog period, in seconds - confirm at use site */
#define HL_DEVICE_RELEASE_WATCHDOG_TIMEOUT_SEC	30

/* Selects which ASIC allocation/free callback the common DMA helpers dispatch to */
enum dma_alloc_type {
	DMA_ALLOC_COHERENT,
	DMA_ALLOC_POOL,
};

#define MEM_SCRUB_DEFAULT_VAL 0x1122334455667788

static void hl_device_heartbeat(struct work_struct *work);
34
35	/*
36	* hl_set_dram_bar- sets the bar to allow later access to address
37	*
38	* @hdev: pointer to habanalabs device structure.
39	* @addr: the address the caller wants to access.
40	* @region: the PCI region.
41	* @new_bar_region_base: the new BAR region base address.
42	*
43	* @return: the old BAR base address on success, U64_MAX for failure.
44	* The caller should set it back to the old address after use.
45	*
46	* In case the bar space does not cover the whole address space,
47	* the bar base address should be set to allow access to a given address.
48	* This function can be called also if the bar doesn't need to be set,
49	* in that case it just won't change the base.
50	*/
51	static u64 hl_set_dram_bar(struct hl_device hdev, u64 addr, struct* pci_mem_region *region,
52	u64 *new_bar_region_base)
53	{
54	struct asic_fixed_properties *prop = &hdev->asic_prop;
55	u64 bar_base_addr, old_base;
56
57	if (is_power_of_2(n: prop->dram_pci_bar_size))
58	bar_base_addr = addr & ~(prop->dram_pci_bar_size - `0x1ull`);
59	else
60	bar_base_addr = region->region_base +
61	div64_u64(dividend: (addr - region->region_base), divisor: prop->dram_pci_bar_size) *
62	prop->dram_pci_bar_size;
63
64	old_base = hdev->asic_funcs->set_dram_bar_base(hdev, bar_base_addr);
65
66	/ in case of success we need to update the new BAR base /
67	if ((old_base != U64_MAX) && new_bar_region_base)
68	*new_bar_region_base = bar_base_addr;
69
70	return old_base;
71	}
72
73	int hl_access_sram_dram_region(struct hl_device hdev, u64 addr, u64 val,
74	enum debugfs_access_type acc_type, enum pci_region region_type, bool set_dram_bar)
75	{
76	struct pci_mem_region *region = &hdev->pci_mem_region[region_type];
77	u64 old_base = `0`, rc, bar_region_base = region->region_base;
78	void __iomem *acc_addr;
79
80	if (set_dram_bar) {
81	old_base = hl_set_dram_bar(hdev, addr, region, new_bar_region_base: &bar_region_base);
82	if (old_base == U64_MAX)
83	return -EIO;
84	}
85
86	acc_addr = hdev->pcie_bar[region->bar_id] + region->offset_in_bar +
87	(addr - bar_region_base);
88
89	switch (acc_type) {
90	case DEBUGFS_READ8:
91	*val = readb(addr: acc_addr);
92	break;
93	case DEBUGFS_WRITE8:
94	writeb(val: *val, addr: acc_addr);
95	break;
96	case DEBUGFS_READ32:
97	*val = readl(addr: acc_addr);
98	break;
99	case DEBUGFS_WRITE32:
100	writel(val: *val, addr: acc_addr);
101	break;
102	case DEBUGFS_READ64:
103	*val = readq(addr: acc_addr);
104	break;
105	case DEBUGFS_WRITE64:
106	writeq(val: *val, addr: acc_addr);
107	break;
108	}
109
110	if (set_dram_bar) {
111	rc = hl_set_dram_bar(hdev, addr: old_base, region, NULL);
112	if (rc == U64_MAX)
113	return -EIO;
114	}
115
116	return `0`;
117	}
118
119	static void hl_dma_alloc_common(struct* hl_device hdev, size_t size, dma_addr_t dma_handle,
120	gfp_t flag, enum dma_alloc_type alloc_type,
121	const char *caller)
122	{
123	void *ptr = NULL;
124
125	switch (alloc_type) {
126	case DMA_ALLOC_COHERENT:
127	ptr = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, size, dma_handle, flag);
128	break;
129	case DMA_ALLOC_POOL:
130	ptr = hdev->asic_funcs->asic_dma_pool_zalloc(hdev, size, flag, dma_handle);
131	break;
132	}
133
134	if (trace_habanalabs_dma_alloc_enabled() && !ZERO_OR_NULL_PTR(ptr))
135	trace_habanalabs_dma_alloc(dev: &(hdev)->pdev->dev, cpu_addr: (u64) (uintptr_t) ptr, dma_addr: *dma_handle,
136	size, caller);
137
138	return ptr;
139	}
140
141	static void hl_asic_dma_free_common(struct hl_device hdev, size_t size, void* *cpu_addr,
142	dma_addr_t dma_handle, enum dma_alloc_type alloc_type,
143	const char *caller)
144	{
145	/ this is needed to avoid warning on using freed pointer /
146	u64 store_cpu_addr = (u64) (uintptr_t) cpu_addr;
147
148	switch (alloc_type) {
149	case DMA_ALLOC_COHERENT:
150	hdev->asic_funcs->asic_dma_free_coherent(hdev, size, cpu_addr, dma_handle);
151	break;
152	case DMA_ALLOC_POOL:
153	hdev->asic_funcs->asic_dma_pool_free(hdev, cpu_addr, dma_handle);
154	break;
155	}
156
157	trace_habanalabs_dma_free(dev: &(hdev)->pdev->dev, cpu_addr: store_cpu_addr, dma_addr: dma_handle, size, caller);
158	}
159
160	void hl_asic_dma_alloc_coherent_caller(struct* hl_device hdev, size_t size, dma_addr_t dma_handle,
161	gfp_t flag, const char *caller)
162	{
163	return hl_dma_alloc_common(hdev, size, dma_handle, flag, alloc_type: DMA_ALLOC_COHERENT, caller);
164	}
165
166	void hl_asic_dma_free_coherent_caller(struct hl_device hdev, size_t size, void* *cpu_addr,
167	dma_addr_t dma_handle, const char *caller)
168	{
169	hl_asic_dma_free_common(hdev, size, cpu_addr, dma_handle, alloc_type: DMA_ALLOC_COHERENT, caller);
170	}
171
172	void hl_asic_dma_pool_zalloc_caller(struct* hl_device *hdev, size_t size, gfp_t mem_flags,
173	dma_addr_t dma_handle, const* char *caller)
174	{
175	return hl_dma_alloc_common(hdev, size, dma_handle, flag: mem_flags, alloc_type: DMA_ALLOC_POOL, caller);
176	}
177
178	void hl_asic_dma_pool_free_caller(struct hl_device hdev, void* *vaddr, dma_addr_t dma_addr,
179	const char *caller)
180	{
181	hl_asic_dma_free_common(hdev, size: `0`, cpu_addr: vaddr, dma_handle: dma_addr, alloc_type: DMA_ALLOC_POOL, caller);
182	}
183
184	void hl_cpu_accessible_dma_pool_alloc(struct* hl_device hdev, size_t size, dma_addr_t dma_handle)
185	{
186	return hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, size, dma_handle);
187	}
188
189	void hl_cpu_accessible_dma_pool_free(struct hl_device hdev, size_t size, void* *vaddr)
190	{
191	hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, size, vaddr);
192	}
193
194	int hl_dma_map_sgtable_caller(struct hl_device hdev, struct* sg_table *sgt,
195	enum dma_data_direction dir, const char *caller)
196	{
197	struct asic_fixed_properties *prop = &hdev->asic_prop;
198	struct scatterlist *sg;
199	int rc, i;
200
201	rc = hdev->asic_funcs->dma_map_sgtable(hdev, sgt, dir);
202	if (rc)
203	return rc;
204
205	if (!trace_habanalabs_dma_map_page_enabled())
206	return `0`;
207
208	for_each_sgtable_dma_sg(sgt, sg, i)
209	trace_habanalabs_dma_map_page(dev: &(hdev)->pdev->dev,
210	page_to_phys(sg_page(sg)),
211	dma_addr: sg->dma_address - prop->device_dma_offset_for_host_access,
212	#ifdef CONFIG_NEED_SG_DMA_LENGTH
213	len: sg->dma_length,
214	#else
215	sg->length,
216	#endif
217	dir, caller);
218
219	return `0`;
220	}
221
222	int hl_asic_dma_map_sgtable(struct hl_device hdev, struct* sg_table *sgt,
223	enum dma_data_direction dir)
224	{
225	struct asic_fixed_properties *prop = &hdev->asic_prop;
226	struct scatterlist *sg;
227	int rc, i;
228
229	rc = dma_map_sgtable(dev: &hdev->pdev->dev, sgt, dir, attrs: `0`);
230	if (rc)
231	return rc;
232
233	/ Shift to the device's base physical address of host memory if necessary /
234	if (prop->device_dma_offset_for_host_access)
235	for_each_sgtable_dma_sg(sgt, sg, i)
236	sg->dma_address += prop->device_dma_offset_for_host_access;
237
238	return `0`;
239	}
240
241	void hl_dma_unmap_sgtable_caller(struct hl_device hdev, struct* sg_table *sgt,
242	enum dma_data_direction dir, const char *caller)
243	{
244	struct asic_fixed_properties *prop = &hdev->asic_prop;
245	struct scatterlist *sg;
246	int i;
247
248	hdev->asic_funcs->dma_unmap_sgtable(hdev, sgt, dir);
249
250	if (trace_habanalabs_dma_unmap_page_enabled()) {
251	for_each_sgtable_dma_sg(sgt, sg, i)
252	trace_habanalabs_dma_unmap_page(dev: &(hdev)->pdev->dev,
253	page_to_phys(sg_page(sg)),
254	dma_addr: sg->dma_address - prop->device_dma_offset_for_host_access,
255	#ifdef CONFIG_NEED_SG_DMA_LENGTH
256	len: sg->dma_length,
257	#else
258	sg->length,
259	#endif
260	dir, caller);
261	}
262	}
263
264	void hl_asic_dma_unmap_sgtable(struct hl_device hdev, struct* sg_table *sgt,
265	enum dma_data_direction dir)
266	{
267	struct asic_fixed_properties *prop = &hdev->asic_prop;
268	struct scatterlist *sg;
269	int i;
270
271	/ Cancel the device's base physical address of host memory if necessary /
272	if (prop->device_dma_offset_for_host_access)
273	for_each_sgtable_dma_sg(sgt, sg, i)
274	sg->dma_address -= prop->device_dma_offset_for_host_access;
275
276	dma_unmap_sgtable(dev: &hdev->pdev->dev, sgt, dir, attrs: `0`);
277	}
278
279	/*
280	* hl_access_cfg_region - access the config region
281	*
282	* @hdev: pointer to habanalabs device structure
283	* @addr: the address to access
284	* @val: the value to write from or read to
285	* @acc_type: the type of access (read/write 64/32)
286	*/
287	int hl_access_cfg_region(struct hl_device hdev, u64 addr, u64 val,
288	enum debugfs_access_type acc_type)
289	{
290	struct pci_mem_region *cfg_region = &hdev->pci_mem_region[PCI_REGION_CFG];
291	u32 val_h, val_l;
292
293	if (!IS_ALIGNED(addr, sizeof(u32))) {
294	dev_err(hdev->dev, "address %#llx not a multiple of %zu\n", addr, sizeof(u32));
295	return -EINVAL;
296	}
297
298	switch (acc_type) {
299	case DEBUGFS_READ32:
300	*val = RREG32(addr - cfg_region->region_base);
301	break;
302	case DEBUGFS_WRITE32:
303	WREG32(addr - cfg_region->region_base, *val);
304	break;
305	case DEBUGFS_READ64:
306	val_l = RREG32(addr - cfg_region->region_base);
307	val_h = RREG32(addr + sizeof(u32) - cfg_region->region_base);
308
309	*val = (((u64) val_h) << `32`) \| val_l;
310	break;
311	case DEBUGFS_WRITE64:
312	WREG32(addr - cfg_region->region_base, lower_32_bits(*val));
313	WREG32(addr + sizeof(u32) - cfg_region->region_base, upper_32_bits(*val));
314	break;
315	default:
316	dev_err(hdev->dev, "access type %d is not supported\n", acc_type);
317	return -EOPNOTSUPP;
318	}
319
320	return `0`;
321	}
322
323	/*
324	* hl_access_dev_mem - access device memory
325	*
326	* @hdev: pointer to habanalabs device structure
327	* @region_type: the type of the region the address belongs to
328	* @addr: the address to access
329	* @val: the value to write from or read to
330	* @acc_type: the type of access (r/w, 32/64)
331	*/
332	int hl_access_dev_mem(struct hl_device hdev, enum* pci_region region_type,
333	u64 addr, u64 val, enum* debugfs_access_type acc_type)
334	{
335	switch (region_type) {
336	case PCI_REGION_CFG:
337	return hl_access_cfg_region(hdev, addr, val, acc_type);
338	case PCI_REGION_SRAM:
339	case PCI_REGION_DRAM:
340	return hl_access_sram_dram_region(hdev, addr, val, acc_type,
341	region_type, set_dram_bar: (region_type == PCI_REGION_DRAM));
342	default:
343	return -EFAULT;
344	}
345
346	return `0`;
347	}
348
349	void hl_engine_data_sprintf(struct engines_data e, const* char *fmt, ...)
350	{
351	va_list args;
352	int str_size;
353
354	va_start(args, fmt);
355	/ Calculate formatted string length. Assuming each string is null terminated, hence*
356	* increment result by 1
357	*/
358	str_size = vsnprintf(NULL, size: `0`, fmt, args) + `1`;
359	va_end(args);
360
361	if ((e->actual_size + str_size) < e->allocated_buf_size) {
362	va_start(args, fmt);
363	vsnprintf(buf: e->buf + e->actual_size, size: str_size, fmt, args);
364	va_end(args);
365	}
366
367	/ Need to update the size even when not updating destination buffer to get the exact size*
368	* of all input strings
369	*/
370	e->actual_size += str_size;
371	}
372
373	enum hl_device_status hl_device_status(struct hl_device *hdev)
374	{
375	enum hl_device_status status;
376
377	if (hdev->device_fini_pending) {
378	status = HL_DEVICE_STATUS_MALFUNCTION;
379	} else if (hdev->reset_info.in_reset) {
380	if (hdev->reset_info.in_compute_reset)
381	status = HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE;
382	else
383	status = HL_DEVICE_STATUS_IN_RESET;
384	} else if (hdev->reset_info.needs_reset) {
385	status = HL_DEVICE_STATUS_NEEDS_RESET;
386	} else if (hdev->disabled) {
387	status = HL_DEVICE_STATUS_MALFUNCTION;
388	} else if (!hdev->init_done) {
389	status = HL_DEVICE_STATUS_IN_DEVICE_CREATION;
390	} else {
391	status = HL_DEVICE_STATUS_OPERATIONAL;
392	}
393
394	return status;
395	}
396
397	bool hl_device_operational(struct hl_device *hdev,
398	enum hl_device_status *status)
399	{
400	enum hl_device_status current_status;
401
402	current_status = hl_device_status(hdev);
403	if (status)
404	*status = current_status;
405
406	switch (current_status) {
407	case HL_DEVICE_STATUS_MALFUNCTION:
408	case HL_DEVICE_STATUS_IN_RESET:
409	case HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE:
410	case HL_DEVICE_STATUS_NEEDS_RESET:
411	return false;
412	case HL_DEVICE_STATUS_OPERATIONAL:
413	case HL_DEVICE_STATUS_IN_DEVICE_CREATION:
414	default:
415	return true;
416	}
417	}
418
419	bool hl_ctrl_device_operational(struct hl_device *hdev,
420	enum hl_device_status *status)
421	{
422	enum hl_device_status current_status;
423
424	current_status = hl_device_status(hdev);
425	if (status)
426	*status = current_status;
427
428	switch (current_status) {
429	case HL_DEVICE_STATUS_MALFUNCTION:
430	return false;
431	case HL_DEVICE_STATUS_IN_RESET:
432	case HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE:
433	case HL_DEVICE_STATUS_NEEDS_RESET:
434	case HL_DEVICE_STATUS_OPERATIONAL:
435	case HL_DEVICE_STATUS_IN_DEVICE_CREATION:
436	default:
437	return true;
438	}
439	}
440
441	static void print_idle_status_mask(struct hl_device hdev, const* char *message,
442	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE])
443	{
444	if (idle_mask[`3`])
445	dev_err(hdev->dev, "%s %s (mask %#llx_%016llx_%016llx_%016llx)\n",
446	dev_name(&hdev->pdev->dev), message,
447	idle_mask[`3`], idle_mask[`2`], idle_mask[`1`], idle_mask[`0`]);
448	else if (idle_mask[`2`])
449	dev_err(hdev->dev, "%s %s (mask %#llx_%016llx_%016llx)\n",
450	dev_name(&hdev->pdev->dev), message,
451	idle_mask[`2`], idle_mask[`1`], idle_mask[`0`]);
452	else if (idle_mask[`1`])
453	dev_err(hdev->dev, "%s %s (mask %#llx_%016llx)\n",
454	dev_name(&hdev->pdev->dev), message, idle_mask[`1`], idle_mask[`0`]);
455	else
456	dev_err(hdev->dev, "%s %s (mask %#llx)\n", dev_name(&hdev->pdev->dev), message,
457	idle_mask[`0`]);
458	}
459
460	static void hpriv_release(struct kref *ref)
461	{
462	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {`0`};
463	bool reset_device, device_is_idle = true;
464	struct hl_fpriv *hpriv;
465	struct hl_device *hdev;
466
467	hpriv = container_of(ref, struct hl_fpriv, refcount);
468
469	hdev = hpriv->hdev;
470
471	hdev->asic_funcs->send_device_activity(hdev, false);
472
473	hl_debugfs_remove_file(hpriv);
474
475	mutex_destroy(lock: &hpriv->ctx_lock);
476	mutex_destroy(lock: &hpriv->restore_phase_mutex);
477
478	/ There should be no memory buffers at this point and handles IDR can be destroyed /
479	hl_mem_mgr_idr_destroy(mmg: &hpriv->mem_mgr);
480
481	/ Device should be reset if reset-upon-device-release is enabled, or if there is a pending*
482	* reset that waits for device release.
483	*/
484	reset_device = hdev->reset_upon_device_release \|\| hdev->reset_info.watchdog_active;
485
486	/ Check the device idle status and reset if not idle.*
487	* Skip it if already in reset, or if device is going to be reset in any case.
488	*/
489	if (!hdev->reset_info.in_reset && !reset_device && !hdev->pldm)
490	device_is_idle = hdev->asic_funcs->is_device_idle(hdev, idle_mask,
491	HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL);
492	if (!device_is_idle) {
493	print_idle_status_mask(hdev, message: "device is not idle after user context is closed",
494	idle_mask);
495	reset_device = true;
496	}
497
498	/ We need to remove the user from the list to make sure the reset process won't*
499	* try to kill the user process. Because, if we got here, it means there are no
500	* more driver/device resources that the user process is occupying so there is
501	* no need to kill it
502	*
503	* However, we can't set the compute_ctx to NULL at this stage. This is to prevent
504	* a race between the release and opening the device again. We don't want to let
505	* a user open the device while there a reset is about to happen.
506	*/
507	mutex_lock(&hdev->fpriv_list_lock);
508	list_del(entry: &hpriv->dev_node);
509	mutex_unlock(lock: &hdev->fpriv_list_lock);
510
511	put_pid(pid: hpriv->taskpid);
512
513	if (reset_device) {
514	hl_device_reset(hdev, HL_DRV_RESET_DEV_RELEASE);
515	} else {
516	/ Scrubbing is handled within hl_device_reset(), so here need to do it directly /
517	int rc = hdev->asic_funcs->scrub_device_mem(hdev);
518
519	if (rc) {
520	dev_err(hdev->dev, "failed to scrub memory from hpriv release (%d)\n", rc);
521	hl_device_reset(hdev, HL_DRV_RESET_HARD);
522	}
523	}
524
525	/ Now we can mark the compute_ctx as not active. Even if a reset is running in a different*
526	* thread, we don't care because the in_reset is marked so if a user will try to open
527	* the device it will fail on that, even if compute_ctx is false.
528	*/
529	mutex_lock(&hdev->fpriv_list_lock);
530	hdev->is_compute_ctx_active = false;
531	mutex_unlock(lock: &hdev->fpriv_list_lock);
532
533	hdev->compute_ctx_in_release = `0`;
534
535	/ release the eventfd /
536	if (hpriv->notifier_event.eventfd)
537	eventfd_ctx_put(ctx: hpriv->notifier_event.eventfd);
538
539	mutex_destroy(lock: &hpriv->notifier_event.lock);
540
541	kfree(objp: hpriv);
542	}
543
544	void hl_hpriv_get(struct hl_fpriv *hpriv)
545	{
546	kref_get(kref: &hpriv->refcount);
547	}
548
549	int hl_hpriv_put(struct hl_fpriv *hpriv)
550	{
551	return kref_put(kref: &hpriv->refcount, release: hpriv_release);
552	}
553
554	static void print_device_in_use_info(struct hl_device *hdev,
555	struct hl_mem_mgr_fini_stats mm_fini_stats, const* char *message)
556	{
557	u32 active_cs_num, dmabuf_export_cnt;
558	bool unknown_reason = true;
559	char buf[`128`];
560	size_t size;
561	int offset;
562
563	size = sizeof(buf);
564	offset = `0`;
565
566	active_cs_num = hl_get_active_cs_num(hdev);
567	if (active_cs_num) {
568	unknown_reason = false;
569	offset += scnprintf(buf: buf + offset, size: size - offset, fmt: " [%u active CS]", active_cs_num);
570	}
571
572	dmabuf_export_cnt = atomic_read(v: &hdev->dmabuf_export_cnt);
573	if (dmabuf_export_cnt) {
574	unknown_reason = false;
575	offset += scnprintf(buf: buf + offset, size: size - offset, fmt: " [%u exported dma-buf]",
576	dmabuf_export_cnt);
577	}
578
579	if (mm_fini_stats->n_busy_cb) {
580	unknown_reason = false;
581	offset += scnprintf(buf: buf + offset, size: size - offset, fmt: " [%u live CB handles]",
582	mm_fini_stats->n_busy_cb);
583	}
584
585	if (unknown_reason)
586	scnprintf(buf: buf + offset, size: size - offset, fmt: " [unknown reason]");
587
588	dev_notice(hdev->dev, "%s%s\n", message, buf);
589	}
590
591	/*
592	* hl_device_release() - release function for habanalabs device.
593	* @ddev: pointer to DRM device structure.
594	* @file: pointer to DRM file private data structure.
595	*
596	* Called when process closes an habanalabs device
597	*/
598	void hl_device_release(struct drm_device ddev, struct* drm_file *file_priv)
599	{
600	struct hl_fpriv *hpriv = file_priv->driver_priv;
601	struct hl_device *hdev = to_hl_device(ddev);
602	struct hl_mem_mgr_fini_stats mm_fini_stats;
603
604	if (!hdev) {
605	pr_crit("Closing FD after device was removed. Memory leak will occur and it is advised to reboot.\n");
606	put_pid(pid: hpriv->taskpid);
607	}
608
609	hl_ctx_mgr_fini(hdev, mgr: &hpriv->ctx_mgr);
610
611	/ Memory buffers might be still in use at this point and thus the handles IDR destruction*
612	* is postponed to hpriv_release().
613	*/
614	hl_mem_mgr_fini(mmg: &hpriv->mem_mgr, stats: &mm_fini_stats);
615
616	hdev->compute_ctx_in_release = `1`;
617
618	if (!hl_hpriv_put(hpriv)) {
619	print_device_in_use_info(hdev, mm_fini_stats: &mm_fini_stats,
620	message: "User process closed FD but device still in use");
621	hl_device_reset(hdev, HL_DRV_RESET_HARD);
622	}
623
624	hdev->last_open_session_duration_jif = jiffies - hdev->last_successful_open_jif;
625	}
626
627	static int hl_device_release_ctrl(struct inode inode, struct* file *filp)
628	{
629	struct hl_fpriv *hpriv = filp->private_data;
630	struct hl_device *hdev = hpriv->hdev;
631
632	filp->private_data = NULL;
633
634	if (!hdev) {
635	pr_err("Closing FD after device was removed\n");
636	goto out;
637	}
638
639	mutex_lock(&hdev->fpriv_ctrl_list_lock);
640	list_del(entry: &hpriv->dev_node);
641	mutex_unlock(lock: &hdev->fpriv_ctrl_list_lock);
642	out:
643	put_pid(pid: hpriv->taskpid);
644
645	kfree(objp: hpriv);
646
647	return `0`;
648	}
649
650	static int __hl_mmap(struct hl_fpriv hpriv, struct* vm_area_struct *vma)
651	{
652	struct hl_device *hdev = hpriv->hdev;
653	unsigned long vm_pgoff;
654
655	if (!hdev) {
656	pr_err_ratelimited("Trying to mmap after device was removed! Please close FD\n");
657	return -ENODEV;
658	}
659
660	vm_pgoff = vma->vm_pgoff;
661
662	switch (vm_pgoff & HL_MMAP_TYPE_MASK) {
663	case HL_MMAP_TYPE_BLOCK:
664	vma->vm_pgoff = HL_MMAP_OFFSET_VALUE_GET(vm_pgoff);
665	return hl_hw_block_mmap(hpriv, vma);
666
667	case HL_MMAP_TYPE_CB:
668	case HL_MMAP_TYPE_TS_BUFF:
669	return hl_mem_mgr_mmap(mmg: &hpriv->mem_mgr, vma, NULL);
670	}
671	return -EINVAL;
672	}
673
674	/*
675	* hl_mmap - mmap function for habanalabs device
676	*
677	* @*filp: pointer to file structure
678	* @*vma: pointer to vm_area_struct of the process
679	*
680	* Called when process does an mmap on habanalabs device. Call the relevant mmap
681	* function at the end of the common code.
682	*/
683	int hl_mmap(struct file filp, struct* vm_area_struct *vma)
684	{
685	struct drm_file *file_priv = filp->private_data;
686	struct hl_fpriv *hpriv = file_priv->driver_priv;
687
688	return __hl_mmap(hpriv, vma);
689	}
690
691	static const struct file_operations hl_ctrl_ops = {
692	.owner = THIS_MODULE,
693	.open = hl_device_open_ctrl,
694	.release = hl_device_release_ctrl,
695	.unlocked_ioctl = hl_ioctl_control,
696	.compat_ioctl = hl_ioctl_control
697	};
698
/* Release callback of the struct device allocated in device_init_cdev(): frees it */
static void device_release_func(struct device *dev)
{
	kfree(dev);
}
703
704	/*
705	* device_init_cdev - Initialize cdev and device for habanalabs device
706	*
707	* @hdev: pointer to habanalabs device structure
708	* @class: pointer to the class object of the device
709	* @minor: minor number of the specific device
710	* @fops: file operations to install for this device
711	* @name: name of the device as it will appear in the filesystem
712	* @cdev: pointer to the char device object that will be initialized
713	* @dev: pointer to the device object that will be initialized
714	*
715	* Initialize a cdev and a Linux device for habanalabs's device.
716	*/
717	static int device_init_cdev(struct hl_device hdev, const* struct class *class,
718	int minor, const struct file_operations *fops,
719	char name, struct* cdev *cdev,
720	struct device **dev)
721	{
722	cdev_init(cdev, fops);
723	cdev->owner = THIS_MODULE;
724
725	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
726	if (!*dev)
727	return -ENOMEM;
728
729	device_initialize(dev: *dev);
730	(*dev)->devt = MKDEV(hdev->major, minor);
731	(*dev)->class = class;
732	(*dev)->release = device_release_func;
733	dev_set_drvdata(dev: *dev, data: hdev);
734	dev_set_name(dev: *dev, name: "%s", name);
735
736	return `0`;
737	}
738
739	static int cdev_sysfs_debugfs_add(struct hl_device *hdev)
740	{
741	const struct class *accel_class = hdev->drm.accel->kdev->class;
742	char name[`32`];
743	int rc;
744
745	hdev->cdev_idx = hdev->drm.accel->index;
746
747	/ Initialize cdev and device structures for the control device /
748	snprintf(buf: name, size: sizeof(name), fmt: "accel_controlD%d", hdev->cdev_idx);
749	rc = device_init_cdev(hdev, class: accel_class, minor: hdev->cdev_idx, fops: &hl_ctrl_ops, name,
750	cdev: &hdev->cdev_ctrl, dev: &hdev->dev_ctrl);
751	if (rc)
752	return rc;
753
754	rc = cdev_device_add(cdev: &hdev->cdev_ctrl, dev: hdev->dev_ctrl);
755	if (rc) {
756	dev_err(hdev->dev_ctrl,
757	"failed to add an accel control char device to the system\n");
758	goto free_ctrl_device;
759	}
760
761	rc = hl_sysfs_init(hdev);
762	if (rc) {
763	dev_err(hdev->dev, "failed to initialize sysfs\n");
764	goto delete_ctrl_cdev_device;
765	}
766
767	hl_debugfs_add_device(hdev);
768
769	hdev->cdev_sysfs_debugfs_created = true;
770
771	return `0`;
772
773	delete_ctrl_cdev_device:
774	cdev_device_del(cdev: &hdev->cdev_ctrl, dev: hdev->dev_ctrl);
775	free_ctrl_device:
776	put_device(dev: hdev->dev_ctrl);
777	return rc;
778	}
779
780	static void cdev_sysfs_debugfs_remove(struct hl_device *hdev)
781	{
782	if (!hdev->cdev_sysfs_debugfs_created)
783	return;
784
785	hl_sysfs_fini(hdev);
786
787	cdev_device_del(cdev: &hdev->cdev_ctrl, dev: hdev->dev_ctrl);
788	put_device(dev: hdev->dev_ctrl);
789	}
790
791	static void device_hard_reset_pending(struct work_struct *work)
792	{
793	struct hl_device_reset_work *device_reset_work =
794	container_of(work, struct hl_device_reset_work, reset_work.work);
795	struct hl_device *hdev = device_reset_work->hdev;
796	u32 flags;
797	int rc;
798
799	flags = device_reset_work->flags \| HL_DRV_RESET_FROM_RESET_THR;
800
801	rc = hl_device_reset(hdev, flags);
802
803	if ((rc == -EBUSY) && !hdev->device_fini_pending) {
804	struct hl_ctx *ctx = hl_get_compute_ctx(hdev);
805
806	if (ctx) {
807	/ The read refcount value should subtracted by one, because the read is*
808	* protected with hl_get_compute_ctx().
809	*/
810	dev_info(hdev->dev,
811	"Could not reset device (compute_ctx refcount %u). will try again in %u seconds",
812	kref_read(&ctx->refcount) - `1`, HL_PENDING_RESET_PER_SEC);
813	hl_ctx_put(ctx);
814	} else {
815	dev_info(hdev->dev, "Could not reset device. will try again in %u seconds",
816	HL_PENDING_RESET_PER_SEC);
817	}
818
819	queue_delayed_work(wq: hdev->reset_wq, dwork: &device_reset_work->reset_work,
820	secs_to_jiffies(HL_PENDING_RESET_PER_SEC));
821	}
822	}
823
824	static void device_release_watchdog_func(struct work_struct *work)
825	{
826	struct hl_device_reset_work *watchdog_work =
827	container_of(work, struct hl_device_reset_work, reset_work.work);
828	struct hl_device *hdev = watchdog_work->hdev;
829	u32 flags;
830
831	dev_dbg(hdev->dev, "Device wasn't released in time. Initiate hard-reset.\n");
832
833	flags = watchdog_work->flags \| HL_DRV_RESET_HARD \| HL_DRV_RESET_FROM_WD_THR;
834
835	hl_device_reset(hdev, flags);
836	}
837
838	/*
839	* device_early_init - do some early initialization for the habanalabs device
840	*
841	* @hdev: pointer to habanalabs device structure
842	*
843	* Install the relevant function pointers and call the early_init function,
844	* if such a function exists
845	*/
846	static int device_early_init(struct hl_device *hdev)
847	{
848	int i, rc;
849	char workq_name[`32`];
850
851	switch (hdev->asic_type) {
852	case ASIC_GOYA:
853	goya_set_asic_funcs(hdev);
854	strscpy(hdev->asic_name, "GOYA", sizeof(hdev->asic_name));
855	break;
856	case ASIC_GAUDI:
857	gaudi_set_asic_funcs(hdev);
858	strscpy(hdev->asic_name, "GAUDI", sizeof(hdev->asic_name));
859	break;
860	case ASIC_GAUDI_SEC:
861	gaudi_set_asic_funcs(hdev);
862	strscpy(hdev->asic_name, "GAUDI SEC", sizeof(hdev->asic_name));
863	break;
864	case ASIC_GAUDI2:
865	gaudi2_set_asic_funcs(hdev);
866	strscpy(hdev->asic_name, "GAUDI2", sizeof(hdev->asic_name));
867	break;
868	case ASIC_GAUDI2B:
869	gaudi2_set_asic_funcs(hdev);
870	strscpy(hdev->asic_name, "GAUDI2B", sizeof(hdev->asic_name));
871	break;
872	case ASIC_GAUDI2C:
873	gaudi2_set_asic_funcs(hdev);
874	strscpy(hdev->asic_name, "GAUDI2C", sizeof(hdev->asic_name));
875	break;
876	case ASIC_GAUDI2D:
877	gaudi2_set_asic_funcs(hdev);
878	strscpy(hdev->asic_name, "GAUDI2D", sizeof(hdev->asic_name));
879	break;
880	default:
881	dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
882	hdev->asic_type);
883	return -EINVAL;
884	}
885
886	rc = hdev->asic_funcs->early_init(hdev);
887	if (rc)
888	return rc;
889
890	rc = hl_asid_init(hdev);
891	if (rc)
892	goto early_fini;
893
894	if (hdev->asic_prop.completion_queues_count) {
895	hdev->cq_wq = kcalloc(hdev->asic_prop.completion_queues_count,
896	sizeof(struct workqueue_struct *),
897	GFP_KERNEL);
898	if (!hdev->cq_wq) {
899	rc = -ENOMEM;
900	goto asid_fini;
901	}
902	}
903
904	for (i = `0` ; i < hdev->asic_prop.completion_queues_count ; i++) {
905	snprintf(buf: workq_name, size: `32`, fmt: "hl%u-free-jobs-%u", hdev->cdev_idx, (u32) i);
906	hdev->cq_wq[i] = create_singlethread_workqueue(workq_name);
907	if (hdev->cq_wq[i] == NULL) {
908	dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
909	rc = -ENOMEM;
910	goto free_cq_wq;
911	}
912	}
913
914	snprintf(buf: workq_name, size: `32`, fmt: "hl%u-events", hdev->cdev_idx);
915	hdev->eq_wq = create_singlethread_workqueue(workq_name);
916	if (hdev->eq_wq == NULL) {
917	dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
918	rc = -ENOMEM;
919	goto free_cq_wq;
920	}
921
922	snprintf(buf: workq_name, size: `32`, fmt: "hl%u-cs-completions", hdev->cdev_idx);
923	hdev->cs_cmplt_wq = alloc_workqueue(workq_name, WQ_UNBOUND, `0`);
924	if (!hdev->cs_cmplt_wq) {
925	dev_err(hdev->dev,
926	"Failed to allocate CS completions workqueue\n");
927	rc = -ENOMEM;
928	goto free_eq_wq;
929	}
930
931	snprintf(buf: workq_name, size: `32`, fmt: "hl%u-ts-free-obj", hdev->cdev_idx);
932	hdev->ts_free_obj_wq = alloc_workqueue(workq_name, WQ_UNBOUND, `0`);
933	if (!hdev->ts_free_obj_wq) {
934	dev_err(hdev->dev,
935	"Failed to allocate Timestamp registration free workqueue\n");
936	rc = -ENOMEM;
937	goto free_cs_cmplt_wq;
938	}
939
940	snprintf(buf: workq_name, size: `32`, fmt: "hl%u-prefetch", hdev->cdev_idx);
941	hdev->prefetch_wq = alloc_workqueue(workq_name, WQ_UNBOUND, `0`);
942	if (!hdev->prefetch_wq) {
943	dev_err(hdev->dev, "Failed to allocate MMU prefetch workqueue\n");
944	rc = -ENOMEM;
945	goto free_ts_free_wq;
946	}
947
948	hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info), GFP_KERNEL);
949	if (!hdev->hl_chip_info) {
950	rc = -ENOMEM;
951	goto free_prefetch_wq;
952	}
953
954	rc = hl_mmu_if_set_funcs(hdev);
955	if (rc)
956	goto free_chip_info;
957
958	hl_mem_mgr_init(dev: hdev->dev, mmg: &hdev->kernel_mem_mgr);
959
960	snprintf(buf: workq_name, size: `32`, fmt: "hl%u_device_reset", hdev->cdev_idx);
961	hdev->reset_wq = create_singlethread_workqueue(workq_name);
962	if (!hdev->reset_wq) {
963	rc = -ENOMEM;
964	dev_err(hdev->dev, "Failed to create device reset WQ\n");
965	goto free_cb_mgr;
966	}
967
968	INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat);
969
970	INIT_DELAYED_WORK(&hdev->device_reset_work.reset_work, device_hard_reset_pending);
971	hdev->device_reset_work.hdev = hdev;
972	hdev->device_fini_pending = `0`;
973
974	INIT_DELAYED_WORK(&hdev->device_release_watchdog_work.reset_work,
975	device_release_watchdog_func);
976	hdev->device_release_watchdog_work.hdev = hdev;
977
978	mutex_init(&hdev->send_cpu_message_lock);
979	mutex_init(&hdev->debug_lock);
980	INIT_LIST_HEAD(list: &hdev->cs_mirror_list);
981	spin_lock_init(&hdev->cs_mirror_lock);
982	spin_lock_init(&hdev->reset_info.lock);
983	INIT_LIST_HEAD(list: &hdev->fpriv_list);
984	INIT_LIST_HEAD(list: &hdev->fpriv_ctrl_list);
985	mutex_init(&hdev->fpriv_list_lock);
986	mutex_init(&hdev->fpriv_ctrl_list_lock);
987	mutex_init(&hdev->clk_throttling.lock);
988
989	return `0`;
990
991	free_cb_mgr:
992	hl_mem_mgr_fini(mmg: &hdev->kernel_mem_mgr, NULL);
993	hl_mem_mgr_idr_destroy(mmg: &hdev->kernel_mem_mgr);
994	free_chip_info:
995	kfree(objp: hdev->hl_chip_info);
996	free_prefetch_wq:
997	destroy_workqueue(wq: hdev->prefetch_wq);
998	free_ts_free_wq:
999	destroy_workqueue(wq: hdev->ts_free_obj_wq);
1000	free_cs_cmplt_wq:
1001	destroy_workqueue(wq: hdev->cs_cmplt_wq);
1002	free_eq_wq:
1003	destroy_workqueue(wq: hdev->eq_wq);
1004	free_cq_wq:
1005	for (i = `0` ; i < hdev->asic_prop.completion_queues_count ; i++)
1006	if (hdev->cq_wq[i])
1007	destroy_workqueue(wq: hdev->cq_wq[i]);
1008	kfree(objp: hdev->cq_wq);
1009	asid_fini:
1010	hl_asid_fini(hdev);
1011	early_fini:
1012	if (hdev->asic_funcs->early_fini)
1013	hdev->asic_funcs->early_fini(hdev);
1014
1015	return rc;
1016	}
1017
1018	/*
1019	* device_early_fini - finalize all that was done in device_early_init
1020	*
1021	* @hdev: pointer to habanalabs device structure
1022	*
1023	*/
1024	static void device_early_fini(struct hl_device *hdev)
1025	{
1026	int i;
1027
1028	mutex_destroy(lock: &hdev->debug_lock);
1029	mutex_destroy(lock: &hdev->send_cpu_message_lock);
1030
1031	mutex_destroy(lock: &hdev->fpriv_list_lock);
1032	mutex_destroy(lock: &hdev->fpriv_ctrl_list_lock);
1033
1034	mutex_destroy(lock: &hdev->clk_throttling.lock);
1035
1036	hl_mem_mgr_fini(mmg: &hdev->kernel_mem_mgr, NULL);
1037	hl_mem_mgr_idr_destroy(mmg: &hdev->kernel_mem_mgr);
1038
1039	kfree(objp: hdev->hl_chip_info);
1040
1041	destroy_workqueue(wq: hdev->prefetch_wq);
1042	destroy_workqueue(wq: hdev->ts_free_obj_wq);
1043	destroy_workqueue(wq: hdev->cs_cmplt_wq);
1044	destroy_workqueue(wq: hdev->eq_wq);
1045	destroy_workqueue(wq: hdev->reset_wq);
1046
1047	for (i = `0` ; i < hdev->asic_prop.completion_queues_count ; i++)
1048	destroy_workqueue(wq: hdev->cq_wq[i]);
1049	kfree(objp: hdev->cq_wq);
1050
1051	hl_asid_fini(hdev);
1052
1053	if (hdev->asic_funcs->early_fini)
1054	hdev->asic_funcs->early_fini(hdev);
1055	}
1056
1057	static bool is_pci_link_healthy(struct hl_device *hdev)
1058	{
1059	u16 device_id;
1060
1061	if (!hdev->pdev)
1062	return false;
1063
1064	pci_read_config_word(dev: hdev->pdev, PCI_DEVICE_ID, val: &device_id);
1065
1066	return (device_id == hdev->pdev->device);
1067	}
1068
1069	static bool hl_device_eq_heartbeat_received(struct hl_device *hdev)
1070	{
1071	struct eq_heartbeat_debug_info *heartbeat_debug_info = &hdev->heartbeat_debug_info;
1072	u32 cpu_q_id = heartbeat_debug_info->cpu_queue_id, pq_pi_mask = (HL_QUEUE_LENGTH << `1`) - `1`;
1073	struct asic_fixed_properties *prop = &hdev->asic_prop;
1074
1075	if (!prop->cpucp_info.eq_health_check_supported)
1076	return true;
1077
1078	if (!hdev->eq_heartbeat_received) {
1079	dev_err(hdev->dev, "EQ heartbeat event was not received!\n");
1080
1081	dev_err(hdev->dev,
1082	"EQ: {CI %u, HB counter %u, last HB time: %ptTs}, PQ: {PI: %u, CI: %u (%u), last HB time: %ptTs}\n",
1083	hdev->event_queue.ci,
1084	heartbeat_debug_info->heartbeat_event_counter,
1085	&hdev->heartbeat_debug_info.last_eq_heartbeat_ts,
1086	hdev->kernel_queues[cpu_q_id].pi,
1087	atomic_read(&hdev->kernel_queues[cpu_q_id].ci),
1088	atomic_read(&hdev->kernel_queues[cpu_q_id].ci) & pq_pi_mask,
1089	&hdev->heartbeat_debug_info.last_pq_heartbeat_ts);
1090
1091	hl_eq_dump(hdev, q: &hdev->event_queue);
1092
1093	return false;
1094	}
1095
1096	hdev->eq_heartbeat_received = false;
1097
1098	return true;
1099	}
1100
1101	static void hl_device_heartbeat(struct work_struct *work)
1102	{
1103	struct hl_device hdev = container_of(work, struct* hl_device,
1104	work_heartbeat.work);
1105	struct hl_info_fw_err_info info = {`0`};
1106	u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET \| HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
1107
1108	/ Start heartbeat checks only after driver has enabled events from FW /
1109	if (!hl_device_operational(hdev, NULL) \|\| !hdev->init_done)
1110	goto reschedule;
1111
1112	/*
1113	* For EQ health check need to check if driver received the heartbeat eq event
1114	* in order to validate the eq is working.
1115	* Only if both the EQ is healthy and we managed to send the next heartbeat reschedule.
1116	*/
1117	if (hl_device_eq_heartbeat_received(hdev) && (!hdev->asic_funcs->send_heartbeat(hdev)))
1118	goto reschedule;
1119
1120	if (hl_device_operational(hdev, NULL))
1121	dev_err(hdev->dev, "Device heartbeat failed! PCI link is %s\n",
1122	is_pci_link_healthy(hdev) ? "healthy" : "broken");
1123
1124	info.err_type = HL_INFO_FW_HEARTBEAT_ERR;
1125	info.event_mask = &event_mask;
1126	hl_handle_fw_err(hdev, info: &info);
1127	hl_device_cond_reset(hdev, HL_DRV_RESET_HARD \| HL_DRV_RESET_HEARTBEAT, event_mask);
1128
1129	return;
1130
1131	reschedule:
1132	/*
1133	* prev_reset_trigger tracks consecutive fatal h/w errors until first
1134	* heartbeat immediately post reset.
1135	* If control reached here, then at least one heartbeat work has been
1136	* scheduled since last reset/init cycle.
1137	* So if the device is not already in reset cycle, reset the flag
1138	* prev_reset_trigger as no reset occurred with HL_DRV_RESET_FW_FATAL_ERR
1139	* status for at least one heartbeat. From this point driver restarts
1140	* tracking future consecutive fatal errors.
1141	*/
1142	if (!hdev->reset_info.in_reset)
1143	hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
1144
1145	schedule_delayed_work(dwork: &hdev->work_heartbeat,
1146	delay: usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
1147	}
1148
1149	/*
1150	* device_late_init - do late stuff initialization for the habanalabs device
1151	*
1152	* @hdev: pointer to habanalabs device structure
1153	*
1154	* Do stuff that either needs the device H/W queues to be active or needs
1155	* to happen after all the rest of the initialization is finished
1156	*/
1157	static int device_late_init(struct hl_device *hdev)
1158	{
1159	int rc;
1160
1161	if (hdev->asic_funcs->late_init) {
1162	rc = hdev->asic_funcs->late_init(hdev);
1163	if (rc) {
1164	dev_err(hdev->dev,
1165	"failed late initialization for the H/W\n");
1166	return rc;
1167	}
1168	}
1169
1170	hdev->high_pll = hdev->asic_prop.high_pll;
1171	hdev->late_init_done = true;
1172
1173	return `0`;
1174	}
1175
1176	/*
1177	* device_late_fini - finalize all that was done in device_late_init
1178	*
1179	* @hdev: pointer to habanalabs device structure
1180	*
1181	*/
1182	static void device_late_fini(struct hl_device *hdev)
1183	{
1184	if (!hdev->late_init_done)
1185	return;
1186
1187	if (hdev->asic_funcs->late_fini)
1188	hdev->asic_funcs->late_fini(hdev);
1189
1190	hdev->late_init_done = false;
1191	}
1192
1193	int hl_device_utilization(struct hl_device hdev, u32 utilization)
1194	{
1195	u64 max_power, curr_power, dc_power, dividend, divisor;
1196	int rc;
1197
1198	max_power = hdev->max_power;
1199	dc_power = hdev->asic_prop.dc_power_default;
1200	divisor = max_power - dc_power;
1201	if (!divisor) {
1202	dev_warn(hdev->dev, "device utilization is not supported\n");
1203	return -EOPNOTSUPP;
1204	}
1205	rc = hl_fw_cpucp_power_get(hdev, power: &curr_power);
1206
1207	if (rc)
1208	return rc;
1209
1210	curr_power = clamp(curr_power, dc_power, max_power);
1211
1212	dividend = (curr_power - dc_power) * `100`;
1213	*utilization = (u32) div_u64(dividend, divisor);
1214
1215	return `0`;
1216	}
1217
1218	int hl_device_set_debug_mode(struct hl_device hdev, struct* hl_ctx *ctx, bool enable)
1219	{
1220	int rc = `0`;
1221
1222	mutex_lock(&hdev->debug_lock);
1223
1224	if (!enable) {
1225	if (!hdev->in_debug) {
1226	dev_err(hdev->dev,
1227	"Failed to disable debug mode because device was not in debug mode\n");
1228	rc = -EFAULT;
1229	goto out;
1230	}
1231
1232	if (!hdev->reset_info.hard_reset_pending)
1233	hdev->asic_funcs->halt_coresight(hdev, ctx);
1234
1235	hdev->in_debug = `0`;
1236
1237	goto out;
1238	}
1239
1240	if (hdev->in_debug) {
1241	dev_err(hdev->dev,
1242	"Failed to enable debug mode because device is already in debug mode\n");
1243	rc = -EFAULT;
1244	goto out;
1245	}
1246
1247	hdev->in_debug = `1`;
1248
1249	out:
1250	mutex_unlock(lock: &hdev->debug_lock);
1251
1252	return rc;
1253	}
1254
1255	static void take_release_locks(struct hl_device *hdev)
1256	{
1257	/ Flush anyone that is inside the critical section of enqueue*
1258	* jobs to the H/W
1259	*/
1260	hdev->asic_funcs->hw_queues_lock(hdev);
1261	hdev->asic_funcs->hw_queues_unlock(hdev);
1262
1263	/ Flush processes that are sending message to CPU /
1264	mutex_lock(&hdev->send_cpu_message_lock);
1265	mutex_unlock(lock: &hdev->send_cpu_message_lock);
1266
1267	/ Flush anyone that is inside device open /
1268	mutex_lock(&hdev->fpriv_list_lock);
1269	mutex_unlock(lock: &hdev->fpriv_list_lock);
1270	mutex_lock(&hdev->fpriv_ctrl_list_lock);
1271	mutex_unlock(lock: &hdev->fpriv_ctrl_list_lock);
1272	}
1273
/*
 * hl_abort_waiting_for_completions - abort CS waiters and pending user interrupts
 *
 * @hdev: pointer to habanalabs device structure
 */
static void hl_abort_waiting_for_completions(struct hl_device *hdev)
{
	hl_abort_waiting_for_cs_completions(hdev);

	/*
	 * Every pending user interrupt holds a reference to a user context,
	 * so all of them are released here.
	 */
	hl_release_pending_user_interrupts(hdev);
}
1283
1284	static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_reset,
1285	bool skip_wq_flush)
1286	{
1287	if (hard_reset) {
1288	if (hdev->heartbeat)
1289	cancel_delayed_work_sync(dwork: &hdev->work_heartbeat);
1290
1291	device_late_fini(hdev);
1292	}
1293
1294	/*
1295	* Halt the engines and disable interrupts so we won't get any more
1296	* completions from H/W and we won't have any accesses from the
1297	* H/W to the host machine
1298	*/
1299	hdev->asic_funcs->halt_engines(hdev, hard_reset, fw_reset);
1300
1301	/ Go over all the queues, release all CS and their jobs /
1302	hl_cs_rollback_all(hdev, skip_wq_flush);
1303
1304	/ flush the MMU prefetch workqueue /
1305	flush_workqueue(hdev->prefetch_wq);
1306
1307	hl_abort_waiting_for_completions(hdev);
1308	}
1309
1310	/*
1311	* hl_device_suspend - initiate device suspend
1312	*
1313	* @hdev: pointer to habanalabs device structure
1314	*
1315	* Puts the hw in the suspend state (all asics).
1316	* Returns 0 for success or an error on failure.
1317	* Called at driver suspend.
1318	*/
1319	int hl_device_suspend(struct hl_device *hdev)
1320	{
1321	int rc;
1322
1323	pci_save_state(dev: hdev->pdev);
1324
1325	/ Block future CS/VM/JOB completion operations /
1326	spin_lock(lock: &hdev->reset_info.lock);
1327	if (hdev->reset_info.in_reset) {
1328	spin_unlock(lock: &hdev->reset_info.lock);
1329	dev_err(hdev->dev, "Can't suspend while in reset\n");
1330	return -EIO;
1331	}
1332	hdev->reset_info.in_reset = `1`;
1333	spin_unlock(lock: &hdev->reset_info.lock);
1334
1335	/ This blocks all other stuff that is not blocked by in_reset /
1336	hdev->disabled = true;
1337
1338	take_release_locks(hdev);
1339
1340	rc = hdev->asic_funcs->suspend(hdev);
1341	if (rc)
1342	dev_err(hdev->dev,
1343	"Failed to disable PCI access of device CPU\n");
1344
1345	/ Shut down the device /
1346	pci_disable_device(dev: hdev->pdev);
1347	pci_set_power_state(dev: hdev->pdev, PCI_D3hot);
1348
1349	return `0`;
1350	}
1351
1352	/*
1353	* hl_device_resume - initiate device resume
1354	*
1355	* @hdev: pointer to habanalabs device structure
1356	*
1357	* Bring the hw back to operating state (all asics).
1358	* Returns 0 for success or an error on failure.
1359	* Called at driver resume.
1360	*/
1361	int hl_device_resume(struct hl_device *hdev)
1362	{
1363	int rc;
1364
1365	pci_set_power_state(dev: hdev->pdev, PCI_D0);
1366	pci_restore_state(dev: hdev->pdev);
1367	rc = pci_enable_device_mem(dev: hdev->pdev);
1368	if (rc) {
1369	dev_err(hdev->dev,
1370	"Failed to enable PCI device in resume\n");
1371	return rc;
1372	}
1373
1374	pci_set_master(dev: hdev->pdev);
1375
1376	rc = hdev->asic_funcs->resume(hdev);
1377	if (rc) {
1378	dev_err(hdev->dev, "Failed to resume device after suspend\n");
1379	goto disable_device;
1380	}
1381
1382
1383	/ 'in_reset' was set to true during suspend, now we must clear it in order*
1384	* for hard reset to be performed
1385	*/
1386	spin_lock(lock: &hdev->reset_info.lock);
1387	hdev->reset_info.in_reset = `0`;
1388	spin_unlock(lock: &hdev->reset_info.lock);
1389
1390	rc = hl_device_reset(hdev, HL_DRV_RESET_HARD);
1391	if (rc) {
1392	dev_err(hdev->dev, "Failed to reset device during resume\n");
1393	goto disable_device;
1394	}
1395
1396	return `0`;
1397
1398	disable_device:
1399	pci_disable_device(dev: hdev->pdev);
1400
1401	return rc;
1402	}
1403
1404	static int device_kill_open_processes(struct hl_device *hdev, u32 timeout, bool control_dev)
1405	{
1406	struct task_struct *task = NULL;
1407	struct list_head *hpriv_list;
1408	struct hl_fpriv *hpriv;
1409	struct mutex *hpriv_lock;
1410	u32 pending_cnt;
1411
1412	hpriv_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock;
1413	hpriv_list = control_dev ? &hdev->fpriv_ctrl_list : &hdev->fpriv_list;
1414
1415	/ Giving time for user to close FD, and for processes that are inside*
1416	* hl_device_open to finish
1417	*/
1418	if (!list_empty(head: hpriv_list))
1419	ssleep(seconds: `1`);
1420
1421	if (timeout) {
1422	pending_cnt = timeout;
1423	} else {
1424	if (hdev->process_kill_trial_cnt) {
1425	/ Processes have been already killed /
1426	pending_cnt = `1`;
1427	goto wait_for_processes;
1428	} else {
1429	/ Wait a small period after process kill /
1430	pending_cnt = HL_PENDING_RESET_PER_SEC;
1431	}
1432	}
1433
1434	mutex_lock(hpriv_lock);
1435
1436	/ This section must be protected because we are dereferencing*
1437	* pointers that are freed if the process exits
1438	*/
1439	list_for_each_entry(hpriv, hpriv_list, dev_node) {
1440	task = get_pid_task(pid: hpriv->taskpid, PIDTYPE_PID);
1441	if (task) {
1442	dev_info(hdev->dev, "Killing user process pid=%d\n",
1443	task_pid_nr(task));
1444	send_sig(SIGKILL, task, `1`);
1445	usleep_range(min: `1000`, max: `10000`);
1446
1447	put_task_struct(t: task);
1448	} else {
1449	dev_dbg(hdev->dev,
1450	"Can't get task struct for user process %d, process was killed from outside the driver\n",
1451	pid_nr(hpriv->taskpid));
1452	}
1453	}
1454
1455	mutex_unlock(lock: hpriv_lock);
1456
1457	/*
1458	* We killed the open users, but that doesn't mean they are closed.
1459	* It could be that they are running a long cleanup phase in the driver
1460	* e.g. MMU unmappings, or running other long teardown flow even before
1461	* our cleanup.
1462	* Therefore we need to wait again to make sure they are closed before
1463	* continuing with the reset.
1464	*/
1465
1466	wait_for_processes:
1467	while ((!list_empty(head: hpriv_list)) && (pending_cnt)) {
1468	dev_dbg(hdev->dev,
1469	"Waiting for all unmap operations to finish before hard reset\n");
1470
1471	pending_cnt--;
1472
1473	ssleep(seconds: `1`);
1474	}
1475
1476	/ All processes exited successfully /
1477	if (list_empty(head: hpriv_list))
1478	return `0`;
1479
1480	/ Give up waiting for processes to exit /
1481	if (hdev->process_kill_trial_cnt == HL_PENDING_RESET_MAX_TRIALS)
1482	return -ETIME;
1483
1484	hdev->process_kill_trial_cnt++;
1485
1486	return -EBUSY;
1487	}
1488
1489	static void device_disable_open_processes(struct hl_device *hdev, bool control_dev)
1490	{
1491	struct list_head *hpriv_list;
1492	struct hl_fpriv *hpriv;
1493	struct mutex *hpriv_lock;
1494
1495	hpriv_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock;
1496	hpriv_list = control_dev ? &hdev->fpriv_ctrl_list : &hdev->fpriv_list;
1497
1498	mutex_lock(hpriv_lock);
1499	list_for_each_entry(hpriv, hpriv_list, dev_node)
1500	hpriv->hdev = NULL;
1501	mutex_unlock(lock: hpriv_lock);
1502	}
1503
1504	static void send_disable_pci_access(struct hl_device *hdev, u32 flags)
1505	{
1506	/ If reset is due to heartbeat, device CPU is no responsive in*
1507	* which case no point sending PCI disable message to it.
1508	*/
1509	if ((flags & HL_DRV_RESET_HARD) &&
1510	!(flags & (HL_DRV_RESET_HEARTBEAT \| HL_DRV_RESET_BYPASS_REQ_TO_FW))) {
1511	/ Disable PCI access from device F/W so he won't send*
1512	* us additional interrupts. We disable MSI/MSI-X at
1513	* the halt_engines function and we can't have the F/W
1514	* sending us interrupts after that. We need to disable
1515	* the access here because if the device is marked
1516	* disable, the message won't be send. Also, in case
1517	* of heartbeat, the device CPU is marked as disable
1518	* so this message won't be sent
1519	*/
1520	if (hl_fw_send_pci_access_msg(hdev, opcode: CPUCP_PACKET_DISABLE_PCI_ACCESS, value: `0x0`))
1521	return;
1522
1523	/ disable_irq also generates sync irq, this verifies that last EQs are handled*
1524	* before disabled is set. The IRQ will be enabled again in request_irq call.
1525	*/
1526	if (hdev->cpu_queues_enable)
1527	disable_irq(irq: pci_irq_vector(dev: hdev->pdev, nr: hdev->asic_prop.eq_interrupt_id));
1528	}
1529	}
1530
1531	static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
1532	{
1533	u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
1534
1535	/ No consecutive mechanism when user context exists /
1536	if (hdev->is_compute_ctx_active)
1537	return;
1538
1539	/*
1540	* 'reset cause' is being updated here, because getting here
1541	* means that it's the 1st time and the last time we're here
1542	* ('in_reset' makes sure of it). This makes sure that
1543	* 'reset_cause' will continue holding its 1st recorded reason!
1544	*/
1545	if (flags & HL_DRV_RESET_HEARTBEAT) {
1546	hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT;
1547	cur_reset_trigger = HL_DRV_RESET_HEARTBEAT;
1548	} else if (flags & HL_DRV_RESET_TDR) {
1549	hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_TDR;
1550	cur_reset_trigger = HL_DRV_RESET_TDR;
1551	} else if (flags & HL_DRV_RESET_FW_FATAL_ERR) {
1552	hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
1553	cur_reset_trigger = HL_DRV_RESET_FW_FATAL_ERR;
1554	} else {
1555	hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
1556	}
1557
1558	/*
1559	* If reset cause is same twice, then reset_trigger_repeated
1560	* is set and if this reset is due to a fatal FW error
1561	* device is set to an unstable state.
1562	*/
1563	if (hdev->reset_info.prev_reset_trigger != cur_reset_trigger) {
1564	hdev->reset_info.prev_reset_trigger = cur_reset_trigger;
1565	hdev->reset_info.reset_trigger_repeated = `0`;
1566	} else {
1567	hdev->reset_info.reset_trigger_repeated = `1`;
1568	}
1569	}
1570
1571	static void reset_heartbeat_debug_info(struct hl_device *hdev)
1572	{
1573	hdev->heartbeat_debug_info.last_pq_heartbeat_ts = `0`;
1574	hdev->heartbeat_debug_info.last_eq_heartbeat_ts = `0`;
1575	hdev->heartbeat_debug_info.heartbeat_event_counter = `0`;
1576	}
1577
1578	static inline void device_heartbeat_schedule(struct hl_device *hdev)
1579	{
1580	if (!hdev->heartbeat)
1581	return;
1582
1583	reset_heartbeat_debug_info(hdev);
1584
1585	/*
1586	* Before scheduling the heartbeat driver will check if eq event has received.
1587	* for the first schedule we need to set the indication as true then for the next
1588	* one this indication will be true only if eq event was sent by FW.
1589	*/
1590	hdev->eq_heartbeat_received = true;
1591
1592	schedule_delayed_work(dwork: &hdev->work_heartbeat,
1593	delay: usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
1594	}
1595
1596	/*
1597	* hl_device_reset - reset the device
1598	*
1599	* @hdev: pointer to habanalabs device structure
1600	* @flags: reset flags.
1601	*
1602	* Block future CS and wait for pending CS to be enqueued
1603	* Call ASIC H/W fini
1604	* Flush all completions
1605	* Re-initialize all internal data structures
1606	* Call ASIC H/W init, late_init
1607	* Test queues
1608	* Enable device
1609	*
1610	* Returns 0 for success or an error on failure.
1611	*/
1612	int hl_device_reset(struct hl_device *hdev, u32 flags)
1613	{
1614	bool hard_reset, from_hard_reset_thread, fw_reset, reset_upon_device_release,
1615	schedule_hard_reset = false, delay_reset, from_dev_release, from_watchdog_thread;
1616	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {`0`};
1617	struct hl_ctx *ctx;
1618	int i, rc, hw_fini_rc;
1619
1620	if (!hdev->init_done) {
1621	dev_err(hdev->dev, "Can't reset before initialization is done\n");
1622	return `0`;
1623	}
1624
1625	hard_reset = !!(flags & HL_DRV_RESET_HARD);
1626	from_hard_reset_thread = !!(flags & HL_DRV_RESET_FROM_RESET_THR);
1627	fw_reset = !!(flags & HL_DRV_RESET_BYPASS_REQ_TO_FW);
1628	from_dev_release = !!(flags & HL_DRV_RESET_DEV_RELEASE);
1629	delay_reset = !!(flags & HL_DRV_RESET_DELAY);
1630	from_watchdog_thread = !!(flags & HL_DRV_RESET_FROM_WD_THR);
1631	reset_upon_device_release = hdev->reset_upon_device_release && from_dev_release;
1632
1633	if (hdev->cpld_shutdown) {
1634	dev_err(hdev->dev, "Cannot reset device, cpld is shutdown! Device is NOT usable\n");
1635	return -EIO;
1636	}
1637
1638	if (!hard_reset && (hl_device_status(hdev) == HL_DEVICE_STATUS_MALFUNCTION)) {
1639	dev_dbg(hdev->dev, "soft-reset isn't supported on a malfunctioning device\n");
1640	return `0`;
1641	}
1642
1643	if (!hard_reset && !hdev->asic_prop.supports_compute_reset) {
1644	dev_dbg(hdev->dev, "asic doesn't support compute reset - do hard-reset instead\n");
1645	hard_reset = true;
1646	}
1647
1648	if (reset_upon_device_release) {
1649	if (hard_reset) {
1650	dev_crit(hdev->dev,
1651	"Aborting reset because hard-reset is mutually exclusive with reset-on-device-release\n");
1652	return -EINVAL;
1653	}
1654
1655	goto do_reset;
1656	}
1657
1658	if (!hard_reset && !hdev->asic_prop.allow_inference_soft_reset) {
1659	dev_dbg(hdev->dev,
1660	"asic doesn't allow inference soft reset - do hard-reset instead\n");
1661	hard_reset = true;
1662	}
1663
1664	do_reset:
1665	/ Re-entry of reset thread /
1666	if (from_hard_reset_thread && hdev->process_kill_trial_cnt)
1667	goto kill_processes;
1668
1669	/*
1670	* Prevent concurrency in this function - only one reset should be
1671	* done at any given time. We need to perform this only if we didn't
1672	* get here from a dedicated hard reset thread.
1673	*/
1674	if (!from_hard_reset_thread) {
1675	/ Block future CS/VM/JOB completion operations /
1676	spin_lock(lock: &hdev->reset_info.lock);
1677	if (hdev->reset_info.in_reset) {
1678	/ We allow scheduling of a hard reset only during a compute reset /
1679	if (hard_reset && hdev->reset_info.in_compute_reset)
1680	hdev->reset_info.hard_reset_schedule_flags = flags;
1681	spin_unlock(lock: &hdev->reset_info.lock);
1682	return `0`;
1683	}
1684
1685	/ This still allows the completion of some KDMA ops*
1686	* Update this before in_reset because in_compute_reset implies we are in reset
1687	*/
1688	hdev->reset_info.in_compute_reset = !hard_reset;
1689
1690	hdev->reset_info.in_reset = `1`;
1691
1692	spin_unlock(lock: &hdev->reset_info.lock);
1693
1694	/ Cancel the device release watchdog work if required.*
1695	* In case of reset-upon-device-release while the release watchdog work is
1696	* scheduled due to a hard-reset, do hard-reset instead of compute-reset.
1697	*/
1698	if ((hard_reset \|\| from_dev_release) && hdev->reset_info.watchdog_active) {
1699	struct hl_device_reset_work *watchdog_work =
1700	&hdev->device_release_watchdog_work;
1701
1702	hdev->reset_info.watchdog_active = `0`;
1703	if (!from_watchdog_thread)
1704	cancel_delayed_work_sync(dwork: &watchdog_work->reset_work);
1705
1706	if (from_dev_release && (watchdog_work->flags & HL_DRV_RESET_HARD)) {
1707	hdev->reset_info.in_compute_reset = `0`;
1708	flags \|= HL_DRV_RESET_HARD;
1709	flags &= ~HL_DRV_RESET_DEV_RELEASE;
1710	hard_reset = true;
1711	}
1712	}
1713
1714	if (delay_reset)
1715	usleep_range(HL_RESET_DELAY_USEC, HL_RESET_DELAY_USEC << `1`);
1716
1717	escalate_reset_flow:
1718	handle_reset_trigger(hdev, flags);
1719	send_disable_pci_access(hdev, flags);
1720
1721	/ This also blocks future CS/VM/JOB completion operations /
1722	hdev->disabled = true;
1723
1724	take_release_locks(hdev);
1725
1726	if (hard_reset)
1727	dev_info(hdev->dev, "Going to reset device\n");
1728	else if (reset_upon_device_release)
1729	dev_dbg(hdev->dev, "Going to reset device after release by user\n");
1730	else
1731	dev_dbg(hdev->dev, "Going to reset engines of inference device\n");
1732	}
1733
1734	if ((hard_reset) && (!from_hard_reset_thread)) {
1735	hdev->reset_info.hard_reset_pending = true;
1736
1737	hdev->process_kill_trial_cnt = `0`;
1738
1739	hdev->device_reset_work.flags = flags;
1740
1741	/*
1742	* Because the reset function can't run from heartbeat work,
1743	* we need to call the reset function from a dedicated work.
1744	*/
1745	queue_delayed_work(wq: hdev->reset_wq, dwork: &hdev->device_reset_work.reset_work, delay: `0`);
1746
1747	return `0`;
1748	}
1749
1750	cleanup_resources(hdev, hard_reset, fw_reset, skip_wq_flush: from_dev_release);
1751
1752	kill_processes:
1753	if (hard_reset) {
1754	/ Kill processes here after CS rollback. This is because the*
1755	* process can't really exit until all its CSs are done, which
1756	* is what we do in cs rollback
1757	*/
1758	rc = device_kill_open_processes(hdev, timeout: `0`, control_dev: false);
1759
1760	if (rc == -EBUSY) {
1761	if (hdev->device_fini_pending) {
1762	dev_crit(hdev->dev,
1763	"%s Failed to kill all open processes, stopping hard reset\n",
1764	dev_name(&(hdev)->pdev->dev));
1765	goto out_err;
1766	}
1767
1768	/ signal reset thread to reschedule /
1769	return rc;
1770	}
1771
1772	if (rc) {
1773	dev_crit(hdev->dev,
1774	"%s Failed to kill all open processes, stopping hard reset\n",
1775	dev_name(&(hdev)->pdev->dev));
1776	goto out_err;
1777	}
1778
1779	/ Flush the Event queue workers to make sure no other thread is*
1780	* reading or writing to registers during the reset
1781	*/
1782	flush_workqueue(hdev->eq_wq);
1783	}
1784
1785	/ Reset the H/W. It will be in idle state after this returns /
1786	hw_fini_rc = hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);
1787
1788	if (hard_reset) {
1789	hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
1790
1791	/ Release kernel context /
1792	if (hdev->kernel_ctx && hl_ctx_put(ctx: hdev->kernel_ctx) == `1`)
1793	hdev->kernel_ctx = NULL;
1794
1795	hl_vm_fini(hdev);
1796	hl_mmu_fini(hdev);
1797	hl_eq_reset(hdev, q: &hdev->event_queue);
1798	}
1799
1800	/ Re-initialize PI,CI to 0 in all queues (hw queue, cq) /
1801	hl_hw_queue_reset(hdev, hard_reset);
1802	for (i = `0` ; i < hdev->asic_prop.completion_queues_count ; i++)
1803	hl_cq_reset(hdev, q: &hdev->completion_queue[i]);
1804
1805	/ Make sure the context switch phase will run again /
1806	ctx = hl_get_compute_ctx(hdev);
1807	if (ctx) {
1808	atomic_set(v: &ctx->thread_ctx_switch_token, i: `1`);
1809	ctx->thread_ctx_switch_wait_token = `0`;
1810	hl_ctx_put(ctx);
1811	}
1812
1813	if (hw_fini_rc) {
1814	rc = hw_fini_rc;
1815	goto out_err;
1816	}
1817	/ Finished tear-down, starting to re-initialize /
1818
1819	if (hard_reset) {
1820	hdev->device_cpu_disabled = false;
1821	hdev->reset_info.hard_reset_pending = false;
1822
1823	/*
1824	* Put the device in an unusable state if there are 2 back to back resets due to
1825	* fatal errors.
1826	*/
1827	if (hdev->reset_info.reset_trigger_repeated &&
1828	(hdev->reset_info.prev_reset_trigger == HL_DRV_RESET_FW_FATAL_ERR \|\|
1829	hdev->reset_info.prev_reset_trigger ==
1830	HL_DRV_RESET_HEARTBEAT)) {
1831	dev_crit(hdev->dev,
1832	"%s Consecutive fatal errors, stopping hard reset\n",
1833	dev_name(&(hdev)->pdev->dev));
1834	rc = -EIO;
1835	goto out_err;
1836	}
1837
1838	if (hdev->kernel_ctx) {
1839	dev_crit(hdev->dev,
1840	"%s kernel ctx was alive during hard reset, something is terribly wrong\n",
1841	dev_name(&(hdev)->pdev->dev));
1842	rc = -EBUSY;
1843	goto out_err;
1844	}
1845
1846	rc = hl_mmu_init(hdev);
1847	if (rc) {
1848	dev_err(hdev->dev,
1849	"Failed to initialize MMU S/W after hard reset\n");
1850	goto out_err;
1851	}
1852
1853	/ Allocate the kernel context /
1854	hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
1855	GFP_KERNEL);
1856	if (!hdev->kernel_ctx) {
1857	rc = -ENOMEM;
1858	hl_mmu_fini(hdev);
1859	goto out_err;
1860	}
1861
1862	hdev->is_compute_ctx_active = false;
1863
1864	rc = hl_ctx_init(hdev, ctx: hdev->kernel_ctx, is_kernel_ctx: true);
1865	if (rc) {
1866	dev_err(hdev->dev,
1867	"failed to init kernel ctx in hard reset\n");
1868	kfree(objp: hdev->kernel_ctx);
1869	hdev->kernel_ctx = NULL;
1870	hl_mmu_fini(hdev);
1871	goto out_err;
1872	}
1873	}
1874
1875	/ Device is now enabled as part of the initialization requires*
1876	* communication with the device firmware to get information that
1877	* is required for the initialization itself
1878	*/
1879	hdev->disabled = false;
1880
1881	/ F/W security enabled indication might be updated after hard-reset /
1882	if (hard_reset) {
1883	rc = hl_fw_read_preboot_status(hdev);
1884	if (rc)
1885	goto out_err;
1886	}
1887
1888	rc = hdev->asic_funcs->hw_init(hdev);
1889	if (rc) {
1890	dev_err(hdev->dev, "failed to initialize the H/W after reset\n");
1891	goto out_err;
1892	}
1893
1894	/ If device is not idle fail the reset process /
1895	if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
1896	HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
1897	print_idle_status_mask(hdev, message: "device is not idle after reset", idle_mask);
1898	rc = -EIO;
1899	goto out_err;
1900	}
1901
1902	/ Check that the communication with the device is working /
1903	rc = hdev->asic_funcs->test_queues(hdev);
1904	if (rc) {
1905	dev_err(hdev->dev, "Failed to detect if device is alive after reset\n");
1906	goto out_err;
1907	}
1908
1909	if (hard_reset) {
1910	rc = device_late_init(hdev);
1911	if (rc) {
1912	dev_err(hdev->dev, "Failed late init after hard reset\n");
1913	goto out_err;
1914	}
1915
1916	rc = hl_vm_init(hdev);
1917	if (rc) {
1918	dev_err(hdev->dev, "Failed to init memory module after hard reset\n");
1919	goto out_err;
1920	}
1921
1922	if (!hdev->asic_prop.fw_security_enabled)
1923	hl_fw_set_max_power(hdev);
1924	} else {
1925	rc = hdev->asic_funcs->compute_reset_late_init(hdev);
1926	if (rc) {
1927	if (reset_upon_device_release)
1928	dev_err(hdev->dev,
1929	"Failed late init in reset after device release\n");
1930	else
1931	dev_err(hdev->dev, "Failed late init after compute reset\n");
1932	goto out_err;
1933	}
1934	}
1935
1936	rc = hdev->asic_funcs->scrub_device_mem(hdev);
1937	if (rc) {
1938	dev_err(hdev->dev, "scrub mem failed from device reset (%d)\n", rc);
1939	goto out_err;
1940	}
1941
1942	spin_lock(lock: &hdev->reset_info.lock);
1943	hdev->reset_info.in_compute_reset = `0`;
1944
1945	/ Schedule hard reset only if requested and if not already in hard reset.*
1946	* We keep 'in_reset' enabled, so no other reset can go in during the hard
1947	* reset schedule
1948	*/
1949	if (!hard_reset && hdev->reset_info.hard_reset_schedule_flags)
1950	schedule_hard_reset = true;
1951	else
1952	hdev->reset_info.in_reset = `0`;
1953
1954	spin_unlock(lock: &hdev->reset_info.lock);
1955
1956	hdev->reset_info.needs_reset = false;
1957
1958	if (hard_reset)
1959	dev_info(hdev->dev,
1960	"Successfully finished resetting the %s device\n",
1961	dev_name(&(hdev)->pdev->dev));
1962	else
1963	dev_dbg(hdev->dev,
1964	"Successfully finished resetting the %s device\n",
1965	dev_name(&(hdev)->pdev->dev));
1966
1967	if (hard_reset) {
1968	hdev->reset_info.hard_reset_cnt++;
1969
1970	device_heartbeat_schedule(hdev);
1971
1972	/ After reset is done, we are ready to receive events from*
1973	* the F/W. We can't do it before because we will ignore events
1974	* and if those events are fatal, we won't know about it and
1975	* the device will be operational although it shouldn't be
1976	*/
1977	hdev->asic_funcs->enable_events_from_fw(hdev);
1978	} else {
1979	if (!reset_upon_device_release)
1980	hdev->reset_info.compute_reset_cnt++;
1981
1982	if (schedule_hard_reset) {
1983	dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n");
1984	flags = hdev->reset_info.hard_reset_schedule_flags;
1985	hdev->reset_info.hard_reset_schedule_flags = `0`;
1986	hard_reset = true;
1987	goto escalate_reset_flow;
1988	}
1989	}
1990
1991	return `0`;
1992
1993	out_err:
1994	hdev->disabled = true;
1995
1996	spin_lock(lock: &hdev->reset_info.lock);
1997	hdev->reset_info.in_compute_reset = `0`;
1998
1999	if (hard_reset) {
2000	dev_err(hdev->dev,
2001	"%s Failed to reset! Device is NOT usable\n",
2002	dev_name(&(hdev)->pdev->dev));
2003	hdev->reset_info.hard_reset_cnt++;
2004	} else {
2005	if (reset_upon_device_release) {
2006	dev_err(hdev->dev, "Failed to reset device after user release\n");
2007	flags &= ~HL_DRV_RESET_DEV_RELEASE;
2008	} else {
2009	dev_err(hdev->dev, "Failed to do compute reset\n");
2010	hdev->reset_info.compute_reset_cnt++;
2011	}
2012
2013	spin_unlock(lock: &hdev->reset_info.lock);
2014	flags \|= HL_DRV_RESET_HARD;
2015	hard_reset = true;
2016	goto escalate_reset_flow;
2017	}
2018
2019	hdev->reset_info.in_reset = `0`;
2020
2021	spin_unlock(lock: &hdev->reset_info.lock);
2022
2023	return rc;
2024	}
2025
2026	/*
2027	* hl_device_cond_reset() - conditionally reset the device.
2028	* @hdev: pointer to habanalabs device structure.
2029	* @reset_flags: reset flags.
2030	* @event_mask: events to notify user about.
2031	*
2032	* Conditionally reset the device, or alternatively schedule a watchdog work to reset the device
2033	* unless another reset precedes it.
2034	*/
2035	int hl_device_cond_reset(struct hl_device *hdev, u32 flags, u64 event_mask)
2036	{
2037	struct hl_ctx *ctx = NULL;
2038
2039	/ F/W reset cannot be postponed /
2040	if (flags & HL_DRV_RESET_BYPASS_REQ_TO_FW)
2041	goto device_reset;
2042
2043	/ Device release watchdog is relevant only if user exists and gets a reset notification /
2044	if (!(event_mask & HL_NOTIFIER_EVENT_DEVICE_RESET)) {
2045	dev_err(hdev->dev, "Resetting device without a reset indication to user\n");
2046	goto device_reset;
2047	}
2048
2049	ctx = hl_get_compute_ctx(hdev);
2050	if (!ctx)
2051	goto device_reset;
2052
2053	/*
2054	* There is no point in postponing the reset if user is not registered for events.
2055	* However if no eventfd_ctx exists but the device release watchdog is already scheduled, it
2056	* just implies that user has unregistered as part of handling a previous event. In this
2057	* case an immediate reset is not required.
2058	*/
2059	if (!ctx->hpriv->notifier_event.eventfd && !hdev->reset_info.watchdog_active)
2060	goto device_reset;
2061
2062	/ Schedule the device release watchdog work unless reset is already in progress or if the*
2063	* work is already scheduled.
2064	*/
2065	spin_lock(lock: &hdev->reset_info.lock);
2066	if (hdev->reset_info.in_reset) {
2067	spin_unlock(lock: &hdev->reset_info.lock);
2068	goto device_reset;
2069	}
2070
2071	if (hdev->reset_info.watchdog_active) {
2072	hdev->device_release_watchdog_work.flags \|= flags;
2073	goto out;
2074	}
2075
2076	hdev->device_release_watchdog_work.flags = flags;
2077	dev_dbg(hdev->dev, "Device is going to be hard-reset in %u sec unless being released\n",
2078	hdev->device_release_watchdog_timeout_sec);
2079	schedule_delayed_work(dwork: &hdev->device_release_watchdog_work.reset_work,
2080	secs_to_jiffies(hdev->device_release_watchdog_timeout_sec));
2081	hdev->reset_info.watchdog_active = `1`;
2082	out:
2083	spin_unlock(lock: &hdev->reset_info.lock);
2084
2085	hl_notifier_event_send_all(hdev, event_mask);
2086
2087	hl_ctx_put(ctx);
2088
2089	hl_abort_waiting_for_completions(hdev);
2090
2091	return `0`;
2092
2093	device_reset:
2094	if (event_mask)
2095	hl_notifier_event_send_all(hdev, event_mask);
2096	if (ctx)
2097	hl_ctx_put(ctx);
2098
2099	return hl_device_reset(hdev, flags: flags \| HL_DRV_RESET_HARD);
2100	}
2101
2102	static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event_mask)
2103	{
2104	mutex_lock(&notifier_event->lock);
2105	notifier_event->events_mask \|= event_mask;
2106
2107	if (notifier_event->eventfd)
2108	eventfd_signal(ctx: notifier_event->eventfd);
2109
2110	mutex_unlock(lock: &notifier_event->lock);
2111	}
2112
2113	/*
2114	* hl_notifier_event_send_all - notify all user processes via eventfd
2115	*
2116	* @hdev: pointer to habanalabs device structure
2117	* @event_mask: the occurred event/s
2118	* Returns 0 for success or an error on failure.
2119	*/
2120	void hl_notifier_event_send_all(struct hl_device *hdev, u64 event_mask)
2121	{
2122	struct hl_fpriv *hpriv;
2123
2124	if (!event_mask) {
2125	dev_warn(hdev->dev, "Skip sending zero event");
2126	return;
2127	}
2128
2129	mutex_lock(&hdev->fpriv_list_lock);
2130
2131	list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node)
2132	hl_notifier_event_send(notifier_event: &hpriv->notifier_event, event_mask);
2133
2134	mutex_unlock(lock: &hdev->fpriv_list_lock);
2135	}
2136
2137	/*
2138	* hl_device_init - main initialization function for habanalabs device
2139	*
2140	* @hdev: pointer to habanalabs device structure
2141	*
2142	* Allocate an id for the device, do early initialization and then call the
2143	* ASIC specific initialization functions. Finally, create the cdev and the
2144	* Linux device to expose it to the user
2145	*/
2146	int hl_device_init(struct hl_device *hdev)
2147	{
2148	int i, rc, cq_cnt, user_interrupt_cnt, cq_ready_cnt;
2149	struct hl_ts_free_jobs *free_jobs_data;
2150	bool expose_interfaces_on_err = false;
2151	void *p;
2152
2153	/ Initialize ASIC function pointers and perform early init /
2154	rc = device_early_init(hdev);
2155	if (rc)
2156	goto out_disabled;
2157
2158	user_interrupt_cnt = hdev->asic_prop.user_dec_intr_count +
2159	hdev->asic_prop.user_interrupt_count;
2160
2161	if (user_interrupt_cnt) {
2162	hdev->user_interrupt = kcalloc(user_interrupt_cnt, sizeof(*hdev->user_interrupt),
2163	GFP_KERNEL);
2164	if (!hdev->user_interrupt) {
2165	rc = -ENOMEM;
2166	goto early_fini;
2167	}
2168
2169	/ Timestamp records supported only if CQ supported in device /
2170	if (hdev->asic_prop.first_available_cq[`0`] != USHRT_MAX) {
2171	for (i = `0` ; i < user_interrupt_cnt ; i++) {
2172	p = vzalloc(TIMESTAMP_FREE_NODES_NUM *
2173	sizeof(struct timestamp_reg_free_node));
2174	if (!p) {
2175	rc = -ENOMEM;
2176	goto free_usr_intr_mem;
2177	}
2178	free_jobs_data = &hdev->user_interrupt[i].ts_free_jobs_data;
2179	free_jobs_data->free_nodes_pool = p;
2180	free_jobs_data->free_nodes_length = TIMESTAMP_FREE_NODES_NUM;
2181	free_jobs_data->next_avail_free_node_idx = `0`;
2182	}
2183	}
2184	}
2185
2186	free_jobs_data = &hdev->common_user_cq_interrupt.ts_free_jobs_data;
2187	p = vzalloc(TIMESTAMP_FREE_NODES_NUM *
2188	sizeof(struct timestamp_reg_free_node));
2189	if (!p) {
2190	rc = -ENOMEM;
2191	goto free_usr_intr_mem;
2192	}
2193
2194	free_jobs_data->free_nodes_pool = p;
2195	free_jobs_data->free_nodes_length = TIMESTAMP_FREE_NODES_NUM;
2196	free_jobs_data->next_avail_free_node_idx = `0`;
2197
2198	/*
2199	* Start calling ASIC initialization. First S/W then H/W and finally
2200	* late init
2201	*/
2202	rc = hdev->asic_funcs->sw_init(hdev);
2203	if (rc)
2204	goto free_common_usr_intr_mem;
2205
2206
2207	/ initialize completion structure for multi CS wait /
2208	hl_multi_cs_completion_init(hdev);
2209
2210	/*
2211	* Initialize the H/W queues. Must be done before hw_init, because
2212	* there the addresses of the kernel queue are being written to the
2213	* registers of the device
2214	*/
2215	rc = hl_hw_queues_create(hdev);
2216	if (rc) {
2217	dev_err(hdev->dev, "failed to initialize kernel queues\n");
2218	goto sw_fini;
2219	}
2220
2221	cq_cnt = hdev->asic_prop.completion_queues_count;
2222
2223	/*
2224	* Initialize the completion queues. Must be done before hw_init,
2225	* because there the addresses of the completion queues are being
2226	* passed as arguments to request_irq
2227	*/
2228	if (cq_cnt) {
2229	hdev->completion_queue = kcalloc(cq_cnt,
2230	sizeof(*hdev->completion_queue),
2231	GFP_KERNEL);
2232
2233	if (!hdev->completion_queue) {
2234	dev_err(hdev->dev,
2235	"failed to allocate completion queues\n");
2236	rc = -ENOMEM;
2237	goto hw_queues_destroy;
2238	}
2239	}
2240
2241	for (i = `0`, cq_ready_cnt = `0` ; i < cq_cnt ; i++, cq_ready_cnt++) {
2242	rc = hl_cq_init(hdev, q: &hdev->completion_queue[i],
2243	hw_queue_id: hdev->asic_funcs->get_queue_id_for_cq(hdev, i));
2244	if (rc) {
2245	dev_err(hdev->dev,
2246	"failed to initialize completion queue\n");
2247	goto cq_fini;
2248	}
2249	hdev->completion_queue[i].cq_idx = i;
2250	}
2251
2252	hdev->shadow_cs_queue = kcalloc(hdev->asic_prop.max_pending_cs,
2253	sizeof(struct hl_cs *), GFP_KERNEL);
2254	if (!hdev->shadow_cs_queue) {
2255	rc = -ENOMEM;
2256	goto cq_fini;
2257	}
2258
2259	/*
2260	* Initialize the event queue. Must be done before hw_init,
2261	* because there the address of the event queue is being
2262	* passed as argument to request_irq
2263	*/
2264	rc = hl_eq_init(hdev, q: &hdev->event_queue);
2265	if (rc) {
2266	dev_err(hdev->dev, "failed to initialize event queue\n");
2267	goto free_shadow_cs_queue;
2268	}
2269
2270	/ MMU S/W must be initialized before kernel context is created /
2271	rc = hl_mmu_init(hdev);
2272	if (rc) {
2273	dev_err(hdev->dev, "Failed to initialize MMU S/W structures\n");
2274	goto eq_fini;
2275	}
2276
2277	/ Allocate the kernel context /
2278	hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL);
2279	if (!hdev->kernel_ctx) {
2280	rc = -ENOMEM;
2281	goto mmu_fini;
2282	}
2283
2284	hdev->is_compute_ctx_active = false;
2285
2286	hdev->asic_funcs->state_dump_init(hdev);
2287
2288	hdev->device_release_watchdog_timeout_sec = HL_DEVICE_RELEASE_WATCHDOG_TIMEOUT_SEC;
2289
2290	hdev->memory_scrub_val = MEM_SCRUB_DEFAULT_VAL;
2291
2292	rc = hl_debugfs_device_init(hdev);
2293	if (rc) {
2294	dev_err(hdev->dev, "failed to initialize debugfs entry structure\n");
2295	kfree(objp: hdev->kernel_ctx);
2296	goto mmu_fini;
2297	}
2298
2299	/ The debugfs entry structure is accessed in hl_ctx_init(), so it must be called after*
2300	* hl_debugfs_device_init().
2301	*/
2302	rc = hl_ctx_init(hdev, ctx: hdev->kernel_ctx, is_kernel_ctx: true);
2303	if (rc) {
2304	dev_err(hdev->dev, "failed to initialize kernel context\n");
2305	kfree(objp: hdev->kernel_ctx);
2306	goto debugfs_device_fini;
2307	}
2308
2309	rc = hl_cb_pool_init(hdev);
2310	if (rc) {
2311	dev_err(hdev->dev, "failed to initialize CB pool\n");
2312	goto release_ctx;
2313	}
2314
2315	rc = hl_dec_init(hdev);
2316	if (rc) {
2317	dev_err(hdev->dev, "Failed to initialize the decoder module\n");
2318	goto cb_pool_fini;
2319	}
2320
2321	/*
2322	* From this point, override rc (=0) in case of an error to allow debugging
2323	* (by adding char devices and creating sysfs/debugfs files as part of the error flow).
2324	*/
2325	expose_interfaces_on_err = true;
2326
2327	/ Device is now enabled as part of the initialization requires*
2328	* communication with the device firmware to get information that
2329	* is required for the initialization itself
2330	*/
2331	hdev->disabled = false;
2332
2333	rc = hdev->asic_funcs->hw_init(hdev);
2334	if (rc) {
2335	dev_err(hdev->dev, "failed to initialize the H/W\n");
2336	rc = `0`;
2337	goto out_disabled;
2338	}
2339
2340	/ Check that the communication with the device is working /
2341	rc = hdev->asic_funcs->test_queues(hdev);
2342	if (rc) {
2343	dev_err(hdev->dev, "Failed to detect if device is alive\n");
2344	rc = `0`;
2345	goto out_disabled;
2346	}
2347
2348	rc = device_late_init(hdev);
2349	if (rc) {
2350	dev_err(hdev->dev, "Failed late initialization\n");
2351	rc = `0`;
2352	goto out_disabled;
2353	}
2354
2355	dev_info(hdev->dev, "Found %s device with %lluGB DRAM\n",
2356	hdev->asic_name,
2357	hdev->asic_prop.dram_size / SZ_1G);
2358
2359	rc = hl_vm_init(hdev);
2360	if (rc) {
2361	dev_err(hdev->dev, "Failed to initialize memory module\n");
2362	rc = `0`;
2363	goto out_disabled;
2364	}
2365
2366	/*
2367	* Expose devices and sysfs/debugfs files to user.
2368	* From here there is no need to expose them in case of an error.
2369	*/
2370	expose_interfaces_on_err = false;
2371
2372	rc = drm_dev_register(dev: &hdev->drm, flags: `0`);
2373	if (rc) {
2374	dev_err(hdev->dev, "Failed to register DRM device, rc %d\n", rc);
2375	rc = `0`;
2376	goto out_disabled;
2377	}
2378
2379	rc = cdev_sysfs_debugfs_add(hdev);
2380	if (rc) {
2381	dev_err(hdev->dev, "Failed to add char devices and sysfs/debugfs files\n");
2382	rc = `0`;
2383	goto out_disabled;
2384	}
2385
2386	/ Need to call this again because the max power might change,*
2387	* depending on card type for certain ASICs
2388	*/
2389	if (hdev->asic_prop.set_max_power_on_device_init &&
2390	!hdev->asic_prop.fw_security_enabled)
2391	hl_fw_set_max_power(hdev);
2392
2393	/*
2394	* hl_hwmon_init() must be called after device_late_init(), because only
2395	* there we get the information from the device about which
2396	* hwmon-related sensors the device supports.
2397	* Furthermore, it must be done after adding the device to the system.
2398	*/
2399	rc = hl_hwmon_init(hdev);
2400	if (rc) {
2401	dev_err(hdev->dev, "Failed to initialize hwmon\n");
2402	rc = `0`;
2403	goto out_disabled;
2404	}
2405
2406	/ Scheduling the EQ heartbeat thread must come after driver is done with all*
2407	* initializations, as we want to make sure the FW gets enough time to be prepared
2408	* to respond to heartbeat packets.
2409	*/
2410	device_heartbeat_schedule(hdev);
2411
2412	dev_notice(hdev->dev,
2413	"Successfully added device %s to habanalabs driver\n",
2414	dev_name(&(hdev)->pdev->dev));
2415
2416	/ After initialization is done, we are ready to receive events from*
2417	* the F/W. We can't do it before because we will ignore events and if
2418	* those events are fatal, we won't know about it and the device will
2419	* be operational although it shouldn't be
2420	*/
2421	hdev->asic_funcs->enable_events_from_fw(hdev);
2422
2423	hdev->init_done = true;
2424
2425	return `0`;
2426
2427	cb_pool_fini:
2428	hl_cb_pool_fini(hdev);
2429	release_ctx:
2430	if (hl_ctx_put(ctx: hdev->kernel_ctx) != `1`)
2431	dev_err(hdev->dev,
2432	"kernel ctx is still alive on initialization failure\n");
2433	debugfs_device_fini:
2434	hl_debugfs_device_fini(hdev);
2435	mmu_fini:
2436	hl_mmu_fini(hdev);
2437	eq_fini:
2438	hl_eq_fini(hdev, q: &hdev->event_queue);
2439	free_shadow_cs_queue:
2440	kfree(objp: hdev->shadow_cs_queue);
2441	cq_fini:
2442	for (i = `0` ; i < cq_ready_cnt ; i++)
2443	hl_cq_fini(hdev, q: &hdev->completion_queue[i]);
2444	kfree(objp: hdev->completion_queue);
2445	hw_queues_destroy:
2446	hl_hw_queues_destroy(hdev);
2447	sw_fini:
2448	hdev->asic_funcs->sw_fini(hdev);
2449	free_common_usr_intr_mem:
2450	vfree(addr: hdev->common_user_cq_interrupt.ts_free_jobs_data.free_nodes_pool);
2451	free_usr_intr_mem:
2452	if (user_interrupt_cnt) {
2453	for (i = `0` ; i < user_interrupt_cnt ; i++) {
2454	if (!hdev->user_interrupt[i].ts_free_jobs_data.free_nodes_pool)
2455	break;
2456	vfree(addr: hdev->user_interrupt[i].ts_free_jobs_data.free_nodes_pool);
2457	}
2458	kfree(objp: hdev->user_interrupt);
2459	}
2460	early_fini:
2461	device_early_fini(hdev);
2462	out_disabled:
2463	hdev->disabled = true;
2464	if (expose_interfaces_on_err) {
2465	drm_dev_register(dev: &hdev->drm, flags: `0`);
2466	cdev_sysfs_debugfs_add(hdev);
2467	}
2468
2469	pr_err("Failed to initialize accel%d. Device %s is NOT usable!\n",
2470	hdev->cdev_idx, dev_name(&hdev->pdev->dev));
2471
2472	return rc;
2473	}
2474
2475	/*
2476	* hl_device_fini - main tear-down function for habanalabs device
2477	*
2478	* @hdev: pointer to habanalabs device structure
2479	*
2480	* Destroy the device, call ASIC fini functions and release the id
2481	*/
2482	void hl_device_fini(struct hl_device *hdev)
2483	{
2484	u32 user_interrupt_cnt;
2485	bool device_in_reset;
2486	ktime_t timeout;
2487	u64 reset_sec;
2488	int i, rc;
2489
2490	dev_info(hdev->dev, "Removing device %s\n", dev_name(&(hdev)->pdev->dev));
2491
2492	hdev->device_fini_pending = `1`;
2493	flush_delayed_work(dwork: &hdev->device_reset_work.reset_work);
2494
2495	if (hdev->pldm)
2496	reset_sec = HL_PLDM_HARD_RESET_MAX_TIMEOUT;
2497	else
2498	reset_sec = HL_HARD_RESET_MAX_TIMEOUT;
2499
2500	/*
2501	* This function is competing with the reset function, so try to
2502	* take the reset atomic and if we are already in middle of reset,
2503	* wait until reset function is finished. Reset function is designed
2504	* to always finish. However, in Gaudi, because of all the network
2505	* ports, the hard reset could take between 10-30 seconds
2506	*/
2507
2508	timeout = ktime_add_us(kt: ktime_get(), usec: reset_sec * `1000` * `1000`);
2509
2510	spin_lock(lock: &hdev->reset_info.lock);
2511	device_in_reset = !!hdev->reset_info.in_reset;
2512	if (!device_in_reset)
2513	hdev->reset_info.in_reset = `1`;
2514	spin_unlock(lock: &hdev->reset_info.lock);
2515
2516	while (device_in_reset) {
2517	usleep_range(min: `50`, max: `200`);
2518
2519	spin_lock(lock: &hdev->reset_info.lock);
2520	device_in_reset = !!hdev->reset_info.in_reset;
2521	if (!device_in_reset)
2522	hdev->reset_info.in_reset = `1`;
2523	spin_unlock(lock: &hdev->reset_info.lock);
2524
2525	if (ktime_compare(cmp1: ktime_get(), cmp2: timeout) > `0`) {
2526	dev_crit(hdev->dev,
2527	"%s Failed to remove device because reset function did not finish\n",
2528	dev_name(&(hdev)->pdev->dev));
2529	return;
2530	}
2531	}
2532
2533	cancel_delayed_work_sync(dwork: &hdev->device_release_watchdog_work.reset_work);
2534
2535	/ Disable PCI access from device F/W so it won't send us additional*
2536	* interrupts. We disable MSI/MSI-X at the halt_engines function and we
2537	* can't have the F/W sending us interrupts after that. We need to
2538	* disable the access here because if the device is marked disable, the
2539	* message won't be send. Also, in case of heartbeat, the device CPU is
2540	* marked as disable so this message won't be sent
2541	*/
2542	hl_fw_send_pci_access_msg(hdev, opcode: CPUCP_PACKET_DISABLE_PCI_ACCESS, value: `0x0`);
2543
2544	/ Mark device as disabled /
2545	hdev->disabled = true;
2546
2547	take_release_locks(hdev);
2548
2549	hdev->reset_info.hard_reset_pending = true;
2550
2551	hl_hwmon_fini(hdev);
2552
2553	cleanup_resources(hdev, hard_reset: true, fw_reset: false, skip_wq_flush: false);
2554
2555	/ Kill processes here after CS rollback. This is because the process*
2556	* can't really exit until all its CSs are done, which is what we
2557	* do in cs rollback
2558	*/
2559	dev_info(hdev->dev,
2560	"Waiting for all processes to exit (timeout of %u seconds)",
2561	HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI);
2562
2563	hdev->process_kill_trial_cnt = `0`;
2564	rc = device_kill_open_processes(hdev, HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI, control_dev: false);
2565	if (rc) {
2566	dev_crit(hdev->dev, "Failed to kill all open processes (%d)\n", rc);
2567	device_disable_open_processes(hdev, control_dev: false);
2568	}
2569
2570	hdev->process_kill_trial_cnt = `0`;
2571	rc = device_kill_open_processes(hdev, timeout: `0`, control_dev: true);
2572	if (rc) {
2573	dev_crit(hdev->dev, "Failed to kill all control device open processes (%d)\n", rc);
2574	device_disable_open_processes(hdev, control_dev: true);
2575	}
2576
2577	hl_cb_pool_fini(hdev);
2578
2579	/ Reset the H/W. It will be in idle state after this returns /
2580	rc = hdev->asic_funcs->hw_fini(hdev, true, false);
2581	if (rc)
2582	dev_err(hdev->dev, "hw_fini failed in device fini while removing device %d\n", rc);
2583
2584	/ Reset the H/W (if it accessible). It will be in idle state after this returns /
2585	if (!hdev->cpld_shutdown) {
2586	rc = hdev->asic_funcs->hw_fini(hdev, true, false);
2587	if (rc)
2588	dev_err(hdev->dev,
2589	"hw_fini failed in device fini while removing device %d\n", rc);
2590	}
2591
2592	hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
2593
2594	/ Release kernel context /
2595	if ((hdev->kernel_ctx) && (hl_ctx_put(ctx: hdev->kernel_ctx) != `1`))
2596	dev_err(hdev->dev, "kernel ctx is still alive\n");
2597
2598	hl_dec_fini(hdev);
2599
2600	hl_vm_fini(hdev);
2601
2602	hl_mmu_fini(hdev);
2603
2604	vfree(addr: hdev->captured_err_info.page_fault_info.user_mappings);
2605
2606	hl_eq_fini(hdev, q: &hdev->event_queue);
2607
2608	kfree(objp: hdev->shadow_cs_queue);
2609
2610	for (i = `0` ; i < hdev->asic_prop.completion_queues_count ; i++)
2611	hl_cq_fini(hdev, q: &hdev->completion_queue[i]);
2612	kfree(objp: hdev->completion_queue);
2613
2614	user_interrupt_cnt = hdev->asic_prop.user_dec_intr_count +
2615	hdev->asic_prop.user_interrupt_count;
2616
2617	if (user_interrupt_cnt) {
2618	if (hdev->asic_prop.first_available_cq[`0`] != USHRT_MAX) {
2619	for (i = `0` ; i < user_interrupt_cnt ; i++)
2620	vfree(addr: hdev->user_interrupt[i].ts_free_jobs_data.free_nodes_pool);
2621	}
2622
2623	kfree(objp: hdev->user_interrupt);
2624	}
2625
2626	vfree(addr: hdev->common_user_cq_interrupt.ts_free_jobs_data.free_nodes_pool);
2627
2628	hl_hw_queues_destroy(hdev);
2629
2630	/ Call ASIC S/W finalize function /
2631	hdev->asic_funcs->sw_fini(hdev);
2632
2633	device_early_fini(hdev);
2634
2635	/ Hide devices and sysfs/debugfs files from user /
2636	cdev_sysfs_debugfs_remove(hdev);
2637	drm_dev_unregister(dev: &hdev->drm);
2638
2639	hl_debugfs_device_fini(hdev);
2640
2641	pr_info("removed device successfully\n");
2642	}
2643
2644	/*
2645	* MMIO register access helper functions.
2646	*/
2647
2648	/*
2649	* hl_rreg - Read an MMIO register
2650	*
2651	* @hdev: pointer to habanalabs device structure
2652	* @reg: MMIO register offset (in bytes)
2653	*
2654	* Returns the value of the MMIO register we are asked to read
2655	*
2656	*/
2657	inline u32 hl_rreg(struct hl_device *hdev, u32 reg)
2658	{
2659	u32 val = readl(addr: hdev->rmmio + reg);
2660
2661	if (unlikely(trace_habanalabs_rreg32_enabled()))
2662	trace_habanalabs_rreg32(dev: &(hdev)->pdev->dev, addr: reg, val);
2663
2664	return val;
2665	}
2666
2667	/*
2668	* hl_wreg - Write to an MMIO register
2669	*
2670	* @hdev: pointer to habanalabs device structure
2671	* @reg: MMIO register offset (in bytes)
2672	* @val: 32-bit value
2673	*
2674	* Writes the 32-bit value into the MMIO register
2675	*
2676	*/
2677	inline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val)
2678	{
2679	if (unlikely(trace_habanalabs_wreg32_enabled()))
2680	trace_habanalabs_wreg32(dev: &(hdev)->pdev->dev, addr: reg, val);
2681
2682	writel(val, addr: hdev->rmmio + reg);
2683	}
2684
2685	void hl_capture_razwi(struct hl_device hdev, u64 addr, u16 engine_id, u16 num_of_engines,
2686	u8 flags)
2687	{
2688	struct razwi_info *razwi_info = &hdev->captured_err_info.razwi_info;
2689
2690	if (num_of_engines > HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR) {
2691	dev_err(hdev->dev,
2692	"Number of possible razwi initiators (%u) exceeded limit (%u)\n",
2693	num_of_engines, HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR);
2694	return;
2695	}
2696
2697	/ In case it's the first razwi since the device was opened, capture its parameters /
2698	if (atomic_cmpxchg(v: &hdev->captured_err_info.razwi_info.razwi_detected, old: `0`, new: `1`))
2699	return;
2700
2701	razwi_info->razwi.timestamp = ktime_to_ns(kt: ktime_get());
2702	razwi_info->razwi.addr = addr;
2703	razwi_info->razwi.num_of_possible_engines = num_of_engines;
2704	memcpy(&razwi_info->razwi.engine_id[`0`], &engine_id[`0`],
2705	num_of_engines * sizeof(u16));
2706	razwi_info->razwi.flags = flags;
2707
2708	razwi_info->razwi_info_available = true;
2709	}
2710
2711	void hl_handle_razwi(struct hl_device hdev, u64 addr, u16 engine_id, u16 num_of_engines,
2712	u8 flags, u64 *event_mask)
2713	{
2714	hl_capture_razwi(hdev, addr, engine_id, num_of_engines, flags);
2715
2716	if (event_mask)
2717	*event_mask \|= HL_NOTIFIER_EVENT_RAZWI;
2718	}
2719
2720	static void hl_capture_user_mappings(struct hl_device *hdev, bool is_pmmu)
2721	{
2722	struct page_fault_info *pgf_info = &hdev->captured_err_info.page_fault_info;
2723	struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
2724	struct hl_vm_hash_node *hnode;
2725	struct hl_userptr *userptr;
2726	enum vm_type *vm_type;
2727	struct hl_ctx *ctx;
2728	u32 map_idx = `0`;
2729	int i;
2730
2731	/ Reset previous session count/
2732	pgf_info->num_of_user_mappings = `0`;
2733
2734	ctx = hl_get_compute_ctx(hdev);
2735	if (!ctx) {
2736	dev_err(hdev->dev, "Can't get user context for user mappings\n");
2737	return;
2738	}
2739
2740	mutex_lock(&ctx->mem_hash_lock);
2741	hash_for_each(ctx->mem_hash, i, hnode, node) {
2742	vm_type = hnode->ptr;
2743	if (((*vm_type == VM_TYPE_USERPTR) && is_pmmu) \|\|
2744	((*vm_type == VM_TYPE_PHYS_PACK) && !is_pmmu))
2745	pgf_info->num_of_user_mappings++;
2746
2747	}
2748
2749	if (!pgf_info->num_of_user_mappings)
2750	goto finish;
2751
2752	/ In case we already allocated in previous session, need to release it before*
2753	* allocating new buffer.
2754	*/
2755	vfree(addr: pgf_info->user_mappings);
2756	pgf_info->user_mappings =
2757	vzalloc(pgf_info->num_of_user_mappings * sizeof(struct hl_user_mapping));
2758	if (!pgf_info->user_mappings) {
2759	pgf_info->num_of_user_mappings = `0`;
2760	goto finish;
2761	}
2762
2763	hash_for_each(ctx->mem_hash, i, hnode, node) {
2764	vm_type = hnode->ptr;
2765	if ((*vm_type == VM_TYPE_USERPTR) && (is_pmmu)) {
2766	userptr = hnode->ptr;
2767	pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr;
2768	pgf_info->user_mappings[map_idx].size = userptr->size;
2769	map_idx++;
2770	} else if ((*vm_type == VM_TYPE_PHYS_PACK) && (!is_pmmu)) {
2771	phys_pg_pack = hnode->ptr;
2772	pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr;
2773	pgf_info->user_mappings[map_idx].size = phys_pg_pack->total_size;
2774	map_idx++;
2775	}
2776	}
2777	finish:
2778	mutex_unlock(lock: &ctx->mem_hash_lock);
2779	hl_ctx_put(ctx);
2780	}
2781
2782	void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu)
2783	{
2784	struct page_fault_info *pgf_info = &hdev->captured_err_info.page_fault_info;
2785
2786	/ Capture only the first page fault /
2787	if (atomic_cmpxchg(v: &pgf_info->page_fault_detected, old: `0`, new: `1`))
2788	return;
2789
2790	pgf_info->page_fault.timestamp = ktime_to_ns(kt: ktime_get());
2791	pgf_info->page_fault.addr = addr;
2792	pgf_info->page_fault.engine_id = eng_id;
2793	hl_capture_user_mappings(hdev, is_pmmu);
2794
2795	pgf_info->page_fault_info_available = true;
2796	}
2797
2798	void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu,
2799	u64 *event_mask)
2800	{
2801	hl_capture_page_fault(hdev, addr, eng_id, is_pmmu);
2802
2803	if (event_mask)
2804	*event_mask \|= HL_NOTIFIER_EVENT_PAGE_FAULT;
2805	}
2806
2807	static void hl_capture_hw_err(struct hl_device *hdev, u16 event_id)
2808	{
2809	struct hw_err_info *info = &hdev->captured_err_info.hw_err;
2810
2811	/ Capture only the first HW err /
2812	if (atomic_cmpxchg(v: &info->event_detected, old: `0`, new: `1`))
2813	return;
2814
2815	info->event.timestamp = ktime_to_ns(kt: ktime_get());
2816	info->event.event_id = event_id;
2817
2818	info->event_info_available = true;
2819	}
2820
2821	void hl_handle_critical_hw_err(struct hl_device hdev, u16 event_id, u64 event_mask)
2822	{
2823	hl_capture_hw_err(hdev, event_id);
2824
2825	if (event_mask)
2826	*event_mask \|= HL_NOTIFIER_EVENT_CRITICL_HW_ERR;
2827	}
2828
2829	static void hl_capture_fw_err(struct hl_device hdev, struct* hl_info_fw_err_info *fw_info)
2830	{
2831	struct fw_err_info *info = &hdev->captured_err_info.fw_err;
2832
2833	/ Capture only the first FW error /
2834	if (atomic_cmpxchg(v: &info->event_detected, old: `0`, new: `1`))
2835	return;
2836
2837	info->event.timestamp = ktime_to_ns(kt: ktime_get());
2838	info->event.err_type = fw_info->err_type;
2839	if (fw_info->err_type == HL_INFO_FW_REPORTED_ERR)
2840	info->event.event_id = fw_info->event_id;
2841
2842	info->event_info_available = true;
2843	}
2844
2845	void hl_handle_fw_err(struct hl_device hdev, struct* hl_info_fw_err_info *info)
2846	{
2847	hl_capture_fw_err(hdev, fw_info: info);
2848
2849	if (info->event_mask)
2850	*info->event_mask \|= HL_NOTIFIER_EVENT_CRITICL_FW_ERR;
2851	}
2852
2853	void hl_capture_engine_err(struct hl_device *hdev, u16 engine_id, u16 error_count)
2854	{
2855	struct engine_err_info *info = &hdev->captured_err_info.engine_err;
2856
2857	/ Capture only the first engine error /
2858	if (atomic_cmpxchg(v: &info->event_detected, old: `0`, new: `1`))
2859	return;
2860
2861	info->event.timestamp = ktime_to_ns(kt: ktime_get());
2862	info->event.engine_id = engine_id;
2863	info->event.error_count = error_count;
2864	info->event_info_available = true;
2865	}
2866
2867	void hl_enable_err_info_capture(struct hl_error_info *captured_err_info)
2868	{
2869	vfree(addr: captured_err_info->page_fault_info.user_mappings);
2870	memset(captured_err_info, `0`, sizeof(struct hl_error_info));
2871	atomic_set(v: &captured_err_info->cs_timeout.write_enable, i: `1`);
2872	captured_err_info->undef_opcode.write_enable = true;
2873	}
2874
/*
 * hl_init_cpu_for_irq - build the IRQ affinity mask of the device
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Compiled to an empty function when CONFIG_NUMA is not set. Otherwise, for every online
 * CPU of the device's NUMA node, the first CPU of its topology sibling mask is added to the
 * device's irq_affinity_mask. Nothing is done if the device has no NUMA node (negative
 * node id) or if the node has no online CPUs.
 */
void hl_init_cpu_for_irq(struct hl_device *hdev)
{
#ifdef CONFIG_NUMA
	struct cpumask *available_mask = &hdev->irq_affinity_mask;
	int numa_node = hdev->pdev->dev.numa_node;
	static struct cpumask cpu_mask;
	int cpu;

	if (numa_node < 0)
		return;

	if (!cpumask_and(&cpu_mask, cpumask_of_node(numa_node), cpu_online_mask)) {
		dev_err(hdev->dev, "No available affinities in current numa node\n");
		return;
	}

	/* Remove HT siblings - only the first sibling of every CPU is used */
	for_each_cpu(cpu, &cpu_mask)
		cpumask_set_cpu(cpumask_first(topology_sibling_cpumask(cpu)), available_mask);
#endif
}
2895
2896	void hl_set_irq_affinity(struct hl_device hdev, int* irq)
2897	{
2898	if (cpumask_empty(srcp: &hdev->irq_affinity_mask)) {
2899	dev_dbg(hdev->dev, "affinity mask is empty\n");
2900	return;
2901	}
2902
2903	if (irq_set_affinity_and_hint(irq, m: &hdev->irq_affinity_mask))
2904	dev_err(hdev->dev, "Failed setting irq %d affinity\n", irq);
2905	}
2906
2907	void hl_eq_heartbeat_event_handle(struct hl_device *hdev)
2908	{
2909	hdev->heartbeat_debug_info.heartbeat_event_counter++;
2910	hdev->heartbeat_debug_info.last_eq_heartbeat_ts = ktime_get_real_seconds();
2911	hdev->eq_heartbeat_received = true;
2912	}
2913
2914	void hl_handle_clk_change_event(struct hl_device hdev, u16 event_type, u64 event_mask)
2915	{
2916	struct hl_clk_throttle *clk_throttle = &hdev->clk_throttling;
2917	ktime_t zero_time = ktime_set(secs: `0`, nsecs: `0`);
2918
2919	mutex_lock(&clk_throttle->lock);
2920
2921	switch (event_type) {
2922	case EQ_EVENT_POWER_EVT_START:
2923	clk_throttle->current_reason \|= HL_CLK_THROTTLE_POWER;
2924	clk_throttle->aggregated_reason \|= HL_CLK_THROTTLE_POWER;
2925	clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_POWER].start = ktime_get();
2926	clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = zero_time;
2927	dev_dbg_ratelimited(hdev->dev, "Clock throttling due to power consumption\n");
2928	break;
2929
2930	case EQ_EVENT_POWER_EVT_END:
2931	clk_throttle->current_reason &= ~HL_CLK_THROTTLE_POWER;
2932	clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = ktime_get();
2933	dev_dbg_ratelimited(hdev->dev, "Power envelop is safe, back to optimal clock\n");
2934	break;
2935
2936	case EQ_EVENT_THERMAL_EVT_START:
2937	clk_throttle->current_reason \|= HL_CLK_THROTTLE_THERMAL;
2938	clk_throttle->aggregated_reason \|= HL_CLK_THROTTLE_THERMAL;
2939	clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get();
2940	clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time;
2941	*event_mask \|= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
2942	dev_info_ratelimited(hdev->dev, "Clock throttling due to overheating\n");
2943	break;
2944
2945	case EQ_EVENT_THERMAL_EVT_END:
2946	clk_throttle->current_reason &= ~HL_CLK_THROTTLE_THERMAL;
2947	clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get();
2948	*event_mask \|= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
2949	dev_info_ratelimited(hdev->dev, "Thermal envelop is safe, back to optimal clock\n");
2950	break;
2951
2952	default:
2953	dev_err(hdev->dev, "Received invalid clock change event %d\n", event_type);
2954	break;
2955	}
2956
2957	mutex_unlock(lock: &clk_throttle->lock);
2958	}
2959
2960	void hl_eq_cpld_shutdown_event_handle(struct hl_device hdev, u16 event_id, u64 event_mask)
2961	{
2962	hl_handle_critical_hw_err(hdev, event_id, event_mask);
2963	*event_mask \|= HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
2964
2965	/ Avoid any new accesses to the H/W /
2966	hdev->disabled = true;
2967	hdev->cpld_shutdown = true;
2968	}
2969

source code of linux/drivers/accel/habanalabs/common/device.c