aboutsummaryrefslogtreecommitdiff
path: root/drivers/misc/habanalabs/common/habanalabs.h
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/misc/habanalabs/common/habanalabs.h')
-rw-r--r--drivers/misc/habanalabs/common/habanalabs.h301
1 files changed, 207 insertions, 94 deletions
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index a2002cbf794b..cb710fd478b6 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0
*
- * Copyright 2016-2019 HabanaLabs, Ltd.
+ * Copyright 2016-2021 HabanaLabs, Ltd.
* All Rights Reserved.
*
*/
@@ -61,6 +61,8 @@
#define HL_CPUCP_INFO_TIMEOUT_USEC 10000000 /* 10s */
#define HL_CPUCP_EEPROM_TIMEOUT_USEC 10000000 /* 10s */
+#define HL_FW_STATUS_POLL_INTERVAL_USEC 10000 /* 10ms */
+
#define HL_PCI_ELBI_TIMEOUT_MSEC 10 /* 10ms */
#define HL_SIM_MAX_TIMEOUT_US 10000000 /* 10s */
@@ -117,37 +119,37 @@ enum hl_mmu_page_table_location {
/*
* Reset Flags
*
- * - HL_RESET_HARD
+ * - HL_DRV_RESET_HARD
* If set do hard reset to all engines. If not set reset just
* compute/DMA engines.
*
- * - HL_RESET_FROM_RESET_THREAD
+ * - HL_DRV_RESET_FROM_RESET_THR
* Set if the caller is the hard-reset thread
*
- * - HL_RESET_HEARTBEAT
+ * - HL_DRV_RESET_HEARTBEAT
* Set if reset is due to heartbeat
*
- * - HL_RESET_TDR
+ * - HL_DRV_RESET_TDR
* Set if reset is due to TDR
*
- * - HL_RESET_DEVICE_RELEASE
+ * - HL_DRV_RESET_DEV_RELEASE
* Set if reset is due to device release
*
- * - HL_RESET_FW
+ * - HL_DRV_RESET_BYPASS_REQ_TO_FW
* F/W will perform the reset. No need to ask it to reset the device. This is relevant
* only when running with secured f/w
*
- * - HL_RESET_FW_FATAL_ERR
+ * - HL_DRV_RESET_FW_FATAL_ERR
* Set if reset is due to a fatal error from FW
*/
-#define HL_RESET_HARD (1 << 0)
-#define HL_RESET_FROM_RESET_THREAD (1 << 1)
-#define HL_RESET_HEARTBEAT (1 << 2)
-#define HL_RESET_TDR (1 << 3)
-#define HL_RESET_DEVICE_RELEASE (1 << 4)
-#define HL_RESET_FW (1 << 5)
-#define HL_RESET_FW_FATAL_ERR (1 << 6)
+#define HL_DRV_RESET_HARD (1 << 0)
+#define HL_DRV_RESET_FROM_RESET_THR (1 << 1)
+#define HL_DRV_RESET_HEARTBEAT (1 << 2)
+#define HL_DRV_RESET_TDR (1 << 3)
+#define HL_DRV_RESET_DEV_RELEASE (1 << 4)
+#define HL_DRV_RESET_BYPASS_REQ_TO_FW (1 << 5)
+#define HL_DRV_RESET_FW_FATAL_ERR (1 << 6)
#define HL_MAX_SOBS_PER_MONITOR 8
@@ -219,6 +221,7 @@ enum hl_fw_component {
/**
* enum hl_fw_types - F/W types present in the system
+ * @FW_TYPE_NONE: no FW component indication
* @FW_TYPE_LINUX: Linux image for device CPU
* @FW_TYPE_BOOT_CPU: Boot image for device CPU
* @FW_TYPE_PREBOOT_CPU: Indicates pre-loaded CPUs are present in the system
@@ -226,6 +229,7 @@ enum hl_fw_component {
* @FW_TYPE_ALL_TYPES: Mask for all types
*/
enum hl_fw_types {
+ FW_TYPE_NONE = 0x0,
FW_TYPE_LINUX = 0x1,
FW_TYPE_BOOT_CPU = 0x2,
FW_TYPE_PREBOOT_CPU = 0x4,
@@ -353,6 +357,21 @@ enum vm_type {
};
/**
+ * enum mmu_op_flags - mmu operation relevant information.
+ * @MMU_OP_USERPTR: operation on user memory (host resident).
+ * @MMU_OP_PHYS_PACK: operation on DRAM (device resident).
+ * @MMU_OP_CLEAR_MEMCACHE: operation has to clear memcache.
+ * @MMU_OP_SKIP_LOW_CACHE_INV: operation is allowed to skip parts of cache invalidation.
+ */
+enum mmu_op_flags {
+ MMU_OP_USERPTR = 0x1,
+ MMU_OP_PHYS_PACK = 0x2,
+ MMU_OP_CLEAR_MEMCACHE = 0x4,
+ MMU_OP_SKIP_LOW_CACHE_INV = 0x8,
+};
+
+
+/**
* enum hl_device_hw_state - H/W device state. use this to understand whether
* to do reset before hw_init or not
* @HL_DEVICE_HW_STATE_CLEAN: H/W state is clean. i.e. after hard reset
@@ -382,6 +401,7 @@ enum hl_device_hw_state {
* @hop3_mask: mask to get the PTE address in hop 3.
* @hop4_mask: mask to get the PTE address in hop 4.
* @hop5_mask: mask to get the PTE address in hop 5.
+ * @last_mask: mask to get the bit indicating this is the last hop.
* @page_size: default page size used to allocate memory.
* @num_hops: The amount of hops supported by the translation table.
* @host_resident: Should the MMU page table reside in host memory or in the
@@ -402,6 +422,7 @@ struct hl_mmu_properties {
u64 hop3_mask;
u64 hop4_mask;
u64 hop5_mask;
+ u64 last_mask;
u32 page_size;
u32 num_hops;
u8 host_resident;
@@ -524,6 +545,15 @@ struct hl_hints_range {
* @dynamic_fw_load: is dynamic FW load is supported.
* @gic_interrupts_enable: true if FW is not blocking GIC controller,
* false otherwise.
+ * @use_get_power_for_reset_history: To support backward compatibility for Goya
+ * and Gaudi
+ * @supports_soft_reset: is soft reset supported.
+ * @allow_inference_soft_reset: true if the ASIC supports soft reset that is
+ * initiated by user or TDR. This is only true
+ * in inference ASICs, as there is no real-world
+ * use-case of doing soft-reset in training (due
+ * to the fact that training runs on multiple
+ * devices)
*/
struct asic_fixed_properties {
struct hw_queue_properties *hw_queues_props;
@@ -604,6 +634,9 @@ struct asic_fixed_properties {
u8 iatu_done_by_fw;
u8 dynamic_fw_load;
u8 gic_interrupts_enable;
+ u8 use_get_power_for_reset_history;
+ u8 supports_soft_reset;
+ u8 allow_inference_soft_reset;
};
/**
@@ -852,10 +885,15 @@ struct hl_user_interrupt {
* pending on an interrupt
* @wait_list_node: node in the list of user threads pending on an interrupt
* @fence: hl fence object for interrupt completion
+ * @cq_target_value: CQ target value
+ * @cq_kernel_addr: CQ kernel address, to be used in the cq interrupt
+ * handler for taget value comparison
*/
struct hl_user_pending_interrupt {
struct list_head wait_list_node;
struct hl_fence fence;
+ u64 cq_target_value;
+ u64 *cq_kernel_addr;
};
/**
@@ -1010,6 +1048,7 @@ struct fw_response {
* @image_region: region to copy the FW image to
* @fw_image_size: size of FW image to load
* @wait_for_bl_timeout: timeout for waiting for boot loader to respond
+ * @fw_desc_valid: true if FW descriptor has been validated and hence the data can be used
*/
struct dynamic_fw_load_mgr {
struct fw_response response;
@@ -1017,6 +1056,7 @@ struct dynamic_fw_load_mgr {
struct pci_mem_region *image_region;
size_t fw_image_size;
u32 wait_for_bl_timeout;
+ bool fw_desc_valid;
};
/**
@@ -1042,7 +1082,8 @@ struct fw_image_props {
* @skip_bmc: should BMC be skipped
* @sram_bar_id: SRAM bar ID
* @dram_bar_id: DRAM bar ID
- * @linux_loaded: true if linux was loaded so far
+ * @fw_comp_loaded: bitmask of loaded FW components. set bit meaning loaded
+ * component. values are set according to enum hl_fw_types.
*/
struct fw_load_mgr {
union {
@@ -1056,7 +1097,7 @@ struct fw_load_mgr {
u8 skip_bmc;
u8 sram_bar_id;
u8 dram_bar_id;
- u8 linux_loaded;
+ u8 fw_comp_loaded;
};
/**
@@ -1128,7 +1169,7 @@ struct fw_load_mgr {
* @disable_clock_gating: disable clock gating completely
* @debug_coresight: perform certain actions on Coresight for debugging.
* @is_device_idle: return true if device is idle, false otherwise.
- * @soft_reset_late_init: perform certain actions needed after soft reset.
+ * @non_hard_reset_late_init: perform certain actions needed after a reset which is not hard-reset
* @hw_queues_lock: acquire H/W queues lock.
* @hw_queues_unlock: release H/W queues lock.
* @get_pci_id: retrieve PCI ID.
@@ -1261,10 +1302,10 @@ struct hl_asic_funcs {
int (*send_heartbeat)(struct hl_device *hdev);
void (*set_clock_gating)(struct hl_device *hdev);
void (*disable_clock_gating)(struct hl_device *hdev);
- int (*debug_coresight)(struct hl_device *hdev, void *data);
+ int (*debug_coresight)(struct hl_device *hdev, struct hl_ctx *ctx, void *data);
bool (*is_device_idle)(struct hl_device *hdev, u64 *mask_arr,
u8 mask_len, struct seq_file *s);
- int (*soft_reset_late_init)(struct hl_device *hdev);
+ int (*non_hard_reset_late_init)(struct hl_device *hdev);
void (*hw_queues_lock)(struct hl_device *hdev);
void (*hw_queues_unlock)(struct hl_device *hdev);
u32 (*get_pci_id)(struct hl_device *hdev);
@@ -1276,7 +1317,7 @@ struct hl_asic_funcs {
int (*init_iatu)(struct hl_device *hdev);
u32 (*rreg)(struct hl_device *hdev, u32 reg);
void (*wreg)(struct hl_device *hdev, u32 reg, u32 val);
- void (*halt_coresight)(struct hl_device *hdev);
+ void (*halt_coresight)(struct hl_device *hdev, struct hl_ctx *ctx);
int (*ctx_init)(struct hl_ctx *ctx);
void (*ctx_fini)(struct hl_ctx *ctx);
int (*get_clk_rate)(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk);
@@ -1518,6 +1559,9 @@ struct hl_userptr {
* @submission_time_jiffies: submission time of the cs
* @type: CS_TYPE_*.
* @encaps_sig_hdl_id: encaps signals handle id, set for the first staged cs.
+ * @sob_addr_offset: sob offset from the configuration base address.
+ * @initial_sob_count: count of completed signals in SOB before current submission of signal or
+ * cs with encaps signals.
* @submitted: true if CS was submitted to H/W.
* @completed: true if CS was completed by device.
* @timedout : true if CS was timedout.
@@ -1553,6 +1597,8 @@ struct hl_cs {
u64 submission_time_jiffies;
enum hl_cs_type type;
u32 encaps_sig_hdl_id;
+ u32 sob_addr_offset;
+ u16 initial_sob_count;
u8 submitted;
u8 completed;
u8 timedout;
@@ -1792,7 +1838,6 @@ struct hl_debug_params {
* @dev_node: node in the device list of file private data
* @refcount: number of related contexts.
* @restore_phase_mutex: lock for context switch and restore phase.
- * @is_control: true for control device, false otherwise
*/
struct hl_fpriv {
struct hl_device *hdev;
@@ -1805,7 +1850,6 @@ struct hl_fpriv {
struct list_head dev_node;
struct kref refcount;
struct mutex restore_phase_mutex;
- u8 is_control;
};
@@ -1864,6 +1908,7 @@ struct hl_debugfs_entry {
* @i2c_bus: generic u8 debugfs file for bus value to use in i2c_data_read.
* @i2c_addr: generic u8 debugfs file for address value to use in i2c_data_read.
* @i2c_reg: generic u8 debugfs file for register value to use in i2c_data_read.
+ * @i2c_len: generic u8 debugfs file for length value to use in i2c_data_read.
*/
struct hl_dbg_device_entry {
struct dentry *root;
@@ -1892,6 +1937,7 @@ struct hl_dbg_device_entry {
u8 i2c_bus;
u8 i2c_addr;
u8 i2c_reg;
+ u8 i2c_len;
};
/**
@@ -2180,13 +2226,13 @@ struct hwmon_chip_info;
* @wq: work queue for device reset procedure.
* @reset_work: reset work to be done.
* @hdev: habanalabs device structure.
- * @fw_reset: whether f/w will do the reset without us sending them a message to do it.
+ * @flags: reset flags.
*/
struct hl_device_reset_work {
struct workqueue_struct *wq;
struct delayed_work reset_work;
struct hl_device *hdev;
- bool fw_reset;
+ u32 flags;
};
/**
@@ -2328,12 +2374,10 @@ struct multi_cs_completion {
* @ctx: pointer to the context structure
* @fence_arr: array of fences of all CSs
* @seq_arr: array of CS sequence numbers
- * @timeout_us: timeout in usec for waiting for CS to complete
+ * @timeout_jiffies: timeout in jiffies for waiting for CS to complete
* @timestamp: timestamp of first completed CS
* @wait_status: wait for CS status
* @completion_bitmap: bitmap of completed CSs (1- completed, otherwise 0)
- * @stream_master_qid_map: bitmap of all stream master QIDs on which the
- * multi-CS is waiting
* @arr_len: fence_arr and seq_arr array length
* @gone_cs: indication of gone CS (1- there was gone CS, otherwise 0)
* @update_ts: update timestamp. 1- update the timestamp, otherwise 0.
@@ -2342,17 +2386,114 @@ struct multi_cs_data {
struct hl_ctx *ctx;
struct hl_fence **fence_arr;
u64 *seq_arr;
- s64 timeout_us;
+ s64 timeout_jiffies;
s64 timestamp;
long wait_status;
u32 completion_bitmap;
- u32 stream_master_qid_map;
u8 arr_len;
u8 gone_cs;
u8 update_ts;
};
/**
+ * struct hl_clk_throttle_timestamp - current/last clock throttling timestamp
+ * @start: timestamp taken when 'start' event is received in driver
+ * @end: timestamp taken when 'end' event is received in driver
+ */
+struct hl_clk_throttle_timestamp {
+ ktime_t start;
+ ktime_t end;
+};
+
+/**
+ * struct hl_clk_throttle - keeps current/last clock throttling timestamps
+ * @timestamp: timestamp taken by driver and firmware, index 0 refers to POWER
+ * index 1 refers to THERMAL
+ * @lock: protects this structure as it can be accessed from both event queue
+ * context and info_ioctl context
+ * @current_reason: bitmask represents the current clk throttling reasons
+ * @aggregated_reason: bitmask represents aggregated clk throttling reasons since driver load
+ */
+struct hl_clk_throttle {
+ struct hl_clk_throttle_timestamp timestamp[HL_CLK_THROTTLE_TYPE_MAX];
+ struct mutex lock;
+ u32 current_reason;
+ u32 aggregated_reason;
+};
+
+/**
+ * struct last_error_session_info - info about last session in which CS timeout or
+ * razwi error occurred.
+ * @open_dev_timestamp: device open timestamp.
+ * @cs_timeout_timestamp: CS timeout timestamp.
+ * @razwi_timestamp: razwi timestamp.
+ * @cs_write_disable: if set writing to CS parameters in the structure is disabled so the
+ * first (root cause) CS timeout will not be overwritten.
+ * @razwi_write_disable: if set writing to razwi parameters in the structure is disabled so the
+ * first (root cause) razwi will not be overwritten.
+ * @cs_timeout_seq: CS timeout sequence number.
+ * @razwi_addr: address that caused razwi.
+ * @razwi_engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does
+ * not have engine id it will be set to U16_MAX.
+ * @razwi_engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible
+ * engines which one them caused the razwi. In that case, it will contain the
+ * second possible engine id, otherwise it will be set to U16_MAX.
+ * @razwi_non_engine_initiator: in case the initiator of the razwi does not have engine id.
+ * @razwi_type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX.
+ */
+struct last_error_session_info {
+ ktime_t open_dev_timestamp;
+ ktime_t cs_timeout_timestamp;
+ ktime_t razwi_timestamp;
+ atomic_t cs_write_disable;
+ atomic_t razwi_write_disable;
+ u64 cs_timeout_seq;
+ u64 razwi_addr;
+ u16 razwi_engine_id_1;
+ u16 razwi_engine_id_2;
+ u8 razwi_non_engine_initiator;
+ u8 razwi_type;
+};
+
+/**
+ * struct hl_reset_info - holds current device reset information.
+ * @lock: lock to protect critical reset flows.
+ * @soft_reset_cnt: number of soft reset since the driver was loaded.
+ * @hard_reset_cnt: number of hard reset since the driver was loaded.
+ * @hard_reset_schedule_flags: hard reset is scheduled to after current soft reset,
+ * here we hold the hard reset flags.
+ * @in_reset: is device in reset flow.
+ * @is_in_soft_reset: Device is currently in soft reset process.
+ * @needs_reset: true if reset_on_lockup is false and device should be reset
+ * due to lockup.
+ * @hard_reset_pending: is there a hard reset work pending.
+ * @curr_reset_cause: saves an enumerated reset cause when a hard reset is
+ * triggered, and cleared after it is shared with preboot.
+ * @prev_reset_trigger: saves the previous trigger which caused a reset, overidden
+ * with a new value on next reset
+ * @reset_trigger_repeated: set if device reset is triggered more than once with
+ * same cause.
+ * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to
+ * complete instead.
+ */
+struct hl_reset_info {
+ spinlock_t lock;
+ u32 soft_reset_cnt;
+ u32 hard_reset_cnt;
+ u32 hard_reset_schedule_flags;
+ u8 in_reset;
+ u8 is_in_soft_reset;
+ u8 needs_reset;
+ u8 hard_reset_pending;
+
+ u8 curr_reset_cause;
+ u8 prev_reset_trigger;
+ u8 reset_trigger_repeated;
+
+ u8 skip_reset_on_timeout;
+};
+
+/**
* struct hl_device - habanalabs device structure.
* @pdev: pointer to PCI device, can be NULL in case of simulator device.
* @pcie_bar_phys: array of available PCIe bars physical addresses.
@@ -2363,7 +2504,6 @@ struct multi_cs_data {
* @cdev_ctrl: char device for control operations only (INFO IOCTL)
* @dev: related kernel basic device structure.
* @dev_ctrl: related kernel device structure for the control device
- * @work_freq: delayed work to lower device frequency if possible.
* @work_heartbeat: delayed work for CPU-CP is-alive check.
* @device_reset_work: delayed work which performs hard reset
* @asic_name: ASIC specific name.
@@ -2398,7 +2538,6 @@ struct multi_cs_data {
* @asic_specific: ASIC specific information to use only from ASIC files.
* @vm: virtual memory manager for MMU.
* @hwmon_dev: H/W monitor device.
- * @pm_mng_profile: current power management profile.
* @hl_chip_info: ASIC's sensors information.
* @device_status_description: device status description.
* @hl_debugfs: device's debugfs manager.
@@ -2410,8 +2549,10 @@ struct multi_cs_data {
* @internal_cb_va_base: internal cb pool mmu virtual address base
* @fpriv_list: list of file private data structures. Each structure is created
* when a user opens the device
+ * @fpriv_ctrl_list: list of file private data structures. Each structure is created
+ * when a user opens the control device
* @fpriv_list_lock: protects the fpriv_list
- * @compute_ctx: current compute context executing.
+ * @fpriv_ctrl_list_lock: protects the fpriv_ctrl_list
* @aggregated_cs_counters: aggregated cs counters among all contexts
* @mmu_priv: device-specific MMU data.
* @mmu_func: device-related MMU functions.
@@ -2419,6 +2560,10 @@ struct multi_cs_data {
* @pci_mem_region: array of memory regions in the PCI
* @state_dump_specs: constants and dictionaries needed to dump system state.
* @multi_cs_completion: array of multi-CS completion.
+ * @clk_throttling: holds information about current/previous clock throttling events
+ * @reset_info: holds current device reset information.
+ * @last_error: holds information about last session in which CS timeout or razwi error occurred.
+ * @stream_master_qid_arr: pointer to array with QIDs of master streams.
* @dram_used_mem: current DRAM memory consumption.
* @timeout_jiffies: device CS timeout value.
* @max_power: the max power of the device, as configured by the sysadmin. This
@@ -2434,20 +2579,17 @@ struct multi_cs_data {
* device initialization. Mainly used to debug and
* workaround firmware bugs
* @dram_pci_bar_start: start bus address of PCIe bar towards DRAM.
+ * @last_successful_open_ktime: timestamp (ktime) of the last successful device open.
* @last_successful_open_jif: timestamp (jiffies) of the last successful
* device open.
* @last_open_session_duration_jif: duration (jiffies) of the last device open
* session.
* @open_counter: number of successful device open operations.
- * @in_reset: is device in reset flow.
- * @curr_pll_profile: current PLL profile.
+ * @fw_poll_interval_usec: FW status poll interval in usec.
* @card_type: Various ASICs have several card types. This indicates the card
* type of the current device.
* @major: habanalabs kernel driver major.
* @high_pll: high PLL profile frequency.
- * @soft_reset_cnt: number of soft reset since the driver was loaded.
- * @hard_reset_cnt: number of hard reset since the driver was loaded.
- * @clk_throttling_reason: bitmask represents the current clk throttling reasons
* @id: device minor.
* @id_control: minor of the control device
* @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit
@@ -2455,7 +2597,6 @@ struct multi_cs_data {
* @disabled: is device disabled.
* @late_init_done: is late init stage was done during initialization.
* @hwmon_initialized: is H/W monitor sensors was initialized.
- * @hard_reset_pending: is there a hard reset work pending.
* @heartbeat: is heartbeat sanity check towards CPU-CP enabled.
* @reset_on_lockup: true if a reset should be done in case of stuck CS, false
* otherwise.
@@ -2467,8 +2608,9 @@ struct multi_cs_data {
* @init_done: is the initialization of the device done.
* @device_cpu_disabled: is the device CPU disabled (due to timeouts)
* @dma_mask: the dma mask that was set for this device
- * @in_debug: is device under debug. This, together with fpriv_list, enforces
- * that only a single user is configuring the debug infrastructure.
+ * @in_debug: whether the device is in a state where the profiling/tracing infrastructure
+ * can be used. This indication is needed because in some ASICs we need to do
+ * specific operations to enable that infrastructure.
* @power9_64bit_dma_enable: true to enable 64-bit DMA mask support. Relevant
* only to POWER9 machines.
* @cdev_sysfs_created: were char devices and sysfs nodes created.
@@ -2477,34 +2619,18 @@ struct multi_cs_data {
* @sync_stream_queue_idx: helper index for sync stream queues initialization.
* @collective_mon_idx: helper index for collective initialization
* @supports_coresight: is CoreSight supported.
- * @supports_soft_reset: is soft reset supported.
- * @allow_inference_soft_reset: true if the ASIC supports soft reset that is
- * initiated by user or TDR. This is only true
- * in inference ASICs, as there is no real-world
- * use-case of doing soft-reset in training (due
- * to the fact that training runs on multiple
- * devices)
* @supports_cb_mapping: is mapping a CB to the device's MMU supported.
- * @needs_reset: true if reset_on_lockup is false and device should be reset
- * due to lockup.
* @process_kill_trial_cnt: number of trials reset thread tried killing
* user processes
* @device_fini_pending: true if device_fini was called and might be
* waiting for the reset thread to finish
* @supports_staged_submission: true if staged submissions are supported
- * @curr_reset_cause: saves an enumerated reset cause when a hard reset is
- * triggered, and cleared after it is shared with preboot.
- * @prev_reset_trigger: saves the previous trigger which caused a reset, overidden
- * with a new value on next reset
- * @reset_trigger_repeated: set if device reset is triggered more than once with
- * same cause.
- * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to
- * complete instead.
* @device_cpu_is_halted: Flag to indicate whether the device CPU was already
* halted. We can't halt it again because the COMMS
* protocol will throw an error. Relevant only for
* cases where Linux was not loaded to device CPU
* @supports_wait_for_multi_cs: true if wait for multi CS is supported
+ * @is_compute_ctx_active: Whether there is an active compute context executing.
*/
struct hl_device {
struct pci_dev *pdev;
@@ -2515,7 +2641,6 @@ struct hl_device {
struct cdev cdev_ctrl;
struct device *dev;
struct device *dev_ctrl;
- struct delayed_work work_freq;
struct delayed_work work_heartbeat;
struct hl_device_reset_work device_reset_work;
char asic_name[HL_STR_MAX];
@@ -2546,7 +2671,6 @@ struct hl_device {
void *asic_specific;
struct hl_vm vm;
struct device *hwmon_dev;
- enum hl_pm_mng_profile pm_mng_profile;
struct hwmon_chip_info *hl_chip_info;
struct hl_dbg_device_entry hl_debugfs;
@@ -2560,9 +2684,9 @@ struct hl_device {
u64 internal_cb_va_base;
struct list_head fpriv_list;
+ struct list_head fpriv_ctrl_list;
struct mutex fpriv_list_lock;
-
- struct hl_ctx *compute_ctx;
+ struct mutex fpriv_ctrl_list_lock;
struct hl_cs_counters_atomic aggregated_cs_counters;
@@ -2577,6 +2701,11 @@ struct hl_device {
struct multi_cs_completion multi_cs_completion[
MULTI_CS_MAX_USER_CTX];
+ struct hl_clk_throttle clk_throttling;
+ struct last_error_session_info last_error;
+
+ struct hl_reset_info reset_info;
+
u32 *stream_master_qid_arr;
atomic64_t dram_used_mem;
u64 timeout_jiffies;
@@ -2587,21 +2716,17 @@ struct hl_device {
u64 last_successful_open_jif;
u64 last_open_session_duration_jif;
u64 open_counter;
- atomic_t in_reset;
- enum hl_pll_frequency curr_pll_profile;
+ u64 fw_poll_interval_usec;
+ ktime_t last_successful_open_ktime;
enum cpucp_card_types card_type;
u32 major;
u32 high_pll;
- u32 soft_reset_cnt;
- u32 hard_reset_cnt;
- u32 clk_throttling_reason;
u16 id;
u16 id_control;
u16 cpu_pci_msb_addr;
u8 disabled;
u8 late_init_done;
u8 hwmon_initialized;
- u8 hard_reset_pending;
u8 heartbeat;
u8 reset_on_lockup;
u8 dram_default_page_mapping;
@@ -2618,20 +2743,14 @@ struct hl_device {
u8 sync_stream_queue_idx;
u8 collective_mon_idx;
u8 supports_coresight;
- u8 supports_soft_reset;
- u8 allow_inference_soft_reset;
u8 supports_cb_mapping;
- u8 needs_reset;
u8 process_kill_trial_cnt;
u8 device_fini_pending;
u8 supports_staged_submission;
- u8 curr_reset_cause;
- u8 prev_reset_trigger;
- u8 reset_trigger_repeated;
- u8 skip_reset_on_timeout;
u8 device_cpu_is_halted;
u8 supports_wait_for_multi_cs;
u8 stream_master_qid_arr_size;
+ u8 is_compute_ctx_active;
/* Parameters for bring-up */
u64 nic_ports_mask;
@@ -2659,6 +2778,7 @@ struct hl_device {
* wait cs are used to wait of the reserved encaps signals.
* @hdev: pointer to habanalabs device structure.
* @hw_sob: pointer to H/W SOB used in the reservation.
+ * @ctx: pointer to the user's context data structure
* @cs_seq: staged cs sequence which contains encapsulated signals
* @id: idr handler id to be used to fetch the handler info
* @q_idx: stream queue index
@@ -2669,6 +2789,7 @@ struct hl_cs_encaps_sig_handle {
struct kref refcount;
struct hl_device *hdev;
struct hl_hw_sob *hw_sob;
+ struct hl_ctx *ctx;
u64 cs_seq;
u32 id;
u32 q_idx;
@@ -2757,21 +2878,9 @@ static inline bool hl_mem_area_inside_range(u64 address, u64 size,
static inline bool hl_mem_area_crosses_range(u64 address, u32 size,
u64 range_start_address, u64 range_end_address)
{
- u64 end_address = address + size;
+ u64 end_address = address + size - 1;
- if ((address >= range_start_address) &&
- (address < range_end_address))
- return true;
-
- if ((end_address >= range_start_address) &&
- (end_address < range_end_address))
- return true;
-
- if ((address < range_start_address) &&
- (end_address >= range_end_address))
- return true;
-
- return false;
+ return ((address <= range_end_address) && (range_start_address <= end_address));
}
int hl_device_open(struct inode *inode, struct file *filp);
@@ -2779,10 +2888,7 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp);
bool hl_device_operational(struct hl_device *hdev,
enum hl_device_status *status);
enum hl_device_status hl_device_status(struct hl_device *hdev);
-int hl_device_set_debug_mode(struct hl_device *hdev, bool enable);
-int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
- enum hl_asic_type asic_type, int minor);
-void destroy_hdev(struct hl_device *hdev);
+int hl_device_set_debug_mode(struct hl_device *hdev, struct hl_ctx *ctx, bool enable);
int hl_hw_queues_create(struct hl_device *hdev);
void hl_hw_queues_destroy(struct hl_device *hdev);
int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
@@ -2821,6 +2927,7 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx);
void hl_ctx_do_release(struct kref *ref);
void hl_ctx_get(struct hl_device *hdev, struct hl_ctx *ctx);
int hl_ctx_put(struct hl_ctx *ctx);
+struct hl_ctx *hl_get_compute_ctx(struct hl_device *hdev);
struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq);
int hl_ctx_get_fences(struct hl_ctx *ctx, u64 *seq_arr,
struct hl_fence **fence, u32 arr_len);
@@ -2834,7 +2941,6 @@ int hl_device_resume(struct hl_device *hdev);
int hl_device_reset(struct hl_device *hdev, u32 flags);
void hl_hpriv_get(struct hl_fpriv *hpriv);
int hl_hpriv_put(struct hl_fpriv *hpriv);
-int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq);
int hl_device_utilization(struct hl_device *hdev, u32 *utilization);
int hl_build_hwmon_channel_info(struct hl_device *hdev,
@@ -2915,6 +3021,9 @@ int hl_mmu_unmap_page(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
int hl_mmu_map_contiguous(struct hl_ctx *ctx, u64 virt_addr,
u64 phys_addr, u32 size);
int hl_mmu_unmap_contiguous(struct hl_ctx *ctx, u64 virt_addr, u32 size);
+int hl_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, u32 flags);
+int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard,
+ u32 flags, u32 asid, u64 va, u64 size);
void hl_mmu_swap_out(struct hl_ctx *ctx);
void hl_mmu_swap_in(struct hl_ctx *ctx);
int hl_mmu_if_set_funcs(struct hl_device *hdev);
@@ -2969,6 +3078,10 @@ int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev,
struct fw_load_mgr *fw_loader,
enum comms_cmd cmd, unsigned int size,
bool wait_ok, u32 timeout);
+int hl_fw_dram_replaced_row_get(struct hl_device *hdev,
+ struct cpucp_hbm_row_info *info);
+int hl_fw_dram_pending_row_get(struct hl_device *hdev, u32 *pend_rows_num);
+int hl_fw_cpucp_engine_core_asid_set(struct hl_device *hdev, u32 asid);
int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3],
bool is_wc[3]);
int hl_pci_elbi_read(struct hl_device *hdev, u64 addr, u32 *data);