habanalabs: Manipulate DMA addresses in ASIC functions
author Tomer Tayar <ttayar@habana.ai>
Wed, 1 May 2019 08:28:15 +0000 (11:28 +0300)
committer Oded Gabbay <oded.gabbay@gmail.com>
Wed, 1 May 2019 08:28:15 +0000 (11:28 +0300)
Routing device accesses to the host memory requires the use of a base
offset, which the iATU cancels just before the access leaves the device.
The value of the base offset may differ between ASIC types.
This address manipulation is currently performed throughout the driver
code, and one must be aware of it whenever providing a host memory
address to the device.
This patch removes this manipulation from the driver common code, and
moves it to the ASIC specific functions that are responsible for
host memory allocation/mapping.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
drivers/misc/habanalabs/firmware_if.c
drivers/misc/habanalabs/goya/goya.c
drivers/misc/habanalabs/habanalabs.h
drivers/misc/habanalabs/hw_queue.c
drivers/misc/habanalabs/memory.c
drivers/misc/habanalabs/pci.c

index 1acf82650b206a96272502ded8e09024329778a7..eda5d7fcb79f269e1d01065c1b34d8080b75d5ca 100644 (file)
@@ -249,8 +249,7 @@ int hl_fw_armcp_info_get(struct hl_device *hdev)
 
        pkt.ctl = cpu_to_le32(ARMCP_PACKET_INFO_GET <<
                                ARMCP_PKT_CTL_OPCODE_SHIFT);
-       pkt.addr = cpu_to_le64(armcp_info_dma_addr +
-                               prop->host_phys_base_address);
+       pkt.addr = cpu_to_le64(armcp_info_dma_addr);
        pkt.data_max_size = cpu_to_le32(sizeof(struct armcp_info));
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
@@ -281,7 +280,6 @@ out:
 
 int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
 {
-       struct asic_fixed_properties *prop = &hdev->asic_prop;
        struct armcp_packet pkt = {};
        void *eeprom_info_cpu_addr;
        dma_addr_t eeprom_info_dma_addr;
@@ -301,8 +299,7 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
 
        pkt.ctl = cpu_to_le32(ARMCP_PACKET_EEPROM_DATA_GET <<
                                ARMCP_PKT_CTL_OPCODE_SHIFT);
-       pkt.addr = cpu_to_le64(eeprom_info_dma_addr +
-                               prop->host_phys_base_address);
+       pkt.addr = cpu_to_le64(eeprom_info_dma_addr);
        pkt.data_max_size = cpu_to_le32(max_size);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
index 8e18c80a22e71cb4f9a61d75bcc35efaaefdc42b..31dc3b872f9e2b8a88cef729d14cf08586a9b9d6 100644 (file)
@@ -345,7 +345,6 @@ void goya_get_fixed_properties(struct hl_device *hdev)
        prop->mmu_hop0_tables_total_size = HOP0_TABLES_TOTAL_SIZE;
        prop->dram_page_size = PAGE_SIZE_2MB;
 
-       prop->host_phys_base_address = HOST_PHYS_BASE;
        prop->va_space_host_start_address = VA_HOST_SPACE_START;
        prop->va_space_host_end_address = VA_HOST_SPACE_END;
        prop->va_space_dram_start_address = VA_DDR_SPACE_START;
@@ -422,7 +421,7 @@ static u64 goya_set_ddr_bar_base(struct hl_device *hdev, u64 addr)
 static int goya_init_iatu(struct hl_device *hdev)
 {
        return hl_pci_init_iatu(hdev, SRAM_BASE_ADDR, DRAM_PHYS_BASE,
-                               HOST_PHYS_SIZE);
+                               HOST_PHYS_BASE, HOST_PHYS_SIZE);
 }
 
 /*
@@ -804,7 +803,6 @@ void goya_init_dma_qmans(struct hl_device *hdev)
 {
        struct goya_device *goya = hdev->asic_specific;
        struct hl_hw_queue *q;
-       dma_addr_t bus_address;
        int i;
 
        if (goya->hw_cap_initialized & HW_CAP_DMA)
@@ -813,10 +811,7 @@ void goya_init_dma_qmans(struct hl_device *hdev)
        q = &hdev->kernel_queues[0];
 
        for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++, q++) {
-               bus_address = q->bus_address +
-                               hdev->asic_prop.host_phys_base_address;
-
-               goya_init_dma_qman(hdev, i, bus_address);
+               goya_init_dma_qman(hdev, i, q->bus_address);
                goya_init_dma_ch(hdev, i);
        }
 
@@ -957,7 +952,6 @@ int goya_init_cpu_queues(struct hl_device *hdev)
 {
        struct goya_device *goya = hdev->asic_specific;
        struct hl_eq *eq;
-       dma_addr_t bus_address;
        u32 status;
        struct hl_hw_queue *cpu_pq = &hdev->kernel_queues[GOYA_QUEUE_ID_CPU_PQ];
        int err;
@@ -970,19 +964,18 @@ int goya_init_cpu_queues(struct hl_device *hdev)
 
        eq = &hdev->event_queue;
 
-       bus_address = cpu_pq->bus_address +
-                       hdev->asic_prop.host_phys_base_address;
-       WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_0, lower_32_bits(bus_address));
-       WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_1, upper_32_bits(bus_address));
+       WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_0,
+                       lower_32_bits(cpu_pq->bus_address));
+       WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_1,
+                       upper_32_bits(cpu_pq->bus_address));
 
-       bus_address = eq->bus_address + hdev->asic_prop.host_phys_base_address;
-       WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_2, lower_32_bits(bus_address));
-       WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_3, upper_32_bits(bus_address));
+       WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_2, lower_32_bits(eq->bus_address));
+       WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_3, upper_32_bits(eq->bus_address));
 
-       bus_address = hdev->cpu_accessible_dma_address +
-                       hdev->asic_prop.host_phys_base_address;
-       WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_8, lower_32_bits(bus_address));
-       WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_9, upper_32_bits(bus_address));
+       WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_8,
+                       lower_32_bits(hdev->cpu_accessible_dma_address));
+       WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_9,
+                       upper_32_bits(hdev->cpu_accessible_dma_address));
 
        WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_5, HL_QUEUE_SIZE_IN_BYTES);
        WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_4, HL_EQ_SIZE_IN_BYTES);
@@ -2731,13 +2724,23 @@ void goya_flush_pq_write(struct hl_device *hdev, u64 *pq, u64 exp_val)
 static void *goya_dma_alloc_coherent(struct hl_device *hdev, size_t size,
                                        dma_addr_t *dma_handle, gfp_t flags)
 {
-       return dma_alloc_coherent(&hdev->pdev->dev, size, dma_handle, flags);
+       void *kernel_addr = dma_alloc_coherent(&hdev->pdev->dev, size,
+                                               dma_handle, flags);
+
+       /* Shift to the device's base physical address of host memory */
+       if (kernel_addr)
+               *dma_handle += HOST_PHYS_BASE;
+
+       return kernel_addr;
 }
 
 static void goya_dma_free_coherent(struct hl_device *hdev, size_t size,
                                        void *cpu_addr, dma_addr_t dma_handle)
 {
-       dma_free_coherent(&hdev->pdev->dev, size, cpu_addr, dma_handle);
+       /* Cancel the device's base physical address of host memory */
+       dma_addr_t fixed_dma_handle = dma_handle - HOST_PHYS_BASE;
+
+       dma_free_coherent(&hdev->pdev->dev, size, cpu_addr, fixed_dma_handle);
 }
 
 void *goya_get_int_queue_base(struct hl_device *hdev, u32 queue_id,
@@ -2848,8 +2851,7 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job)
                        (1 << GOYA_PKT_CTL_MB_SHIFT);
        fence_pkt->ctl = cpu_to_le32(tmp);
        fence_pkt->value = cpu_to_le32(GOYA_QMAN0_FENCE_VAL);
-       fence_pkt->addr = cpu_to_le64(fence_dma_addr +
-                                       hdev->asic_prop.host_phys_base_address);
+       fence_pkt->addr = cpu_to_le64(fence_dma_addr);
 
        rc = hl_hw_queue_send_cb_no_cmpl(hdev, GOYA_QUEUE_ID_DMA_0,
                                        job->job_cb_size, cb->bus_address);
@@ -2928,8 +2930,7 @@ int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id)
                        (1 << GOYA_PKT_CTL_MB_SHIFT);
        fence_pkt->ctl = cpu_to_le32(tmp);
        fence_pkt->value = cpu_to_le32(fence_val);
-       fence_pkt->addr = cpu_to_le64(fence_dma_addr +
-                                       hdev->asic_prop.host_phys_base_address);
+       fence_pkt->addr = cpu_to_le64(fence_dma_addr);
 
        rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id,
                                        sizeof(struct packet_msg_prot),
@@ -3001,16 +3002,27 @@ int goya_test_queues(struct hl_device *hdev)
 static void *goya_dma_pool_zalloc(struct hl_device *hdev, size_t size,
                                        gfp_t mem_flags, dma_addr_t *dma_handle)
 {
+       void *kernel_addr;
+
        if (size > GOYA_DMA_POOL_BLK_SIZE)
                return NULL;
 
-       return dma_pool_zalloc(hdev->dma_pool, mem_flags, dma_handle);
+       kernel_addr =  dma_pool_zalloc(hdev->dma_pool, mem_flags, dma_handle);
+
+       /* Shift to the device's base physical address of host memory */
+       if (kernel_addr)
+               *dma_handle += HOST_PHYS_BASE;
+
+       return kernel_addr;
 }
 
 static void goya_dma_pool_free(struct hl_device *hdev, void *vaddr,
                                dma_addr_t dma_addr)
 {
-       dma_pool_free(hdev->dma_pool, vaddr, dma_addr);
+       /* Cancel the device's base physical address of host memory */
+       dma_addr_t fixed_dma_addr = dma_addr - HOST_PHYS_BASE;
+
+       dma_pool_free(hdev->dma_pool, vaddr, fixed_dma_addr);
 }
 
 void *goya_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size,
@@ -3025,19 +3037,33 @@ void goya_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
        hl_fw_cpu_accessible_dma_pool_free(hdev, size, vaddr);
 }
 
-static int goya_dma_map_sg(struct hl_device *hdev, struct scatterlist *sg,
+static int goya_dma_map_sg(struct hl_device *hdev, struct scatterlist *sgl,
                                int nents, enum dma_data_direction dir)
 {
-       if (!dma_map_sg(&hdev->pdev->dev, sg, nents, dir))
+       struct scatterlist *sg;
+       int i;
+
+       if (!dma_map_sg(&hdev->pdev->dev, sgl, nents, dir))
                return -ENOMEM;
 
+       /* Shift to the device's base physical address of host memory */
+       for_each_sg(sgl, sg, nents, i)
+               sg->dma_address += HOST_PHYS_BASE;
+
        return 0;
 }
 
-static void goya_dma_unmap_sg(struct hl_device *hdev, struct scatterlist *sg,
+static void goya_dma_unmap_sg(struct hl_device *hdev, struct scatterlist *sgl,
                                int nents, enum dma_data_direction dir)
 {
-       dma_unmap_sg(&hdev->pdev->dev, sg, nents, dir);
+       struct scatterlist *sg;
+       int i;
+
+       /* Cancel the device's base physical address of host memory */
+       for_each_sg(sgl, sg, nents, i)
+               sg->dma_address -= HOST_PHYS_BASE;
+
+       dma_unmap_sg(&hdev->pdev->dev, sgl, nents, dir);
 }
 
 u32 goya_get_dma_desc_list_size(struct hl_device *hdev, struct sg_table *sgt)
@@ -3589,8 +3615,6 @@ static int goya_patch_dma_packet(struct hl_device *hdev,
                new_dma_pkt->ctl = cpu_to_le32(ctl);
                new_dma_pkt->tsize = cpu_to_le32((u32) len);
 
-               dma_addr += hdev->asic_prop.host_phys_base_address;
-
                if (dir == DMA_TO_DEVICE) {
                        new_dma_pkt->src_addr = cpu_to_le64(dma_addr);
                        new_dma_pkt->dst_addr = cpu_to_le64(device_memory_addr);
index b64594be6dbd1f66f56671c58f990a0da6b86503..f08f71982585061c8eed4e4a165b35c1f6d78b29 100644 (file)
@@ -135,8 +135,6 @@ enum hl_device_hw_state {
  * @dram_user_base_address: DRAM physical start address for user access.
  * @dram_size: DRAM total size.
  * @dram_pci_bar_size: size of PCI bar towards DRAM.
- * @host_phys_base_address: base physical address of host memory for
- *                             transactions that the device generates.
  * @max_power_default: max power of the device after reset
  * @va_space_host_start_address: base address of virtual memory range for
  *                               mapping host memory.
@@ -184,7 +182,6 @@ struct asic_fixed_properties {
        u64                     dram_user_base_address;
        u64                     dram_size;
        u64                     dram_pci_bar_size;
-       u64                     host_phys_base_address;
        u64                     max_power_default;
        u64                     va_space_host_start_address;
        u64                     va_space_host_end_address;
@@ -537,11 +534,11 @@ struct hl_asic_funcs {
        void (*cpu_accessible_dma_pool_free)(struct hl_device *hdev,
                                size_t size, void *vaddr);
        void (*hl_dma_unmap_sg)(struct hl_device *hdev,
-                               struct scatterlist *sg, int nents,
+                               struct scatterlist *sgl, int nents,
                                enum dma_data_direction dir);
        int (*cs_parser)(struct hl_device *hdev, struct hl_cs_parser *parser);
        int (*asic_dma_map_sg)(struct hl_device *hdev,
-                               struct scatterlist *sg, int nents,
+                               struct scatterlist *sgl, int nents,
                                enum dma_data_direction dir);
        u32 (*get_dma_desc_list_size)(struct hl_device *hdev,
                                        struct sg_table *sgt);
@@ -1450,7 +1447,8 @@ int hl_pci_iatu_write(struct hl_device *hdev, u32 addr, u32 data);
 int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar,
                                u64 addr);
 int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address,
-                       u64 dram_base_address, u64 host_phys_size);
+                       u64 dram_base_address, u64 host_phys_base_address,
+                       u64 host_phys_size);
 int hl_pci_init(struct hl_device *hdev, u8 dma_mask);
 void hl_pci_fini(struct hl_device *hdev);
 int hl_pci_set_dma_mask(struct hl_device *hdev, u8 dma_mask);
index 6cdaa117fc40aa7be1918a7e1361fe6c5b059c17..2894d8975933481ea4501ffcdab156b70a79d8cc 100644 (file)
@@ -82,7 +82,7 @@ static void ext_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q,
        bd += hl_pi_2_offset(q->pi);
        bd->ctl = __cpu_to_le32(ctl);
        bd->len = __cpu_to_le32(len);
-       bd->ptr = __cpu_to_le64(ptr + hdev->asic_prop.host_phys_base_address);
+       bd->ptr = __cpu_to_le64(ptr);
 
        q->pi = hl_queue_inc_ptr(q->pi);
        hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
@@ -263,9 +263,7 @@ static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
         * checked in hl_queue_sanity_checks
         */
        cq = &hdev->completion_queue[q->hw_queue_id];
-       cq_addr = cq->bus_address +
-                       hdev->asic_prop.host_phys_base_address;
-       cq_addr += cq->pi * sizeof(struct hl_cq_entry);
+       cq_addr = cq->bus_address + cq->pi * sizeof(struct hl_cq_entry);
 
        hdev->asic_funcs->add_end_of_cb_packets(cb->kernel_address, len,
                                                cq_addr,
index 43ef3ad8438ab6dc2d276237e95df2b17e44ec99..d67d24c13efd8444be07f4c9f0481dd4ca20d3ac 100644 (file)
@@ -759,10 +759,6 @@ static int map_phys_page_pack(struct hl_ctx *ctx, u64 vaddr,
        for (i = 0 ; i < phys_pg_pack->npages ; i++) {
                paddr = phys_pg_pack->pages[i];
 
-               /* For accessing the host we need to turn on bit 39 */
-               if (phys_pg_pack->created_from_userptr)
-                       paddr += hdev->asic_prop.host_phys_base_address;
-
                rc = hl_mmu_map(ctx, next_vaddr, paddr, page_size);
                if (rc) {
                        dev_err(hdev->dev,
index 5278f086d65d3719ebd18add94731a4c0aef1ace..0e78a04d63f45286ad957e9094cee43df575883d 100644 (file)
@@ -236,6 +236,8 @@ int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar,
  * @hdev: Pointer to hl_device structure.
  * @sram_base_address: SRAM base address.
  * @dram_base_address: DRAM base address.
+ * @host_phys_base_address: Base physical address of host memory for device
+ *                          transactions.
  * @host_phys_size: Size of host memory for device transactions.
  *
  * This is needed in case the firmware doesn't initialize the iATU.
@@ -243,7 +245,8 @@ int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar,
  * Return: 0 on success, negative value for failure.
  */
 int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address,
-                       u64 dram_base_address, u64 host_phys_size)
+                       u64 dram_base_address, u64 host_phys_base_address,
+                       u64 host_phys_size)
 {
        struct asic_fixed_properties *prop = &hdev->asic_prop;
        u64 host_phys_end_addr;
@@ -265,11 +268,11 @@ int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address,
 
 
        /* Outbound Region 0 - Point to Host */
-       host_phys_end_addr = prop->host_phys_base_address + host_phys_size - 1;
+       host_phys_end_addr = host_phys_base_address + host_phys_size - 1;
        rc |= hl_pci_iatu_write(hdev, 0x008,
-                               lower_32_bits(prop->host_phys_base_address));
+                               lower_32_bits(host_phys_base_address));
        rc |= hl_pci_iatu_write(hdev, 0x00C,
-                               upper_32_bits(prop->host_phys_base_address));
+                               upper_32_bits(host_phys_base_address));
        rc |= hl_pci_iatu_write(hdev, 0x010, lower_32_bits(host_phys_end_addr));
        rc |= hl_pci_iatu_write(hdev, 0x014, 0);
        rc |= hl_pci_iatu_write(hdev, 0x018, 0);