author     Chris Metcalf <cmetcalf@tilera.com>  2012-06-13 14:46:40 -0400
committer  Chris Metcalf <cmetcalf@tilera.com>  2012-07-18 16:40:05 -0400
commit     bbaa22c3a0d0be4406d26e5a73d1e8e504787986
tree       4d00f1bda85d9735c60d7db1cdbdd215d5317ae4 /arch/tile/kernel
parent     3e219b91533058e242b78ac08aaa91024dd6f369
tilegx pci: support I/O to arbitrarily-cached pages
The tilegx PCI root complex support (currently only in linux-next) is limited to pages that are cached in the default manner, i.e. "hash-for-home". This change supports delivery of I/O data to pages that are cached in other ways (locally on a particular core, uncached, user-managed incoherent, etc.).

A large part of the change is support for flushing pages from the cache on their particular homes, so that we can transition the data we are delivering to or from the device appropriately. The new homecache_finv* routines handle this.

Some changes to page_table_range_init() were also required to make the fixmap code work correctly on tilegx; it hadn't been used there before. We also remove some stub mark_caches_evicted_*() routines that were just no-ops anyway.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
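As an illustration of what this enables, here is a minimal, hypothetical driver-style sketch (not part of the patch; demo_dma_from_device() and the "dev" pointer are invented, and it assumes the arch/tile homecache_alloc_pages() allocator) that streams device data into a page homed on a single cpu rather than hash-for-home, exercising the new prep/complete logic through dma_map_single() and dma_unmap_single():

    #include <linux/dma-mapping.h>
    #include <linux/errno.h>
    #include <linux/gfp.h>
    #include <linux/mm.h>
    #include <asm/homecache.h>

    /* Hypothetical example; "dev" and the device-start step are placeholders. */
    static int demo_dma_from_device(struct device *dev)
    {
            /* Home one page on cpu 0 instead of using hash-for-home. */
            struct page *pg = homecache_alloc_pages(GFP_KERNEL, 0, 0);
            void *buf;
            dma_addr_t handle;

            if (!pg)
                    return -ENOMEM;
            buf = page_address(pg);

            /* dma_map_single() finv's the locally-homed page before the I/O. */
            handle = dma_map_single(dev, buf, PAGE_SIZE, DMA_FROM_DEVICE);

            /* ... start the device transfer and wait for it to complete ... */

            /* dma_unmap_single() flushes stray hash-for-home lines to memory. */
            dma_unmap_single(dev, handle, PAGE_SIZE, DMA_FROM_DEVICE);

            /* The cpu can now read the freshly delivered data through "buf". */
            __homecache_free_pages(pg, 0);
            return 0;
    }

Before this change the unmap step was a no-op, so data that the I/O hardware had deposited in hash-for-home cache lines would not necessarily be visible through the page's real home.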
Diffstat (limited to 'arch/tile/kernel')
-rw-r--r--  arch/tile/kernel/pci-dma.c | 182
1 file changed, 143 insertions(+), 39 deletions(-)
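The first hunk below selects the home for "coherent" allocations per architecture (uncached on TILEPro, hash-for-home on TILE-Gx). For orientation, a brief, hypothetical consumer of that interface (the demo_* names and RING_BYTES size are invented, not part of the patch):

    #include <linux/dma-mapping.h>
    #include <linux/gfp.h>

    /* Hypothetical descriptor-ring allocation; RING_BYTES is made up. */
    #define RING_BYTES 4096

    static void *demo_alloc_ring(struct device *dev, dma_addr_t *bus_addr)
    {
            /*
             * Returns memory homed per PAGE_HOME_DMA: uncached on TILEPro,
             * hash-for-home on TILE-Gx, so no explicit flushing is needed.
             */
            return dma_alloc_coherent(dev, RING_BYTES, bus_addr, GFP_KERNEL);
    }

    static void demo_free_ring(struct device *dev, void *ring, dma_addr_t bus_addr)
    {
            dma_free_coherent(dev, RING_BYTES, ring, bus_addr);
    }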
diff --git a/arch/tile/kernel/pci-dma.c b/arch/tile/kernel/pci-dma.c
index b3ed19f8779..9814d7082f2 100644
--- a/arch/tile/kernel/pci-dma.c
+++ b/arch/tile/kernel/pci-dma.c
@@ -22,9 +22,15 @@
/* Generic DMA mapping functions: */
/*
- * Allocate what Linux calls "coherent" memory, which for us just
- * means uncached.
+ * Allocate what Linux calls "coherent" memory. On TILEPro this is
+ * uncached memory; on TILE-Gx it is hash-for-home memory.
*/
+#ifdef __tilepro__
+#define PAGE_HOME_DMA PAGE_HOME_UNCACHED
+#else
+#define PAGE_HOME_DMA PAGE_HOME_HASH
+#endif
+
void *dma_alloc_coherent(struct device *dev,
size_t size,
dma_addr_t *dma_handle,
@@ -48,13 +54,13 @@ void *dma_alloc_coherent(struct device *dev,
if (dma_mask <= DMA_BIT_MASK(32))
node = 0;
- pg = homecache_alloc_pages_node(node, gfp, order, PAGE_HOME_UNCACHED);
+ pg = homecache_alloc_pages_node(node, gfp, order, PAGE_HOME_DMA);
if (pg == NULL)
return NULL;
addr = page_to_phys(pg);
if (addr + size > dma_mask) {
- homecache_free_pages(addr, order);
+ __homecache_free_pages(pg, order);
return NULL;
}
@@ -87,22 +93,110 @@ EXPORT_SYMBOL(dma_free_coherent);
* can count on nothing having been touched.
*/
-/* Flush a PA range from cache page by page. */
-static void __dma_map_pa_range(dma_addr_t dma_addr, size_t size)
+/* Set up a single page for DMA access. */
+static void __dma_prep_page(struct page *page, unsigned long offset,
+ size_t size, enum dma_data_direction direction)
{
- struct page *page = pfn_to_page(PFN_DOWN(dma_addr));
- size_t bytesleft = PAGE_SIZE - (dma_addr & (PAGE_SIZE - 1));
+ /*
+ * Flush the page from cache if necessary.
+ * On tilegx, data is delivered to hash-for-home L3; on tilepro,
+ * data is delivered direct to memory.
+ *
+ * NOTE: If we were just doing DMA_TO_DEVICE we could optimize
+ * this to be a "flush" not a "finv" and keep some of the
+ * state in cache across the DMA operation, but it doesn't seem
+ * worth creating the necessary flush_buffer_xxx() infrastructure.
+ */
+ int home = page_home(page);
+ switch (home) {
+ case PAGE_HOME_HASH:
+#ifdef __tilegx__
+ return;
+#endif
+ break;
+ case PAGE_HOME_UNCACHED:
+#ifdef __tilepro__
+ return;
+#endif
+ break;
+ case PAGE_HOME_IMMUTABLE:
+ /* Should be going to the device only. */
+ BUG_ON(direction == DMA_FROM_DEVICE ||
+ direction == DMA_BIDIRECTIONAL);
+ return;
+ case PAGE_HOME_INCOHERENT:
+ /* Incoherent anyway, so no need to work hard here. */
+ return;
+ default:
+ BUG_ON(home < 0 || home >= NR_CPUS);
+ break;
+ }
+ homecache_finv_page(page);
+
+#ifdef DEBUG_ALIGNMENT
+ /* Warn if the region isn't cacheline aligned. */
+ if (offset & (L2_CACHE_BYTES - 1) || (size & (L2_CACHE_BYTES - 1)))
+ pr_warn("Unaligned DMA to non-hfh memory: PA %#llx/%#lx\n",
+ PFN_PHYS(page_to_pfn(page)) + offset, size);
+#endif
+}
- while ((ssize_t)size > 0) {
- /* Flush the page. */
- homecache_flush_cache(page++, 0);
+/* Make the page ready to be read by the core. */
+static void __dma_complete_page(struct page *page, unsigned long offset,
+ size_t size, enum dma_data_direction direction)
+{
+#ifdef __tilegx__
+ switch (page_home(page)) {
+ case PAGE_HOME_HASH:
+ /* I/O device delivered data the way the cpu wanted it. */
+ break;
+ case PAGE_HOME_INCOHERENT:
+ /* Incoherent anyway, so no need to work hard here. */
+ break;
+ case PAGE_HOME_IMMUTABLE:
+ /* Extra read-only copies are not a problem. */
+ break;
+ default:
+ /* Flush the bogus hash-for-home I/O entries to memory. */
+ homecache_finv_map_page(page, PAGE_HOME_HASH);
+ break;
+ }
+#endif
+}
- /* Figure out if we need to continue on the next page. */
- size -= bytesleft;
- bytesleft = PAGE_SIZE;
+static void __dma_prep_pa_range(dma_addr_t dma_addr, size_t size,
+ enum dma_data_direction direction)
+{
+ struct page *page = pfn_to_page(PFN_DOWN(dma_addr));
+ unsigned long offset = dma_addr & (PAGE_SIZE - 1);
+ size_t bytes = min(size, (size_t)(PAGE_SIZE - offset));
+
+ while (size != 0) {
+ __dma_prep_page(page, offset, bytes, direction);
+ size -= bytes;
+ ++page;
+ offset = 0;
+ bytes = min((size_t)PAGE_SIZE, size);
+ }
+}
+
+static void __dma_complete_pa_range(dma_addr_t dma_addr, size_t size,
+ enum dma_data_direction direction)
+{
+ struct page *page = pfn_to_page(PFN_DOWN(dma_addr));
+ unsigned long offset = dma_addr & (PAGE_SIZE - 1);
+ size_t bytes = min(size, (size_t)(PAGE_SIZE - offset));
+
+ while (size != 0) {
+ __dma_complete_page(page, offset, bytes, direction);
+ size -= bytes;
+ ++page;
+ offset = 0;
+ bytes = min((size_t)PAGE_SIZE, size);
}
}
+
/*
* dma_map_single can be passed any memory address, and there appear
* to be no alignment constraints.
@@ -111,28 +205,29 @@ static void __dma_map_pa_range(dma_addr_t dma_addr, size_t size)
* line with some other data that has been touched in the meantime.
*/
dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size,
- enum dma_data_direction direction)
+ enum dma_data_direction direction)
{
dma_addr_t dma_addr = __pa(ptr);
BUG_ON(!valid_dma_direction(direction));
WARN_ON(size == 0);
- __dma_map_pa_range(dma_addr, size);
+ __dma_prep_pa_range(dma_addr, size, direction);
return dma_addr;
}
EXPORT_SYMBOL(dma_map_single);
void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
- enum dma_data_direction direction)
+ enum dma_data_direction direction)
{
BUG_ON(!valid_dma_direction(direction));
+ __dma_complete_pa_range(dma_addr, size, direction);
}
EXPORT_SYMBOL(dma_unmap_single);
int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
- enum dma_data_direction direction)
+ enum dma_data_direction direction)
{
struct scatterlist *sg;
int i;
@@ -143,17 +238,25 @@ int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
for_each_sg(sglist, sg, nents, i) {
sg->dma_address = sg_phys(sg);
- __dma_map_pa_range(sg->dma_address, sg->length);
+ __dma_prep_pa_range(sg->dma_address, sg->length, direction);
}
return nents;
}
EXPORT_SYMBOL(dma_map_sg);
-void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries,
- enum dma_data_direction direction)
+void dma_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents,
+ enum dma_data_direction direction)
{
+ struct scatterlist *sg;
+ int i;
+
BUG_ON(!valid_dma_direction(direction));
+ for_each_sg(sglist, sg, nents, i) {
+ sg->dma_address = sg_phys(sg);
+ __dma_complete_pa_range(sg->dma_address, sg->length,
+ direction);
+ }
}
EXPORT_SYMBOL(dma_unmap_sg);
@@ -164,16 +267,17 @@ dma_addr_t dma_map_page(struct device *dev, struct page *page,
BUG_ON(!valid_dma_direction(direction));
BUG_ON(offset + size > PAGE_SIZE);
- homecache_flush_cache(page, 0);
-
+ __dma_prep_page(page, offset, size, direction);
return page_to_pa(page) + offset;
}
EXPORT_SYMBOL(dma_map_page);
void dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
- enum dma_data_direction direction)
+ enum dma_data_direction direction)
{
BUG_ON(!valid_dma_direction(direction));
+ __dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)),
+ dma_address & PAGE_OFFSET, size, direction);
}
EXPORT_SYMBOL(dma_unmap_page);
@@ -181,33 +285,33 @@ void dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
size_t size, enum dma_data_direction direction)
{
BUG_ON(!valid_dma_direction(direction));
+ __dma_complete_pa_range(dma_handle, size, direction);
}
EXPORT_SYMBOL(dma_sync_single_for_cpu);
void dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle,
size_t size, enum dma_data_direction direction)
{
- unsigned long start = PFN_DOWN(dma_handle);
- unsigned long end = PFN_DOWN(dma_handle + size - 1);
- unsigned long i;
-
- BUG_ON(!valid_dma_direction(direction));
- for (i = start; i <= end; ++i)
- homecache_flush_cache(pfn_to_page(i), 0);
+ __dma_prep_pa_range(dma_handle, size, direction);
}
EXPORT_SYMBOL(dma_sync_single_for_device);
-void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
- enum dma_data_direction direction)
+void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
+ int nelems, enum dma_data_direction direction)
{
+ struct scatterlist *sg;
+ int i;
+
BUG_ON(!valid_dma_direction(direction));
- WARN_ON(nelems == 0 || sg[0].length == 0);
+ WARN_ON(nelems == 0 || sglist->length == 0);
+
+ for_each_sg(sglist, sg, nelems, i) {
+ dma_sync_single_for_cpu(dev, sg->dma_address,
+ sg_dma_len(sg), direction);
+ }
}
EXPORT_SYMBOL(dma_sync_sg_for_cpu);
-/*
- * Flush and invalidate cache for scatterlist.
- */
void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
int nelems, enum dma_data_direction direction)
{
@@ -242,8 +346,8 @@ void dma_sync_single_range_for_device(struct device *dev,
EXPORT_SYMBOL(dma_sync_single_range_for_device);
/*
- * dma_alloc_noncoherent() returns non-cacheable memory, so there's no
- * need to do any flushing here.
+ * dma_alloc_noncoherent() is #defined to return coherent memory,
+ * so there's no need to do any flushing here.
*/
void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
enum dma_data_direction direction)
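Finally, note that dma_unmap_sg() and dma_sync_sg_for_cpu() above are no longer no-ops, so streaming scatter-gather users must pair them with the corresponding map or sync-for-device call. A minimal, hypothetical sketch of that pairing (demo_sg_read(), "dev", "pages" and NPAGES are invented for illustration):

    #include <linux/dma-mapping.h>
    #include <linux/errno.h>
    #include <linux/mm.h>
    #include <linux/scatterlist.h>

    #define NPAGES 4        /* arbitrary buffer size for the example */

    static int demo_sg_read(struct device *dev, struct page **pages)
    {
            struct scatterlist sgl[NPAGES];
            int i, nents;

            sg_init_table(sgl, NPAGES);
            for (i = 0; i < NPAGES; i++)
                    sg_set_page(&sgl[i], pages[i], PAGE_SIZE, 0);

            /* dma_map_sg() runs __dma_prep_pa_range() on each element. */
            nents = dma_map_sg(dev, sgl, NPAGES, DMA_FROM_DEVICE);
            if (!nents)
                    return -ENOMEM;

            /* ... program the device using sg_dma_address()/sg_dma_len() ... */

            /* dma_unmap_sg() now completes each element for the cpu. */
            dma_unmap_sg(dev, sgl, NPAGES, DMA_FROM_DEVICE);
            return 0;
    }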