aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIlias Apalodimas <ilias.apalodimas@linaro.org>2017-10-11 22:03:28 +0300
committerIlias Apalodimas <ilias.apalodimas@linaro.org>2017-10-11 22:03:28 +0300
commit1446c4f775dfd242edcc1120e26baddde2a09f73 (patch)
treef36a3a330f8128d71bc394beb1a54fe2467140ba
initial import
Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
-rw-r--r--.gitignore4
-rw-r--r--Makefile8
-rw-r--r--README3
-rw-r--r--TODO4
-rw-r--r--aa7
-rw-r--r--api/vfio_api.c231
-rwxr-xr-xdrivers/a.outbin0 -> 13624 bytes
-rw-r--r--drivers/r8169-orig.c567
-rw-r--r--drivers/r8169.c312
-rw-r--r--include/drivers/r8169.h235
-rw-r--r--include/vfio_api.h11
-rw-r--r--patches/vf-netmdev.patch642
-rwxr-xr-xrun.sh52
13 files changed, 2076 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6802ab9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+*.o
+.tmp_versions/
+tags
+r8169
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..118be80
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,8 @@
+CC=gcc
+CFLAGS=-Iinclude/ -Wall -Werror -Wunused
+
+all: drivers/r8169.o api/vfio_api.o
+ #$(CC) drivers/r8169.c -o r8169 $(CFLAGS)
+ $(CC) drivers/r8169.o api/vfio_api.o -o r8169 $(CFLAGS)
+clean:
+ rm -f r8169 && rm `find -name *.o` -f
diff --git a/README b/README
new file mode 100644
index 0000000..8893e83
--- /dev/null
+++ b/README
@@ -0,0 +1,3 @@
+- apply kernel patch and recompile, should support every upstream kernel >
+ 4.10.x
+-
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..400e5cf
--- /dev/null
+++ b/TODO
@@ -0,0 +1,4 @@
+- Measure streaming vs uncached performance?
+- Prefer cached memory for streaming DMA to userspace? https://aelseb.wordpress.com/2015/04/11/contiguous-memory-on-arm-and-cache-coherency/
+- Check IOMMU existence and use vmalloc instead of kmalloc for dma_map_*()?
+- Invalidate caches? dma_sync_single_*() not used.
diff --git a/aa b/aa
new file mode 100644
index 0000000..06f6ba6
--- /dev/null
+++ b/aa
@@ -0,0 +1,7 @@
+# ff
+sudo sh -c "echo ad28d022-ae90-11e7-b712-2bdaf6e1af1c > /sys/class/net/enp4s0/device/mdev_supported_types/r8169-vfnetdev/create"
+sudo sh -c "echo enp4s0 > /sys/bus/mdev/devices/ad28d022-ae90-11e7-b712-2bdaf6e1af1c/vfnetdev/netdev"
+
+#apalos
+sudo sh -c "echo 83b8f4f2-509f-382f-3c1e-e6bfe0fa1001 > /sys/class/net/enp4s0/mdev_supported_types/net-vfnetdev/create"
+
diff --git a/api/vfio_api.c b/api/vfio_api.c
new file mode 100644
index 0000000..f6674fd
--- /dev/null
+++ b/api/vfio_api.c
@@ -0,0 +1,231 @@
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <linux/vfio.h>
+
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

/*
 * Human-readable failure strings, indexed by the VFIO ioctl request
 * number that failed.  The VFIO_* request numbers are large, so this
 * table is sparse; unlisted entries are NULL.
 */
static const char *vfio_fail_str[] = {
	[VFIO_CHECK_EXTENSION] = "Doesn't support the IOMMU driver we want",
	[VFIO_GROUP_GET_STATUS] = "Can't get status",
	[VFIO_GROUP_SET_CONTAINER] = "Failed to set container",
	/* fix: '=' was missing here (obsolete GCC-only initializer syntax) */
	[VFIO_SET_IOMMU] = "Failed to set IOMMU",
	[VFIO_IOMMU_GET_INFO] = "Failed to get IOMMU info",
	[VFIO_GROUP_GET_DEVICE_FD] = "Failed to get device FD",
	[VFIO_DEVICE_GET_INFO] = "Failed to get device info",
	[VFIO_DEVICE_GET_REGION_INFO] = "Failed to get PCI region info",
};
+
+static void vfio_print_fail(int reason)
+{
+ if (reason > ARRAY_SIZE(vfio_fail_str))
+ printf("Unknown\n");
+ else
+ printf("%s\n", vfio_fail_str[reason]);
+}
+
/*
 * Open a new VFIO container and verify the API version and Type1 IOMMU
 * support.
 *
 * Returns the container fd (>= 0) on success, a negative value on any
 * failure.  The fd must be closed by the caller.
 */
int get_container(void)
{
	int container;

	/* Create a new container */
	container = open("/dev/vfio/vfio", O_RDWR);
	if (container < 0)
		return container;

	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
		printf("Unknown API version\n");
		goto out;
	}

	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
		printf("Doesn't support the IOMMU driver we want\n");
		goto out;
	}

	return container;
out:
	close(container);
	/*
	 * Fix: the original returned 'ret' here, which on both failure
	 * paths is non-negative (the API version, or the extension check
	 * result), so callers testing '< 0' saw a bogus success.
	 */
	return -1;
}
+
/*
 * Open the VFIO group /dev/vfio/<grp_id> and verify it is viable.
 *
 * Returns the group fd (>= 0) on success, a negative value on any
 * failure.  The fd must be closed by the caller.
 */
int get_group(int grp_id)
{
	char path[64];
	int group;
	struct vfio_group_status group_status = { .argsz = sizeof(group_status) };

	snprintf(path, sizeof(path), "/dev/vfio/%d", grp_id);
	group = open(path, O_RDWR);
	if (group < 0) {
		printf("Failed to open %s, %d (%s)\n",
		       path, group, strerror(errno));
		return group;
	}

	if (ioctl(group, VFIO_GROUP_GET_STATUS, &group_status)) {
		printf("ioctl(VFIO_GROUP_GET_STATUS) failed\n");
		goto out;
	}

	/* Test the group is viable and available */
	if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
		printf("Group is not viable\n");
		goto out;
	}

	return group;
out:
	close(group);
	/*
	 * Fix: the original returned 'ret' here; on the non-viable path
	 * that ioctl had succeeded (ret == 0), so the caller received 0 —
	 * indistinguishable from a valid fd.
	 */
	return -1;
}
+
+/*
+ * @fd: container fd
+ * @sz: requested size
+ * @vaddr: virtual address
+ */
+int dma_map_type1(int fd, unsigned long sz, void **vaddr, uint64_t iova)
+{
+ int ret;
+ struct vfio_iommu_type1_dma_map dma_map;
+
+ /* Allocate some space and setup a DMA mapping */
+ *vaddr = mmap(NULL, (size_t)sz, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (*vaddr == MAP_FAILED) {
+ printf("Failed to map memory\n");
+ return -ENOMEM;
+ }
+
+ memset(&dma_map, 0, sizeof(dma_map));
+ dma_map.argsz = sizeof(dma_map);
+ dma_map.vaddr = (unsigned long)*vaddr;
+ dma_map.size = sz;
+ dma_map.iova = iova;
+ dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+ ret = ioctl(fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+ if (ret)
+ printf("Failed to map DMA memory (%s)\n", strerror(errno));
+
+ return ret;
+}
+
+int dma_unmap_type1(int fd, unsigned long sz, void *vaddr, uint64_t iova)
+{
+ int ret;
+ struct vfio_iommu_type1_dma_unmap dma_unmap;
+
+ memset(&dma_unmap, 0, sizeof(dma_unmap));
+ dma_unmap.argsz = sizeof(dma_unmap);
+ dma_unmap.size = sz;
+ dma_unmap.iova = iova;
+ ret = ioctl(fd, VFIO_IOMMU_UNMAP_DMA, &dma_unmap);
+ if (ret)
+ printf("Failed to unmap DMA memory (%s)\n", strerror(errno));
+
+ ret = munmap(vaddr, (size_t)sz);
+ if (vaddr == MAP_FAILED) {
+ printf("Failed to unmap memory\n");
+ return -ENOMEM;
+ }
+
+ return ret;
+}
+
/*
 * Bind @grp to @container, enable the Type1 IOMMU, open the device named
 * by @grp_uuid and query its info and the target region.  All the info
 * structures are caller-provided and filled in on success.
 *
 * Returns the device fd (>= 0) on success, -1 on any failure.  The
 * caller owns (and must close) the returned fd.
 */
int vfio_init_dev(int grp, int container, struct vfio_group_status *grp_status,
		  struct vfio_iommu_type1_info *iommu_info,
		  struct vfio_device_info *dev_info,
		  struct vfio_region_info *reg_info, char *grp_uuid)
{
	int ret;
	/* fix: was uninitialized — early 'goto out' returned stack garbage */
	int device = -1;

	ret = ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
	if (!ret) {
		vfio_print_fail(VFIO_CHECK_EXTENSION);
		goto out;
	}

	/* Test the group is viable and available */
	ret = ioctl(grp, VFIO_GROUP_GET_STATUS, grp_status);
	if (ret || !(grp_status->flags & VFIO_GROUP_FLAGS_VIABLE)) {
		vfio_print_fail(VFIO_GROUP_GET_STATUS);
		goto out;
	}

	ret = ioctl(grp, VFIO_GROUP_SET_CONTAINER, &container);
	if (ret) {
		/* fix: dropped a duplicate printf of the same message */
		vfio_print_fail(VFIO_GROUP_SET_CONTAINER);
		goto out;
	}

	ret = ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
	if (ret) {
		vfio_print_fail(VFIO_SET_IOMMU);
		goto out;
	}

	ret = ioctl(container, VFIO_IOMMU_GET_INFO, iommu_info);
	if (ret) {
		vfio_print_fail(VFIO_IOMMU_GET_INFO);
		goto out;
	}

	printf("iova_pgsizes bitmask=0x%llx\n", iommu_info->iova_pgsizes);

	/* Get a file descriptor for the device */
	device = ioctl(grp, VFIO_GROUP_GET_DEVICE_FD, grp_uuid);
	printf("device=%d\n", device);
	if (device < 0) {
		vfio_print_fail(VFIO_GROUP_GET_DEVICE_FD);
		device = -1;
		goto out;
	}

	/* Query the device: number of regions and irqs it exposes */
	ret = ioctl(device, VFIO_DEVICE_GET_INFO, dev_info);
	if (ret) {
		vfio_print_fail(VFIO_DEVICE_GET_INFO);
		goto out_close;
	}

	printf("regions=%d irqs=%d\n", dev_info->num_regions, dev_info->num_irqs);

	/*
	 * NOTE(review): index one past the standard PCI regions —
	 * presumably a device-specific region exposed by the patched
	 * vf-netmdev driver; confirm against the kernel patch.
	 */
	reg_info->index = VFIO_PCI_NUM_REGIONS + 1;
	ret = ioctl(device, VFIO_DEVICE_GET_REGION_INFO, reg_info);
	if (ret) {
		vfio_print_fail(VFIO_DEVICE_GET_REGION_INFO);
		goto out_close;
	}

	return device;

out_close:
	/* fix: don't leak an open device fd on late failures */
	close(device);
	device = -1;
out:
	return device;
}
diff --git a/drivers/a.out b/drivers/a.out
new file mode 100755
index 0000000..24991fb
--- /dev/null
+++ b/drivers/a.out
Binary files differ
diff --git a/drivers/r8169-orig.c b/drivers/r8169-orig.c
new file mode 100644
index 0000000..0aa0b7f
--- /dev/null
+++ b/drivers/r8169-orig.c
@@ -0,0 +1,567 @@
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <malloc.h>
+#include <string.h>
+#include <arpa/inet.h>
+#include <linux/vfio.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <linux/types.h>
+#include <endian.h>
+
+#define COMPILER_BARRIER() asm volatile("" ::: "memory")
+#define MEMORY_BARRIER() asm volatile ("mfence" ::: "memory")
+#define STORE_BARRIER() asm volatile ("sfence" ::: "memory")
+#define LOAD_BARRIER() asm volatile ("lfence" ::: "memory")
+#define dma_wmb() STORE_BARRIER()
+#define dma_rmb() LOAD_BARRIER()
+#define unlikely(x) (x)
+
+#define cpu_to_le32(x) htole32(x)
+#define cpu_to_le64(x) htole64(x)
+#define le32_to_cpu(x) le32toh(x)
+
+typedef unsigned long long u64;
+typedef unsigned short u16;
+
+
+
+
+enum rtl_register_content {
+ /* InterruptStatusBits */
+ SYSErr = 0x8000,
+ PCSTimeout = 0x4000,
+ SWInt = 0x0100,
+ TxDescUnavail = 0x0080,
+ RxFIFOOver = 0x0040,
+ LinkChg = 0x0020,
+ RxOverflow = 0x0010,
+ TxErr = 0x0008,
+ TxOK = 0x0004,
+ RxErr = 0x0002,
+ RxOK = 0x0001,
+
+ /* RxStatusDesc */
+ RxBOVF = (1 << 24),
+ RxFOVF = (1 << 23),
+ RxRWT = (1 << 22),
+ RxRES = (1 << 21),
+ RxRUNT = (1 << 20),
+ RxCRC = (1 << 19),
+
+ /* ChipCmdBits */
+ StopReq = 0x80,
+ CmdReset = 0x10,
+ CmdRxEnb = 0x08,
+ CmdTxEnb = 0x04,
+ RxBufEmpty = 0x01,
+
+ /* TXPoll register p.5 */
+ HPQ = 0x80, /* Poll cmd on the high prio queue */
+ NPQ = 0x40, /* Poll cmd on the low prio queue */
+ FSWInt = 0x01, /* Forced software interrupt */
+
+ /* Cfg9346Bits */
+ Cfg9346_Lock = 0x00,
+ Cfg9346_Unlock = 0xc0,
+
+ /* rx_mode_bits */
+ AcceptErr = 0x20,
+ AcceptRunt = 0x10,
+ AcceptBroadcast = 0x08,
+ AcceptMulticast = 0x04,
+ AcceptMyPhys = 0x02,
+ AcceptAllPhys = 0x01,
+#define RX_CONFIG_ACCEPT_MASK 0x3f
+
+ /* TxConfigBits */
+ TxInterFrameGapShift = 24,
+ TxDMAShift = 8, /* DMA burst value (0-7) is shift this many bits */
+
+ /* Config1 register p.24 */
+ LEDS1 = (1 << 7),
+ LEDS0 = (1 << 6),
+ Speed_down = (1 << 4),
+ MEMMAP = (1 << 3),
+ IOMAP = (1 << 2),
+ VPD = (1 << 1),
+ PMEnable = (1 << 0), /* Power Management Enable */
+
+ /* Config2 register p. 25 */
+ ClkReqEn = (1 << 7), /* Clock Request Enable */
+ MSIEnable = (1 << 5), /* 8169 only. Reserved in the 8168. */
+ PCI_Clock_66MHz = 0x01,
+ PCI_Clock_33MHz = 0x00,
+
+ /* Config3 register p.25 */
+ MagicPacket = (1 << 5), /* Wake up when receives a Magic Packet */
+ LinkUp = (1 << 4), /* Wake up when the cable connection is re-established */
+ Jumbo_En0 = (1 << 2), /* 8168 only. Reserved in the 8168b */
+ Rdy_to_L23 = (1 << 1), /* L23 Enable */
+ Beacon_en = (1 << 0), /* 8168 only. Reserved in the 8168b */
+
+ /* Config4 register */
+ Jumbo_En1 = (1 << 1), /* 8168 only. Reserved in the 8168b */
+
+ /* Config5 register p.27 */
+ BWF = (1 << 6), /* Accept Broadcast wakeup frame */
+ MWF = (1 << 5), /* Accept Multicast wakeup frame */
+ UWF = (1 << 4), /* Accept Unicast wakeup frame */
+ Spi_en = (1 << 3),
+ LanWake = (1 << 1), /* LanWake enable/disable */
+ PMEStatus = (1 << 0), /* PME status can be reset by PCI RST# */
+ ASPM_en = (1 << 0), /* ASPM enable */
+
+ /* TBICSR p.28 */
+ TBIReset = 0x80000000,
+ TBILoopback = 0x40000000,
+ TBINwEnable = 0x20000000,
+ TBINwRestart = 0x10000000,
+ TBILinkOk = 0x02000000,
+ TBINwComplete = 0x01000000,
+
+ /* CPlusCmd p.31 */
+ EnableBist = (1 << 15), // 8168 8101
+ Mac_dbgo_oe = (1 << 14), // 8168 8101
+ Normal_mode = (1 << 13), // unused
+ Force_half_dup = (1 << 12), // 8168 8101
+ Force_rxflow_en = (1 << 11), // 8168 8101
+ Force_txflow_en = (1 << 10), // 8168 8101
+ Cxpl_dbg_sel = (1 << 9), // 8168 8101
+ ASF = (1 << 8), // 8168 8101
+ PktCntrDisable = (1 << 7), // 8168 8101
+ Mac_dbgo_sel = 0x001c, // 8168
+ RxVlan = (1 << 6),
+ RxChkSum = (1 << 5),
+ PCIDAC = (1 << 4),
+ PCIMulRW = (1 << 3),
+ INTT_0 = 0x0000, // 8168
+ INTT_1 = 0x0001, // 8168
+ INTT_2 = 0x0002, // 8168
+ INTT_3 = 0x0003, // 8168
+
+ /* rtl8169_PHYstatus */
+ TBI_Enable = 0x80,
+ TxFlowCtrl = 0x40,
+ RxFlowCtrl = 0x20,
+ _1000bpsF = 0x10,
+ _100bps = 0x08,
+ _10bps = 0x04,
+ LinkStatus = 0x02,
+ FullDup = 0x01,
+
+ /* _TBICSRBit */
+ TBILinkOK = 0x02000000,
+
+ /* ResetCounterCommand */
+ CounterReset = 0x1,
+
+ /* DumpCounterCommand */
+ CounterDump = 0x8,
+
+ /* magic enable v2 */
+ MagicPacket_v2 = (1 << 16), /* Wake up when receives a Magic Packet */
+};
+
+enum rtl_desc_bit {
+ /* First doubleword. */
+ DescOwn = (1 << 31), /* Descriptor is owned by NIC */
+ RingEnd = (1 << 30), /* End of descriptor ring */
+ FirstFrag = (1 << 29), /* First segment of a packet */
+ LastFrag = (1 << 28), /* Final segment of a packet */
+};
+
+/* Generic case. */
+enum rtl_tx_desc_bit {
+ /* First doubleword. */
+ TD_LSO = (1 << 27), /* Large Send Offload */
+#define TD_MSS_MAX 0x07ffu /* MSS value */
+
+ /* Second doubleword. */
+ TxVlanTag = (1 << 17), /* Add VLAN tag */
+};
+
+/* 8169, 8168b and 810x except 8102e. */
+enum rtl_tx_desc_bit_0 {
+ /* First doubleword. */
+#define TD0_MSS_SHIFT 16 /* MSS position (11 bits) */
+ TD0_TCP_CS = (1 << 16), /* Calculate TCP/IP checksum */
+ TD0_UDP_CS = (1 << 17), /* Calculate UDP/IP checksum */
+ TD0_IP_CS = (1 << 18), /* Calculate IP checksum */
+};
+
+/* 8102e, 8168c and beyond. */
+enum rtl_tx_desc_bit_1 {
+ /* First doubleword. */
+ TD1_GTSENV4 = (1 << 26), /* Giant Send for IPv4 */
+ TD1_GTSENV6 = (1 << 25), /* Giant Send for IPv6 */
+#define GTTCPHO_SHIFT 18
+#define GTTCPHO_MAX 0x7fU
+
+ /* Second doubleword. */
+#define TCPHO_SHIFT 18
+#define TCPHO_MAX 0x3ffU
+#define TD1_MSS_SHIFT 18 /* MSS position (11 bits) */
+ TD1_IPv6_CS = (1 << 28), /* Calculate IPv6 checksum */
+ TD1_IPv4_CS = (1 << 29), /* Calculate IPv4 checksum */
+ TD1_TCP_CS = (1 << 30), /* Calculate TCP/IP checksum */
+ TD1_UDP_CS = (1 << 31), /* Calculate UDP/IP checksum */
+};
+
+enum rtl_rx_desc_bit {
+ /* Rx private */
+ PID1 = (1 << 18), /* Protocol ID bit 1/2 */
+ PID0 = (1 << 17), /* Protocol ID bit 0/2 */
+
+#define RxProtoUDP (PID1)
+#define RxProtoTCP (PID0)
+#define RxProtoIP (PID1 | PID0)
+#define RxProtoMask RxProtoIP
+
+ IPFail = (1 << 16), /* IP checksum failed */
+ UDPFail = (1 << 15), /* UDP/IP checksum failed */
+ TCPFail = (1 << 14), /* TCP/IP checksum failed */
+ RxVlanTag = (1 << 16), /* VLAN tag available */
+};
+
+#define RsvdMask 0x3fffc000
+
+/* DMA descriptor layouts shared with the NIC; all fields little-endian. */
+struct TxDesc {
+	__le32 opts1;
+	__le32 opts2;
+	__le64 addr;
+};
+
+struct RxDesc {
+	__le32 opts1;
+	__le32 opts2;
+	__le64 addr;
+};
+
+char* buffers[256];
+typedef unsigned long dma_addr_t;
+typedef unsigned int u32;
+
+/*
+ * Dump one received frame: bytes 6-11 then 0-5 as MAC addresses
+ * (source -> destination in standard Ethernet framing — verify), the
+ * big-endian ethertype at offset 12, then bytes 14-31 in hex.
+ * NOTE(review): declared int but never returns a value (trips
+ * -Wreturn-type); the (u16 *) cast is an unaligned, strict-aliasing
+ * hostile access — fine on x86, confirm on other targets.
+ */
+int print_packet(unsigned char* buffer)
+{
+	int i;
+	//unsigned int* b = (unsigned int*)buffer;
+	printf("%02x:%02x:%02x:%02x:%02x:%02x -> %02x:%02x:%02x:%02x:%02x:%02x [%04x]: ",
+		buffer[6], buffer[7], buffer[8], buffer[9], buffer[10], buffer[11],
+		buffer[0], buffer[1], buffer[2], buffer[3], buffer[4], buffer[5],
+		be16toh(*((u16*)(&buffer[12])))
+	);
+	for (i = 14; i < 32; i++) {
+		printf("%02x", buffer[i]);
+	}
+}
+
+
+/* One VFIO DMA mapping: user virtual address, device IOVA, byte length. */
+typedef struct iomem {
+	u64 vaddr;
+	u64 iova;
+	u64 size;
+} iomem;
+
+#define IOMEM_CHUNKS 4096
+/* Bookkeeping for iomem_alloc(): slots carved sequentially out of the
+ * single virtual window reserved by iomem_init() at iomem_base. */
+iomem iomemArray[IOMEM_CHUNKS];
+int iomem_count;
+u64 iomem_base;
+u64 iomem_current;
+
+/*
+ * Reserve (but do not populate) a 4GB contiguous virtual window used as
+ * the arena for iomem_alloc().  Returns -1 on failure.
+ * NOTE(review): mmap() reports failure with MAP_FAILED, not NULL — the
+ * NULL check below can never fire.
+ * NOTE(review): falls off the end without returning a value on success;
+ * callers must not use the result (fixed in drivers/r8169.c).
+ */
+int iomem_init(void)
+{
+	void* tmp;
+	iomem_count = 0;
+	memset(iomemArray, 0, sizeof(iomemArray));
+	iomem_base = 1 * 1024ULL * 1024ULL * 1024ULL * 1024ULL;
+
+	iomem_current = iomem_base;
+
+	/* reserve a 4GB contiguous address space and position it, if possible at 8GB */
+	/* no pages are actually allocated and mapped into this address space */
+	/* it is just making sure that overtime, we'll have 4GB contiguous */
+	tmp = mmap(
+		(void*)iomem_base,
+		4 * 1024ULL * 1024ULL * 1024ULL,
+		PROT_READ | PROT_WRITE,
+		MAP_SHARED | MAP_ANONYMOUS | MAP_NORESERVE,
+		-1, 0
+	);
+	if (tmp == NULL) {
+		printf("Could not reserve a contiguous 4GB address space\n");
+		return -1;
+	}
+	iomem_base = (u64)tmp;
+	iomem_current = iomem_base;
+}
+
+/*
+ * Carve @size bytes (must be 4K-aligned and < 32MB) out of the reserved
+ * window and DMA-map them through VFIO.  Returns a slot describing the
+ * mapping, or NULL on any failure.
+ * NOTE(review): mmap() failure is MAP_FAILED, not NULL — check below
+ * never fires.
+ * NOTE(review): dma_map.iova is left unset before the ioctl; the comment
+ * below says the kernel fills it in, which is NOT standard VFIO Type1
+ * semantics — depends on the vf-netmdev kernel patch, confirm.
+ */
+struct iomem* iomem_alloc(int device, unsigned int size)
+{
+	void* tmp;
+	int ret;
+	u64 location;
+	struct vfio_iommu_type1_dma_map dma_map = { .argsz = sizeof(dma_map) };
+
+	if (size >= 32 * 1024 * 1024) return NULL;
+	if ((size & 0xFFF) != 0) return NULL; /* size should be a 4K aligned quantity */
+	if (iomem_count >= IOMEM_CHUNKS) return NULL;
+
+	/* get a portion of the 4GB window created at init time */
+	tmp = mmap(
+		(void*)iomem_current,
+		size,
+		PROT_READ | PROT_WRITE,
+		MAP_SHARED | MAP_ANONYMOUS | MAP_NORESERVE | MAP_FIXED,
+		-1, 0
+	);
+	if (tmp == NULL) return NULL;
+	if (iomem_base == 0) {
+		iomem_base = (u64)tmp;
+		iomem_current = iomem_base + size;
+		location = iomem_base;
+	}
+	else {
+		location = iomem_current;
+		iomem_current += size;
+	}
+
+	iomemArray[iomem_count].vaddr = location;
+	iomemArray[iomem_count].size = size;
+
+	dma_map.vaddr = iomemArray[iomem_count].vaddr;
+	dma_map.size = iomemArray[iomem_count].size;
+
+	ret = ioctl(device, VFIO_IOMMU_MAP_DMA, &dma_map);
+	if (ret != 0) return NULL;
+	/* the kernel has filled dma_map.iova with the corresponding allocated IOVA */
+	iomemArray[iomem_count].iova = dma_map.iova;
+
+	printf("iomem_alloc: VA(%llx) -> physmem(%dKB) <- IOVA(%llx)\n",
+		iomemArray[iomem_count].vaddr, size/1024, iomemArray[iomem_count].iova
+	);
+
+	return &iomemArray[iomem_count++];
+}
+
+
+/*
+ * Hand an RX descriptor back to the NIC: preserve the RingEnd bit, set
+ * DescOwn and the buffer size in one little-endian store.
+ */
+static inline void rtl8169_mark_to_asic(struct RxDesc *desc, u32 rx_buf_sz)
+{
+	u32 eor = le32_to_cpu(desc->opts1) & RingEnd;
+
+	/* Force memory writes to complete before releasing descriptor */
+	dma_wmb();
+
+	desc->opts1 = cpu_to_le32(DescOwn | eor | rx_buf_sz);
+}
+
+/* Point an RX descriptor at a DMA buffer, then give it to the NIC. */
+static inline void rtl8169_map_to_asic(struct RxDesc *desc, dma_addr_t mapping,
+				       u32 rx_buf_sz)
+{
+	desc->addr = cpu_to_le64(mapping);
+	rtl8169_mark_to_asic(desc, rx_buf_sz);
+}
+
+char* rxBuffers[256];
+
+/* Set the RingEnd marker on the final descriptor of the RX ring. */
+static inline void rtl8169_mark_as_last_descriptor(struct RxDesc *desc)
+{
+	desc->opts1 |= cpu_to_le32(RingEnd);
+}
+
+/*
+ * Allocate one 2MB DMA-mapped area, carve it into 256 x 2048-byte
+ * buffers, attach one to each RX descriptor and mark descriptor 255 as
+ * end-of-ring.  Returns 0 on success, -1 if allocation fails.
+ */
+int rtl8169_rx_fill(int device, struct RxDesc* rxRing)
+{
+	int i;
+	struct iomem* packetArea;
+
+	packetArea = iomem_alloc(device, 2 * 1024 * 1024);
+	if (packetArea == NULL) return -1;
+
+	for (i = 0; i < 256; i++)
+	{
+		rtl8169_map_to_asic(&rxRing[i], packetArea->iova + i * 2048, 2048);
+		rxBuffers[i] = (char*)(packetArea->vaddr + i * 2048);
+	}
+	rtl8169_mark_as_last_descriptor(&rxRing[255]);
+	return 0;
+}
+
+/*
+ * Standalone VFIO userspace RX proof-of-concept for the r8169.
+ * argv[1] = /dev/vfio/<group> path, argv[2] = device name/uuid passed to
+ * VFIO_GROUP_GET_DEVICE_FD.  Sets up container/group/IOMMU, mmaps the
+ * device-provided descriptor region, fills the RX ring and then polls
+ * descriptors forever, printing each received frame.
+ * NOTE(review): argv is used without an argc check; most ioctl return
+ * values are ignored; 'i' is printed uninitialized in the "Region:"
+ * printf; the region mmap() result is not checked against MAP_FAILED;
+ * 'parent' is unused.  Kept as-is — this is the original reference
+ * version of drivers/r8169.c.
+ */
+int main(int argc, char* argv[])
+{
+	int container, group, parent, device, i;
+
+	struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
+	struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
+	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
+	struct vfio_region_info region_info = { .argsz = sizeof(region_info) };
+	struct RxDesc* rxRing;
+
+	iomem_init();
+
+	/* Create a new container */
+	container = open("/dev/vfio/vfio", O_RDWR);
+
+	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
+		printf("Unknown API version\n");
+		return -1;
+	}
+
+	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
+		printf("Doesn't support the IOMMU driver we want\n");
+		return -1;
+	}
+
+	group = open(argv[1], O_RDWR);
+	/* Test the group is viable and available */
+	ioctl(group, VFIO_GROUP_GET_STATUS, &group_status);
+
+	if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+		printf(" Group is not viable\n");
+		return -1;
+	}
+
+	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
+
+	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
+
+	ioctl(container, VFIO_IOMMU_GET_INFO, &iommu_info);
+
+	printf("iova_pgsizes bitmask=0x%llx\n", iommu_info.iova_pgsizes);
+	/* Get a file descriptor for the device */
+	printf("AAAA %s\n", argv[2]);
+	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, argv[2]);
+
+	printf("device=%d\n", device);
+	if (device <= 0) return -3;
+
+	/* Test and setup the device */
+	ioctl(device, VFIO_DEVICE_GET_INFO, &device_info);
+
+	printf("regions=%d irqs=%d\n", device_info.num_regions, device_info.num_irqs);
+
+	/* Test and setup the device */
+	ioctl(device, VFIO_DEVICE_GET_INFO, &device_info);
+
+	region_info.index = VFIO_PCI_NUM_REGIONS + 1;
+	if (ioctl(device, VFIO_DEVICE_GET_REGION_INFO, &region_info))
+		return -1;
+	if (!region_info.size) {
+		//printf("Region:%d unimplemented PCI BAR\n", i);
+		return -2;
+	}
+
+	printf("Region:%d size %llu, offset 0x%llx, flags 0x%x\n", i,
+		region_info.size,
+		region_info.offset, region_info.flags);
+
+	rxRing = mmap(NULL, region_info.size, PROT_READ | PROT_WRITE,
+		      MAP_SHARED, device, region_info.offset);
+
+	if (rtl8169_rx_fill(device, rxRing) != 0) {
+		printf("Could not fill ring\n");
+		return -1;
+	}
+
+	/* signal ready */
+	ioctl(device, 500, NULL);
+
+	i = 0;
+	while (1)
+	{
+		if (i >= 256) i = 0;
+		for (; i < 256; i++)
+		{
+			u32 status;
+
+			status = le32_to_cpu(rxRing[i].opts1) & ~0; /// either ~(RxBOVF | RxFOVF) or ~0;
+
+			if (status & DescOwn) {
+				usleep(100*1000);
+				break;
+			}
+
+			/* This barrier is needed to keep us from reading
+			 * any other fields out of the Rx descriptor until
+			 * we know the status of DescOwn
+			 */
+			dma_rmb();
+
+			if (unlikely(status & RxRES)) {
+				printf("Rx ERROR. status = %08x\n",status);
+				/*
+				dev->stats.rx_errors++;
+				if (status & (RxRWT | RxRUNT))
+					dev->stats.rx_length_errors++;
+				if (status & RxCRC)
+					dev->stats.rx_crc_errors++;
+				if (status & RxFOVF) {
+					rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING);
+					dev->stats.rx_fifo_errors++;
+				}
+				*/
+				if ((status & (RxRUNT | RxCRC)) &&
+				    !(status & (RxRWT | RxFOVF))
+				    /* && (dev->features & NETIF_F_RXALL) */
+				    )
+					goto process_pkt;
+			}
+			else {
+				//dma_addr_t addr;
+				int pkt_size;
+
+			process_pkt:
+				//addr = le64_to_cpu(rxRing[i].addr);
+				if (1) // likely(!(dev->features & NETIF_F_RXFCS)))
+					pkt_size = (status & 0x00003fff) - 4;
+				else
+					pkt_size = status & 0x00003fff;
+
+				/*
+				 * The driver does not support incoming fragmented
+				 * frames. They are seen as a symptom of over-mtu
+				 * sized frames.
+				 */
+				/*
+				if (unlikely(rtl8169_fragmented_frame(status))) {
+					dev->stats.rx_dropped++;
+					dev->stats.rx_length_errors++;
+					goto release_descriptor;
+				}
+
+				skb = rtl8169_try_rx_copy(tp->Rx_databuff[entry],
+							  tp, pkt_size, addr);
+				if (!skb) {
+					dev->stats.rx_dropped++;
+					goto release_descriptor;
+				}
+
+				rtl8169_rx_csum(skb, status);
+				skb_put(skb, pkt_size);
+				skb->protocol = eth_type_trans(skb, dev);
+
+				rtl8169_rx_vlan_tag(desc, skb);
+
+				if (skb->pkt_type == PACKET_MULTICAST)
+					dev->stats.multicast++;
+
+				napi_gro_receive(&tp->napi, skb);
+
+				u64_stats_update_begin(&tp->rx_stats.syncp);
+				tp->rx_stats.packets++;
+				tp->rx_stats.bytes += pkt_size;
+				u64_stats_update_end(&tp->rx_stats.syncp);
+				*/
+				printf("desc[%03d]: size=%5d ", i, pkt_size);
+				print_packet(rxBuffers[i]);
+				printf("\n");
+			}
+		release_descriptor:
+			rxRing[i].opts2 = 0;
+			rtl8169_mark_to_asic(&rxRing[i], 2048);
+		}
+
+	}
+
+
+}
diff --git a/drivers/r8169.c b/drivers/r8169.c
new file mode 100644
index 0000000..64b1d3a
--- /dev/null
+++ b/drivers/r8169.c
@@ -0,0 +1,312 @@
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <malloc.h>
+#include <string.h>
+#include <arpa/inet.h>
+#include <linux/vfio.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <endian.h>
+#include <stdlib.h>
+//#include <linux/types.h>
+
+/* Us */
+#include <drivers/r8169.h>
+#include <vfio_api.h>
+
+typedef unsigned long dma_addr_t;
+char *rxBuffers[256];
+
/*
 * Pretty-print the start of one received frame: bytes 6-11 and then 0-5
 * as colon-separated MAC addresses (source -> destination in standard
 * Ethernet framing — verify against the NIC's buffer layout), the
 * big-endian 16-bit field at offset 12 (ethertype), and bytes 14-31 of
 * the frame in hex.  The buffer must hold at least 32 bytes.
 */
static void print_packet(unsigned char *buffer)
{
	const unsigned char *a = &buffer[6];
	const unsigned char *b = &buffer[0];
	unsigned int proto = be16toh(*((__u16 *)(&buffer[12])));
	int off;

	printf("%02x:%02x:%02x:%02x:%02x:%02x -> %02x:%02x:%02x:%02x:%02x:%02x [%04x]:",
	       a[0], a[1], a[2], a[3], a[4], a[5],
	       b[0], b[1], b[2], b[3], b[4], b[5],
	       proto);

	for (off = 14; off < 32; off++)
		printf("%02x", buffer[off]);
}
+
+#define IOMEM_CHUNKS 4096
+iomem iomem_array[IOMEM_CHUNKS];
+int iomem_count;
+__u64 iomem_base;
+__u64 iomem_current;
+
+int iomem_init(void)
+{
+ void *tmp;
+ iomem_count = 0;
+ memset(iomem_array, 0, sizeof(iomem_array));
+ iomem_base = 1 * 1024ULL * 1024ULL * 1024ULL * 1024ULL;
+
+ iomem_current = iomem_base;
+ /* reserve a 4GB contiguous address space and position it, if possible at 8GB */
+ /* no pages are actually allocated and mapped into this address space */
+ /* it is just making sure that overtime, we'll have 4GB contiguous */
+ tmp = mmap((void*)iomem_base, 4 * 1024ULL * 1024ULL * 1024ULL,
+ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS |
+ MAP_NORESERVE, -1, 0);
+ if (tmp == MAP_FAILED) {
+ printf("Could not reserve a contiguous 4GB address space\n");
+ return -1;
+ }
+ iomem_base = (__u64)tmp;
+ iomem_current = iomem_base;
+
+ return 0;
+}
+
+/*
+ * Carve @size bytes (must be 4K-aligned and < 32MB) out of the window
+ * reserved by iomem_init() and DMA-map them through VFIO.  Returns a
+ * slot describing the mapping, or NULL on any failure.
+ * NOTE(review): dma_map.iova is left unset before the ioctl; the comment
+ * below says the kernel fills it in, which is NOT standard VFIO Type1
+ * semantics — depends on the vf-netmdev kernel patch, confirm.
+ * NOTE(review): the iomem_base == 0 branch looks unreachable —
+ * iomem_init() always leaves iomem_base nonzero; verify and simplify.
+ */
+struct iomem *iomem_alloc(int device, unsigned int size)
+{
+	void *tmp;
+	int ret;
+	__u64 location;
+	struct vfio_iommu_type1_dma_map dma_map = { .argsz = sizeof(dma_map) };
+
+	if (size >= 32 * 1024 * 1024)
+		return NULL;
+	if ((size & 0xFFF) != 0)
+		return NULL; /* size should be a 4K aligned quantity */
+	if (iomem_count >= IOMEM_CHUNKS)
+		return NULL;
+
+	/* get a portion of the 4GB window created at init time */
+	tmp = mmap((void*)iomem_current, size, PROT_READ | PROT_WRITE,
+		   MAP_SHARED | MAP_ANONYMOUS | MAP_NORESERVE | MAP_FIXED, -1,
+		   0);
+	if (tmp == MAP_FAILED)
+		return NULL;
+	if (iomem_base == 0) {
+		iomem_base = (__u64)tmp;
+		iomem_current = iomem_base + size;
+		location = iomem_base;
+	} else {
+		location = iomem_current;
+		iomem_current += size;
+	}
+
+	iomem_array[iomem_count].vaddr = location;
+	iomem_array[iomem_count].size = size;
+
+	dma_map.vaddr = iomem_array[iomem_count].vaddr;
+	dma_map.size = iomem_array[iomem_count].size;
+
+	/* kernel driver fills dma_map.iova with the proper allocated IOVA */
+	ret = ioctl(device, VFIO_IOMMU_MAP_DMA, &dma_map);
+	if (ret != 0)
+		return NULL;
+	iomem_array[iomem_count].iova = dma_map.iova;
+
+	printf("iomem_alloc: VA(%llx) -> physmem(%dKB) <- IOVA(%llx)\n",
+	       iomem_array[iomem_count].vaddr, size/1024,
+	       iomem_array[iomem_count].iova);
+
+	return &iomem_array[iomem_count++];
+}
+
+/*
+ * Hand an RX descriptor back to the NIC: preserve the RingEnd bit, set
+ * DescOwn and the buffer size in one little-endian store.
+ */
+static inline void rtl8169_mark_to_asic(struct RxDesc *desc, __u32 rx_buf_sz)
+{
+	__u32 eor = le32_to_cpu(desc->opts1) & RingEnd;
+
+	/* Force memory writes to complete before releasing descriptor */
+	dma_wmb();
+
+	desc->opts1 = cpu_to_le32(DescOwn | eor | rx_buf_sz);
+}
+
+/* Point an RX descriptor at a DMA buffer, then give it to the NIC. */
+static inline void rtl8169_map_to_asic(struct RxDesc *desc, dma_addr_t mapping,
+				       __u32 rx_buf_sz)
+{
+	desc->addr = cpu_to_le64(mapping);
+	rtl8169_mark_to_asic(desc, rx_buf_sz);
+}
+
+
+/* Set the RingEnd marker on the final descriptor of the RX ring. */
+static inline void rtl8169_mark_as_last_descriptor(struct RxDesc *desc)
+{
+	desc->opts1 |= cpu_to_le32(RingEnd);
+}
+
+int rtl8169_rx_fill(int device, struct RxDesc* rxRing)
+{
+ int i;
+ struct iomem *packetArea;
+
+ packetArea = iomem_alloc(device, 2 * 1024 * 1024);
+ if (packetArea == NULL)
+ return -1;
+
+ for (i = 0; i < NUM_RX_DESC; i++) {
+ rtl8169_map_to_asic(&rxRing[i], packetArea->iova + i * 2048,
+ 2048);
+ rxBuffers[i] = (char*)(packetArea->vaddr + i * 2048);
+ }
+ rtl8169_mark_as_last_descriptor(&rxRing[255]);
+
+ return 0;
+}
+
/*
 * Print invocation help.  The program requires both the VFIO group id
 * and the mdev device uuid (main() enforces argc == 3 and passes
 * argv[2] as the uuid), so list both arguments.
 */
void usage(char *name)
{
	printf("usage: %s <group id> <device uuid>\n", name);
}
+
+/*
+ * VFIO userspace RX driver for the r8169 via the vfio_api helpers.
+ * argv[1] = VFIO group id (numeric), argv[2] = mdev device uuid.
+ * Sets up container/group/device, mmaps the device-exposed descriptor
+ * region, fills the RX ring, then polls descriptors forever and prints
+ * each received frame.
+ * NOTE(review): 'group' is read uninitialized at the out: label when
+ * get_container() fails (close() on garbage).
+ * NOTE(review): 'device' from vfio_init_dev() is not checked for < 0
+ * before being used in mmap().
+ * NOTE(review): the mmap error message below is copy-pasted from
+ * iomem_init() and does not describe this failure.
+ * NOTE(review): strncpy() may leave group_uuid unterminated when
+ * argv[2] is >= 128 chars.
+ */
+int main(int argc, char* argv[])
+{
+	int container, group, device, i = 0;
+	int group_id;
+	char group_uuid[128];
+	struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
+	struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
+	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
+	struct vfio_region_info region_info = { .argsz = sizeof(region_info) };
+	struct RxDesc* rxRing;
+
+	if (argc != 3) {
+		usage(argv[0]);
+		return -1;
+	}
+	iomem_init();
+
+	/* Create a new container */
+	container = get_container();
+
+	if (container < 0)
+		goto out;
+
+	group_id = atoi(argv[1]);
+	group = get_group(group_id);
+	if (group < 0)
+		goto out;
+
+	strncpy(group_uuid, argv[2], sizeof(group_uuid));
+	device = vfio_init_dev(group, container, &group_status, &iommu_info,
+			       &device_info, &region_info, group_uuid);
+
+	printf("Region:%d size %llu, offset 0x%llx, flags 0x%x\n", i,
+	       region_info.size, region_info.offset, region_info.flags);
+
+	rxRing = mmap(NULL, region_info.size, PROT_READ | PROT_WRITE,
+		      MAP_SHARED, device, region_info.offset);
+	if (rxRing == MAP_FAILED) {
+		printf("Could not reserve on contiguous 4GB address space\n");
+		return -1;
+	}
+
+	if (rtl8169_rx_fill(device, rxRing) != 0) {
+		printf("Could not fill ring\n");
+		return -1;
+	}
+
+	/* signal ready */
+	ioctl(device, 500, NULL);
+	i = 0;
+	while (1)
+	{
+		if (i >= NUM_RX_DESC)
+			i = 0;
+		for (; i < NUM_RX_DESC; i++)
+		{
+			__u32 status;
+
+			status = le32_to_cpu(rxRing[i].opts1) & ~0; /// either ~(RxBOVF | RxFOVF) or ~0;
+
+			if (status & DescOwn) {
+				usleep(100*1000);
+				break;
+			}
+
+			/* This barrier is needed to keep us from reading
+			 * any other fields out of the Rx descriptor until
+			 * we know the status of DescOwn
+			 */
+			dma_rmb();
+
+			if (unlikely(status & RxRES)) {
+				printf("Rx ERROR. status = %08x\n",status);
+				/*
+				dev->stats.rx_errors++;
+				if (status & (RxRWT | RxRUNT))
+					dev->stats.rx_length_errors++;
+				if (status & RxCRC)
+					dev->stats.rx_crc_errors++;
+				if (status & RxFOVF) {
+					rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING);
+					dev->stats.rx_fifo_errors++;
+				}
+				*/
+				if ((status & (RxRUNT | RxCRC)) &&
+				    !(status & (RxRWT | RxFOVF))
+				    /* && (dev->features & NETIF_F_RXALL) */
+				    )
+					goto process_pkt;
+			}
+			else {
+				//dma_addr_t addr;
+				int pkt_size;
+			process_pkt:
+				//addr = le64_to_cpu(rxRing[i].addr);
+				if (1) // likely(!(dev->features & NETIF_F_RXFCS)))
+					pkt_size = (status & 0x00003fff) - 4;
+				else
+					pkt_size = status & 0x00003fff;
+
+				/*
+				 * The driver does not support incoming fragmented
+				 * frames. They are seen as a symptom of over-mtu
+				 * sized frames.
+				 */
+				/*
+				if (unlikely(rtl8169_fragmented_frame(status))) {
+					dev->stats.rx_dropped++;
+					dev->stats.rx_length_errors++;
+					goto release_descriptor;
+				}
+
+				skb = rtl8169_try_rx_copy(tp->Rx_databuff[entry],
+							  tp, pkt_size, addr);
+				if (!skb) {
+					dev->stats.rx_dropped++;
+					goto release_descriptor;
+				}
+
+				rtl8169_rx_csum(skb, status);
+				skb_put(skb, pkt_size);
+				skb->protocol = eth_type_trans(skb, dev);
+
+				rtl8169_rx_vlan_tag(desc, skb);
+
+				if (skb->pkt_type == PACKET_MULTICAST)
+					dev->stats.multicast++;
+
+				napi_gro_receive(&tp->napi, skb);
+
+				__u64_stats_update_begin(&tp->rx_stats.syncp);
+				tp->rx_stats.packets++;
+				tp->rx_stats.bytes += pkt_size;
+				__u64_stats_update_end(&tp->rx_stats.syncp);
+				*/
+				printf("desc[%03d]: size= %5d ", i, pkt_size);
+				print_packet((unsigned char *)rxBuffers[i]);
+				printf("\n");
+			}
+			/*release_descriptor: */
+			rxRing[i].opts2 = 0;
+			rtl8169_mark_to_asic(&rxRing[i], 2048);
+		}
+
+	}
+
+out:
+	close(group);
+	close(container);
+	return -1;
+
+}
diff --git a/include/drivers/r8169.h b/include/drivers/r8169.h
new file mode 100644
index 0000000..1b125f2
--- /dev/null
+++ b/include/drivers/r8169.h
@@ -0,0 +1,235 @@
+#ifndef _R8169_H_
+#define _R8169_H_
+#define COMPILER_BARRIER() asm volatile("" ::: "memory")
+#define MEMORY_BARRIER() asm volatile ("mfence" ::: "memory")
+#define STORE_BARRIER() asm volatile ("sfence" ::: "memory")
+#define LOAD_BARRIER() asm volatile ("lfence" ::: "memory")
+#define dma_wmb() STORE_BARRIER()
+#define dma_rmb() LOAD_BARRIER()
+#define unlikely(x) (x)
+
+#define cpu_to_le32(x) htole32(x)
+#define cpu_to_le64(x) htole64(x)
+#define le32_to_cpu(x) le32toh(x)
+
+#define NUM_TX_DESC 64 /* Number of Tx descriptor registers */
+#define NUM_RX_DESC 256U /* Number of Rx descriptor registers */
+#define R8169_TX_RING_BYTES (NUM_TX_DESC * sizeof(struct TxDesc))
+#define R8169_RX_RING_BYTES (NUM_RX_DESC * sizeof(struct RxDesc))
+
+typedef struct iomem {
+ __u64 vaddr;
+ __u64 iova;
+ __u64 size;
+} iomem;
+
+/* drivers/ethernet/realtek/r8169.c */
+enum rtl_register_content {
+ /* InterruptStatusBits */
+ SYSErr = 0x8000,
+ PCSTimeout = 0x4000,
+ SWInt = 0x0100,
+ TxDescUnavail = 0x0080,
+ RxFIFOOver = 0x0040,
+ LinkChg = 0x0020,
+ RxOverflow = 0x0010,
+ TxErr = 0x0008,
+ TxOK = 0x0004,
+ RxErr = 0x0002,
+ RxOK = 0x0001,
+
+ /* RxStatusDesc */
+ RxBOVF = (1 << 24),
+ RxFOVF = (1 << 23),
+ RxRWT = (1 << 22),
+ RxRES = (1 << 21),
+ RxRUNT = (1 << 20),
+ RxCRC = (1 << 19),
+
+ /* ChipCmdBits */
+ StopReq = 0x80,
+ CmdReset = 0x10,
+ CmdRxEnb = 0x08,
+ CmdTxEnb = 0x04,
+ RxBufEmpty = 0x01,
+
+ /* TXPoll register p.5 */
+ HPQ = 0x80, /* Poll cmd on the high prio queue */
+ NPQ = 0x40, /* Poll cmd on the low prio queue */
+ FSWInt = 0x01, /* Forced software interrupt */
+
+ /* Cfg9346Bits */
+ Cfg9346_Lock = 0x00,
+ Cfg9346_Unlock = 0xc0,
+
+ /* rx_mode_bits */
+ AcceptErr = 0x20,
+ AcceptRunt = 0x10,
+ AcceptBroadcast = 0x08,
+ AcceptMulticast = 0x04,
+ AcceptMyPhys = 0x02,
+ AcceptAllPhys = 0x01,
+#define RX_CONFIG_ACCEPT_MASK 0x3f
+
+ /* TxConfigBits */
+ TxInterFrameGapShift = 24,
+ TxDMAShift = 8, /* DMA burst value (0-7) is shift this many bits */
+
+ /* Config1 register p.24 */
+ LEDS1 = (1 << 7),
+ LEDS0 = (1 << 6),
+ Speed_down = (1 << 4),
+ MEMMAP = (1 << 3),
+ IOMAP = (1 << 2),
+ VPD = (1 << 1),
+ PMEnable = (1 << 0), /* Power Management Enable */
+
+ /* Config2 register p. 25 */
+ ClkReqEn = (1 << 7), /* Clock Request Enable */
+ MSIEnable = (1 << 5), /* 8169 only. Reserved in the 8168. */
+ PCI_Clock_66MHz = 0x01,
+ PCI_Clock_33MHz = 0x00,
+
+ /* Config3 register p.25 */
+ MagicPacket = (1 << 5), /* Wake up when receives a Magic Packet */
+ LinkUp = (1 << 4), /* Wake up when the cable connection is re-established */
+ Jumbo_En0 = (1 << 2), /* 8168 only. Reserved in the 8168b */
+ Rdy_to_L23 = (1 << 1), /* L23 Enable */
+ Beacon_en = (1 << 0), /* 8168 only. Reserved in the 8168b */
+
+ /* Config4 register */
+ Jumbo_En1 = (1 << 1), /* 8168 only. Reserved in the 8168b */
+
+ /* Config5 register p.27 */
+ BWF = (1 << 6), /* Accept Broadcast wakeup frame */
+ MWF = (1 << 5), /* Accept Multicast wakeup frame */
+ UWF = (1 << 4), /* Accept Unicast wakeup frame */
+ Spi_en = (1 << 3),
+ LanWake = (1 << 1), /* LanWake enable/disable */
+ PMEStatus = (1 << 0), /* PME status can be reset by PCI RST# */
+ ASPM_en = (1 << 0), /* ASPM enable */
+
+ /* TBICSR p.28 */
+ TBIReset = 0x80000000,
+ TBILoopback = 0x40000000,
+ TBINwEnable = 0x20000000,
+ TBINwRestart = 0x10000000,
+ TBILinkOk = 0x02000000,
+ TBINwComplete = 0x01000000,
+
+ /* CPlusCmd p.31 */
+ EnableBist = (1 << 15), // 8168 8101
+ Mac_dbgo_oe = (1 << 14), // 8168 8101
+ Normal_mode = (1 << 13), // unused
+ Force_half_dup = (1 << 12), // 8168 8101
+ Force_rxflow_en = (1 << 11), // 8168 8101
+ Force_txflow_en = (1 << 10), // 8168 8101
+ Cxpl_dbg_sel = (1 << 9), // 8168 8101
+ ASF = (1 << 8), // 8168 8101
+ PktCntrDisable = (1 << 7), // 8168 8101
+ Mac_dbgo_sel = 0x001c, // 8168
+ RxVlan = (1 << 6),
+ RxChkSum = (1 << 5),
+ PCIDAC = (1 << 4),
+ PCIMulRW = (1 << 3),
+ INTT_0 = 0x0000, // 8168
+ INTT_1 = 0x0001, // 8168
+ INTT_2 = 0x0002, // 8168
+ INTT_3 = 0x0003, // 8168
+
+ /* rtl8169_PHYstatus */
+ TBI_Enable = 0x80,
+ TxFlowCtrl = 0x40,
+ RxFlowCtrl = 0x20,
+ _1000bpsF = 0x10,
+ _100bps = 0x08,
+ _10bps = 0x04,
+ LinkStatus = 0x02,
+ FullDup = 0x01,
+
+ /* _TBICSRBit */
+ TBILinkOK = 0x02000000,
+
+ /* ResetCounterCommand */
+ CounterReset = 0x1,
+
+ /* DumpCounterCommand */
+ CounterDump = 0x8,
+
+ /* magic enable v2 */
+ MagicPacket_v2 = (1 << 16), /* Wake up when receives a Magic Packet */
+};
+
+enum rtl_desc_bit {
+ /* First doubleword. */
+ DescOwn = (1 << 31), /* Descriptor is owned by NIC */
+ RingEnd = (1 << 30), /* End of descriptor ring */
+ FirstFrag = (1 << 29), /* First segment of a packet */
+ LastFrag = (1 << 28), /* Final segment of a packet */
+};
+
+/* Generic case. */
+enum rtl_tx_desc_bit {
+ /* First doubleword. */
+ TD_LSO = (1 << 27), /* Large Send Offload */
+#define TD_MSS_MAX 0x07ffu /* MSS value */
+
+ /* Second doubleword. */
+ TxVlanTag = (1 << 17), /* Add VLAN tag */
+};
+
+/* 8169, 8168b and 810x except 8102e. */
+enum rtl_tx_desc_bit_0 {
+ /* First doubleword. */
+#define TD0_MSS_SHIFT 16 /* MSS position (11 bits) */
+ TD0_TCP_CS = (1 << 16), /* Calculate TCP/IP checksum */
+ TD0_UDP_CS = (1 << 17), /* Calculate UDP/IP checksum */
+ TD0_IP_CS = (1 << 18), /* Calculate IP checksum */
+};
+
+/* 8102e, 8168c and beyond. */
+enum rtl_tx_desc_bit_1 {
+ /* First doubleword. */
+ TD1_GTSENV4 = (1 << 26), /* Giant Send for IPv4 */
+ TD1_GTSENV6 = (1 << 25), /* Giant Send for IPv6 */
+#define GTTCPHO_SHIFT 18
+#define GTTCPHO_MAX 0x7fU
+
+ /* Second doubleword. */
+#define TCPHO_SHIFT 18
+#define TCPHO_MAX 0x3ffU
+#define TD1_MSS_SHIFT 18 /* MSS position (11 bits) */
+ TD1_IPv6_CS = (1 << 28), /* Calculate IPv6 checksum */
+ TD1_IPv4_CS = (1 << 29), /* Calculate IPv4 checksum */
+ TD1_TCP_CS = (1 << 30), /* Calculate TCP/IP checksum */
+ TD1_UDP_CS = (1 << 31), /* Calculate UDP/IP checksum */
+};
+
+enum rtl_rx_desc_bit {
+ /* Rx private */
+ PID1 = (1 << 18), /* Protocol ID bit 1/2 */
+ PID0 = (1 << 17), /* Protocol ID bit 0/2 */
+#define RxProtoUDP (PID1)
+#define RxProtoTCP (PID0)
+#define RxProtoIP (PID1 | PID0)
+#define RxProtoMask RxProtoIP
+ IPFail = (1 << 16), /* IP checksum failed */
+ UDPFail = (1 << 15), /* UDP/IP checksum failed */
+ TCPFail = (1 << 14), /* TCP/IP checksum failed */
+ RxVlanTag = (1 << 16), /* VLAN tag available */
+};
+
+#define RsvdMask 0x3fffc000
+
+struct TxDesc {
+ __le32 opts1;
+ __le32 opts2;
+ __le64 addr;
+};
+
+struct RxDesc {
+ __le32 opts1;
+ __le32 opts2;
+ __le64 addr;
+};
+#endif
diff --git a/include/vfio_api.h b/include/vfio_api.h
new file mode 100644
index 0000000..7485e67
--- /dev/null
+++ b/include/vfio_api.h
@@ -0,0 +1,11 @@
+#ifndef VFIO_API_H
+#define VFIO_API_H
+int dma_map_type1(int fd, unsigned long sz, void **vaddr, uint64_t iova);
+int dma_unmap_type1(int fd, unsigned long sz, void *vaddr, uint64_t iova);
+int get_group(int grp_id);
+int get_container(void);
+int vfio_init_dev(int grp, int container, struct vfio_group_status *grp_status,
+ struct vfio_iommu_type1_info *iommu_info,
+ struct vfio_device_info *dev_info,
+ struct vfio_region_info *reg_info, char *group_uuid);
+#endif
diff --git a/patches/vf-netmdev.patch b/patches/vf-netmdev.patch
new file mode 100644
index 0000000..6a2760b
--- /dev/null
+++ b/patches/vf-netmdev.patch
@@ -0,0 +1,642 @@
+diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
+index e03fcf9..1d37643 100644
+--- a/drivers/net/ethernet/realtek/r8169.c
++++ b/drivers/net/ethernet/realtek/r8169.c
+@@ -33,6 +33,13 @@
+ #include <asm/io.h>
+ #include <asm/irq.h>
+
++#if defined(VFIO_MDEV_DEVICE) || defined(VFIO_MDEV_DEVICE_MODULE)
++#include <linux/sched/task.h>
++#include <linux/mm.h>
++#include <linux/vfio.h>
++#include <linux/mdev.h>
++#endif
++
+ #define RTL8169_VERSION "2.3LK-NAPI"
+ #define MODULENAME "r8169"
+ #define PFX MODULENAME ": "
+@@ -7393,6 +7400,11 @@ static int rtl_rx(struct net_device *dev, struct rtl8169_private *tp, u32 budget
+ unsigned int cur_rx, rx_left;
+ unsigned int count;
+
++#if defined(VFIO_MDEV_DEVICE) || defined(VFIO_MDEV_DEVICE_MODULE)
++ if (dev->priv_flags & IFF_VFNETDEV)
++ return budget ;
++#endif
++
+ cur_rx = tp->cur_rx;
+
+ for (rx_left = min(budget, NUM_RX_DESC); rx_left > 0; rx_left--, cur_rx++) {
+@@ -7577,6 +7589,11 @@ static int rtl8169_poll(struct napi_struct *napi, int budget)
+ int work_done= 0;
+ u16 status;
+
++#if defined(VFIO_MDEV_DEVICE) || defined(VFIO_MDEV_DEVICE_MODULE)
++ if (dev->priv_flags & IFF_VFNETDEV)
++ return budget;
++#endif
++
+ status = rtl_get_events(tp);
+ rtl_ack_events(tp, status & ~tp->event_slow);
+
+@@ -7700,11 +7717,19 @@ static int rtl_open(struct net_device *dev)
+ if (!tp->TxDescArray)
+ goto err_pm_runtime_put;
+
++#if defined(VFIO_MDEV_DEVICE) || defined(VFIO_MDEV_DEVICE_MODULE)
++ printk(KERN_INFO"TxDescArray @%p (%llx), virt_tophys=%llx\n",
++ tp->TxDescArray, tp->TxPhyAddr, virt_to_phys(tp->TxDescArray));
++#endif
+ tp->RxDescArray = dma_alloc_coherent(&pdev->dev, R8169_RX_RING_BYTES,
+ &tp->RxPhyAddr, GFP_KERNEL);
+ if (!tp->RxDescArray)
+ goto err_free_tx_0;
+
++#if defined(VFIO_MDEV_DEVICE) || defined(VFIO_MDEV_DEVICE_MODULE)
++ printk(KERN_INFO"RxDescArray KVA(@%p) -> PA(%llx) <- IOVA(%llx)\n",
++ tp->RxDescArray, virt_to_phys(tp->RxDescArray), tp->RxPhyAddr);
++#endif
+ retval = rtl8169_init_ring(dev);
+ if (retval < 0)
+ goto err_free_rx_1;
+@@ -8008,6 +8033,10 @@ static void rtl_remove_one(struct pci_dev *pdev)
+ struct net_device *dev = pci_get_drvdata(pdev);
+ struct rtl8169_private *tp = netdev_priv(dev);
+
++#if defined(VFIO_MDEV_DEVICE) || defined(VFIO_MDEV_DEVICE_MODULE)
++ mdev_unregister_device(&pdev->dev);
++#endif
++
+ if ((tp->mac_version == RTL_GIGA_MAC_VER_27 ||
+ tp->mac_version == RTL_GIGA_MAC_VER_28 ||
+ tp->mac_version == RTL_GIGA_MAC_VER_31 ||
+@@ -8191,6 +8220,542 @@ static void rtl_hw_initialize(struct rtl8169_private *tp)
+ }
+ }
+
++
++#if defined(VFIO_MDEV_DEVICE) || defined(VFIO_MDEV_DEVICE_MODULE)
++struct iovamap {
++ u64 iova;
++ void *vaddr;
++ struct device *dev;
++ u32 size:25; /* maximum of 32MB */
++ u32 direction:2; /* DMA_FROM_DEVICE... */
++};
++
++typedef struct netmdev {
++ union {
++ char page0[4096];
++ struct {
++ struct net_device* netdev;
++ /* FIXME USE A LINKED LIST */
++ int mappings_count;
++ struct iovamap mappings[128]; /* 3.5KB */
++ };
++ };
++ union {
++ /* shadow features & statistics page */
++ char page1[4096];
++ struct {
++ netdev_features_t features;
++ netdev_features_t hw_features;
++ netdev_features_t wanted_features;
++ netdev_features_t vlan_features;
++ netdev_features_t hw_enc_features;
++ netdev_features_t mpls_features;
++ netdev_features_t gso_partial_features;
++ struct net_device_stats stats;
++ atomic_long_t rx_dropped;
++ atomic_long_t tx_dropped;
++ atomic_long_t rx_nohandler;
++ };
++ };
++} netmdev;
++
++/*
++SYSFS structure for the controlling device
++*/
++
++static ssize_t available_instances_show(struct kobject *kobj, struct device *dev,
++ char *buf)
++{
++ return scnprintf(buf, PAGE_SIZE, "%d\n", 1);
++}
++static MDEV_TYPE_ATTR_RO(available_instances);
++
++static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
++ char *buf)
++{
++ return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
++}
++static MDEV_TYPE_ATTR_RO(device_api);
++
++static struct attribute *sysfs_vfnetdev_attributes[] = {
++ &mdev_type_attr_device_api.attr,
++ &mdev_type_attr_available_instances.attr,
++ NULL,
++};
++
++static struct attribute_group sysfs_vfnetdev_type = {
++ .name = "vfnetdev",
++ .attrs = sysfs_vfnetdev_attributes,
++};
++
++/* Only 1 supported for now */
++static struct attribute_group *sysfs_type_list[] = {
++ &sysfs_vfnetdev_type,
++ NULL
++};
++
++/*
++ * libraries
++ */
++static struct net_device *netmdev_get_netdev(struct mdev_device* mdev)
++{
++ struct netmdev *netmdev;
++
++ netmdev = mdev_get_drvdata(mdev);
++ if (!netmdev)
++ return NULL;
++
++ return netmdev->netdev;
++}
++
++static void r8169_pause_datapath(struct net_device* netdev)
++{
++ //void __iomem *ioaddr;
++ struct rtl8169_private *tp;
++
++ if (!netdev)
++ return;
++ tp = netdev_priv(netdev);
++ if (!tp)
++ return;
++
++ //ioaddr = tp->mmio_addr;
++ RTL_W8(ChipCmd, RTL_R8(ChipCmd) & ~(CmdTxEnb | CmdRxEnb));
++}
++
++static void r8169_resume_datapath(struct net_device* netdev)
++{
++ //void __iomem *ioaddr;
++ struct rtl8169_private *tp;
++
++ if (!netdev)
++ return;
++
++ tp = netdev_priv(netdev);
++
++ if (!tp)
++ return;
++
++ //ioaddr = tp->mmio_addr;
++ RTL_W8(ChipCmd, CmdTxEnb | CmdRxEnb);
++}
++
++static int r8169_get_region(struct net_device* netdev, struct vfio_region_info* info)
++{
++ struct rtl8169_private *tp;
++
++ if (!netdev)
++ return -EINVAL;
++
++ tp = netdev_priv(netdev);
++ if (!tp)
++ return -EFAULT;
++
++ switch (info->index) {
++ case VFIO_PCI_NUM_REGIONS + 1:
++ case VFIO_PCI_NUM_REGIONS + 2:
++ {
++ if (info->index == VFIO_PCI_NUM_REGIONS + 1) {
++ info->offset = (__u64)(tp->RxDescArray);
++ info->size = R8169_RX_RING_BYTES;
++ }
++ else if (info->index == VFIO_PCI_NUM_REGIONS + 2) {
++ info->offset = (__u64)(tp->TxDescArray);
++ info->size = R8169_TX_RING_BYTES;
++ }
++ else return -EINVAL;
++
++ info->flags = VFIO_REGION_INFO_FLAG_MMAP;
++ break;
++ }
++
++ default:
++ return -EINVAL;
++ }
++
++ return 0;
++
++}
++
++/*
++ * SYSFS structure for created mdevices
++ */
++static ssize_t netdev_show(struct device *dev, struct device_attribute *attr,
++ char *buf)
++{
++ struct mdev_device* mdev;
++ struct net_device* netdev;
++
++ mdev = mdev_from_dev(dev);
++ if (!mdev)
++ return scnprintf(buf, PAGE_SIZE, "mdev not found\n");
++
++ netdev = netmdev_get_netdev(mdev);
++ if (!netdev)
++ return scnprintf(buf, PAGE_SIZE, "ndev-mdev not found\n");
++
++ return scnprintf(buf, PAGE_SIZE, "%.16s\n", netdev->name);
++}
++
++static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
++ const char *buf, size_t count)
++{
++ struct mdev_device *mdev;
++ struct net_device *port;
++ struct netmdev *netmdev;
++ char name[IFNAMSIZ+1];
++
++ if (count < 2)
++ return -EINVAL;
++
++ mdev = mdev_from_dev(dev);
++ if (!mdev)
++ return -ENODEV;
++
++ netmdev = mdev_get_drvdata(mdev);
++ if (netmdev)
++ return -ENODEV;
++
++ netmdev = kzalloc(sizeof(*netmdev), GFP_KERNEL);
++ if (!netmdev)
++ return -ENOMEM;
++ mdev_set_drvdata(mdev, netmdev);
++
++ if (count > IFNAMSIZ)
++ return -ENODEV;
++
++ memset(name, 0, sizeof(name));
++ scnprintf(name, IFNAMSIZ + 1, "%.*s", (int)count - 1, buf);
++ port = dev_get_by_name(&init_net, name);
++ if (!port)
++ return -ENODEV;
++
++ /* FIXME find a way to check if this is the parent device */
++ //if (&port->dev != mdev_parent_dev(mdev)) return -1;
++
++ netmdev->netdev = port;
++
++ return count;
++}
++
++static DEVICE_ATTR_RW(netdev);
++static struct attribute *sysfs_mdev_vfnetdev_attributes[] = {
++ &dev_attr_netdev.attr,
++ NULL,
++};
++
++static struct attribute_group sysfs_mdev_vfnetdev_group = {
++ .name = "vfnetdev",
++ .attrs = sysfs_mdev_vfnetdev_attributes,
++};
++
++static const struct attribute_group *sysfs_mdev_groups[] = {
++ &sysfs_mdev_vfnetdev_group,
++ NULL,
++};
++
++
++static int vf_netdev_create(struct kobject *kobj, struct mdev_device *mdev)
++{
++ return 0;
++}
++
++static int vf_netdev_remove(struct mdev_device *mdev)
++{
++ struct netmdev* netmdev = mdev_get_drvdata(mdev);
++ struct net_device* port;
++
++ printk(KERN_INFO"%s %d\n", __func__, __LINE__);
++ port = netmdev_get_netdev(mdev);
++ dev_put(port);
++ kfree(netmdev);
++ mdev_set_drvdata(mdev, NULL);
++
++ return 0;
++}
++
++static int vf_netdev_open(struct mdev_device *mdev)
++{
++ struct netmdev* netmdev = mdev_get_drvdata(mdev);
++ struct net_device* port;
++
++ printk(KERN_INFO"%s %d\n", __func__, __LINE__);
++ /* TODO shadow stats to netmdev */
++ port = netmdev_get_netdev(mdev);
++ r8169_pause_datapath(port);
++ /* barrier required? */
++ port->priv_flags |= IFF_VFNETDEV;
++ /* deallocate kernel buffers from ring */
++ rtl8169_rx_clear(netdev_priv(port));
++
++ return 0;
++}
++
++static void vf_netdev_release(struct mdev_device *mdev)
++{
++ struct netmdev *nd = mdev_get_drvdata(mdev);
++ struct net_device *port;
++ int i;
++
++ if (!nd)
++ return;
++ /* TODO export shadow stats to net_device */
++ printk(KERN_INFO"%s %d\n", __func__, __LINE__);
++ for (i = 0; i < nd->mappings_count; i++, nd->mappings_count--) {
++ dma_unmap_single(nd->mappings[i].dev,
++ nd->mappings[i].iova, nd->mappings[i].size,
++ nd->mappings[i].direction);
++ kfree(nd->mappings[i].vaddr);
++ }
++ port = netmdev_get_netdev(mdev);
++ if (port) {
++ struct rtl8169_private *tp;
++ tp = netdev_priv(port);
++
++ /* replenish the rings with kernel buffers */
++ rtl8169_rx_fill(tp);
++
++ port->priv_flags &= ~IFF_VFNETDEV;
++ /* barrier required? */
++ r8169_resume_datapath(port);
++ }
++
++ return;
++}
++
++static long vf_netdev_ioctl(struct mdev_device *mdev, unsigned int cmd,
++ unsigned long arg)
++{
++ unsigned long minsz;
++ struct net_device *netdev;
++ struct netmdev* netmdev;
++
++ if (!mdev)
++ return -EINVAL;
++
++ netdev = netmdev_get_netdev(mdev);
++ netmdev = mdev_get_drvdata(mdev);
++
++ if (!netdev || !netmdev)
++ return -ENODEV;
++
++ switch (cmd) {
++ case VFIO_DEVICE_GET_INFO:
++ {
++ struct vfio_device_info info;
++
++ minsz = offsetofend(struct vfio_device_info, num_irqs);
++ if (copy_from_user(&info, (void __user *)arg, minsz))
++ return -EFAULT;
++
++ if (info.argsz < minsz)
++ return -EINVAL;
++
++ info.flags = VFIO_DEVICE_FLAGS_PCI;
++ /*
++ * FIXME - find the number of rx queues when not having
++ * CONFIG_SYSFS if not possible to do it in a generic way, plan
++ * for a callback
++ */
++ /* rx_ring and tx_ring*/
++ info.num_regions = VFIO_PCI_NUM_REGIONS + netdev->num_tx_queues + 1;
++ info.num_irqs = 1;
++
++ if (copy_to_user((void __user *)arg, &info, minsz))
++ return -EFAULT;
++
++ return 0;
++ }
++ case VFIO_DEVICE_GET_REGION_INFO:
++ {
++ struct vfio_region_info info;
++ int ret;
++
++ minsz = offsetofend(struct vfio_region_info, offset);
++
++ if (copy_from_user(&info, (void __user *)arg, minsz))
++ return -EFAULT;
++
++ if (info.argsz < minsz)
++ return -EINVAL;
++
++ ret = r8169_get_region(netdev, &info);
++
++ if (ret < 0) return ret;
++
++ if (copy_to_user((void __user *)arg, &info, minsz))
++ return -EFAULT;
++
++ return 0;
++ }
++ case VFIO_IOMMU_MAP_DMA:
++ {
++ struct vfio_iommu_type1_dma_map map;
++ struct vm_area_struct *vma;
++ void *data;
++ struct device* parent_dev;
++ int node;
++ dma_addr_t mapping;
++ int ret = -EINVAL;
++
++ /* allocate DMA area and map it where the userland asks
++	 * userland needs to mmap an area WITHOUT allocating pages:
++ * mmap(vaddr,size, PROT_READ | PROT_WRITE, MAP_SHARED |
++ * MAP_ANONYMOUS | MAP_NORESERVE | MAP_FIXED, -1, 0
++	 * MAP_NORESERVE ensures only VA space is booked, no pages are
++	 * mapped. The mapping must be the entire area, not partial on
++ * the vma
++ */
++
++ if (netmdev->mappings_count >= 128)
++ return -EFAULT;
++
++ minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
++
++ if (copy_from_user(&map, (void __user *)arg, minsz)) {
++ ret = -EFAULT;
++ goto out;
++ }
++
++ if (map.argsz < minsz)
++ goto out;
++
++ printk(KERN_INFO"VFIO_IOMMU_MAP_DMA: find_vma(%llx)\n", map.vaddr);
++ /*
++ * locates the containing vma for the required map.vaddr
++ * the vma must point to the entire zone allocated by mmap in
++ * userland
++ */
++ vma = find_vma(current->mm, map.vaddr);
++ if (!vma)
++ return -EFAULT;
++ if (map.vaddr >= vma->vm_end)
++ return -EFAULT;
++
++ printk(KERN_INFO"VFIO_IOMMU_MAP_DMA: found vma(%llx) -> start=%lx end=%lx pg_off=%lx\n",
++ map.vaddr, vma->vm_start, vma->vm_end, vma->vm_pgoff);
++ /* the iova will be returned as part of the ioctl to the userland */
++ //parent_dev = &tp->pci_dev->dev;
++ parent_dev = mdev_parent_dev(mdev);
++
++ node = netdev->dev.parent ? dev_to_node(netdev->dev.parent) : -1;
++ data = kmalloc_node(map.size, GFP_KERNEL, node);
++ if (!data)
++ /* return ret? */
++ return -ENOMEM;
++
++ printk(KERN_INFO"VFIO_IOMMU_MAP_DMA: about to dma_map_single(%p, %p, %lld, DMA_FROM_DEVICE)\n",
++ parent_dev, data, map.size);
++ mapping = dma_map_single(parent_dev, data, map.size,
++ DMA_FROM_DEVICE);
++ if (unlikely(dma_mapping_error(parent_dev, mapping))) {
++ if (net_ratelimit())
++ printk(KERN_ERR"Failed to dma_map_single buffer for userland!\n");
++ kfree(data);
++ goto out;
++ }
++ map.iova = mapping;
++ ret = io_remap_pfn_range(vma, map.vaddr,
++ virt_to_phys(data) >> PAGE_SHIFT,
++ map.size, vma->vm_page_prot);
++ printk(KERN_INFO"VFIO_IOMMU_MAP_DMA: io_remap_pfn_range %llx -> physmem <- @%llx, %lld:%d\n",
++ map.vaddr, map.iova, map.size, ret);
++ if (ret != 0) {
++ dma_unmap_single(parent_dev, mapping, map.size,
++ DMA_FROM_DEVICE);
++ kfree(data);
++ printk(KERN_ERR"VFIO_IOMMU_MAP_DMA: io_remap_pfn_range failed\n");
++ return -EFAULT;
++ }
++
++ printk(KERN_INFO"VFIO_IOMMU_MAP_DMA: recording the mapping %d\n",
++ netmdev->mappings_count);
++ netmdev->mappings[netmdev->mappings_count].dev = parent_dev;
++ netmdev->mappings[netmdev->mappings_count].vaddr = data;
++ netmdev->mappings[netmdev->mappings_count].iova = mapping;
++ netmdev->mappings[netmdev->mappings_count].size = map.size;
++ netmdev->mappings_count++;
++
++ printk(KERN_INFO"VFIO_IOMMU_MAP_DMA: preparing response back to user\n");
++ if (copy_to_user((void __user *)arg, &map, minsz))
++ return -EFAULT;
++
++ ret = 0;
++out:
++ return ret;
++ }
++
++ case 500: {
++ r8169_resume_datapath(netdev);
++ return 0;
++ }
++
++ } /* switch */
++
++ return -EINVAL;
++
++}
++
++static int vf_netdev_mmap(struct mdev_device *mdev,
++ struct vm_area_struct *vma)
++{
++ struct net_device* netdev;
++ struct rtl8169_private *tp;
++ u64 req_len;
++ int ret = 0;
++	/* userland wants to access ring descriptors that were pre-allocated
++	 * by the kernel
++	 * note: userland needs to use the MAP ioctl to create packet buffers
++ */
++ netdev = netmdev_get_netdev(mdev);
++ tp = netdev_priv(netdev);
++
++ /* check that we try to map only authorized areas
++ * FIXME is there a way to check all the transmit and receive rings
++ * from an abstract netdev?
++ */
++ if (vma->vm_pgoff != ((__u64)tp->RxDescArray >> PAGE_SHIFT) &&
++ vma->vm_pgoff != ((__u64)tp->TxDescArray >> PAGE_SHIFT)) {
++ printk(KERN_INFO"invalid address\n");
++ return -EINVAL;
++ }
++
++ req_len = PAGE_ALIGN(vma->vm_end - vma->vm_start);
++
++ vma->vm_private_data = NULL;
++	/* FIXME this should be uncached memory but it seems the driver does
++	 * not map it non-cached. Strange...
++ */
++ //vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
++
++ ret = remap_pfn_range(vma, vma->vm_start,
++ virt_to_phys((void*)(vma->vm_pgoff << PAGE_SHIFT)) >> PAGE_SHIFT,
++ req_len, vma->vm_page_prot);
++
++ printk(KERN_INFO"vfnetdev_map %lx, @%llx, %lld:%d\n",
++ vma->vm_start, virt_to_phys((void*)(vma->vm_pgoff << PAGE_SHIFT)),
++ req_len, ret);
++
++ return ret;
++}
++
++static const struct mdev_parent_ops vf_netdev_ops = {
++ .supported_type_groups = sysfs_type_list,
++ .mdev_attr_groups = sysfs_mdev_groups,
++ .create = vf_netdev_create,
++ .remove = vf_netdev_remove,
++
++ .open = vf_netdev_open,
++ .release = vf_netdev_release,
++
++ .read = NULL,
++ .write = NULL,
++ .mmap = vf_netdev_mmap,
++ .ioctl = vf_netdev_ioctl,
++};
++
++#endif
++
+ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+ {
+ const struct rtl_cfg_info *cfg = rtl_cfg_infos + ent->driver_data;
+@@ -8207,6 +8772,13 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+ MODULENAME, RTL8169_VERSION);
+ }
+
++#if defined(VFIO_MDEV_DEVICE) || defined(VFIO_MDEV_DEVICE_MODULE)
++ if (mdev_register_device(&pdev->dev, &vf_netdev_ops) < 0)
++ printk(KERN_ERR"Could not register device\n");
++ else
++ printk(KERN_INFO"Successfully registered vf-netdev device\n");
++#endif
++
+ dev = alloc_etherdev(sizeof (*tp));
+ if (!dev) {
+ rc = -ENOMEM;
+diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
+index f535779..8deea1c 100644
+--- a/include/linux/netdevice.h
++++ b/include/linux/netdevice.h
+@@ -1386,6 +1386,7 @@ enum netdev_priv_flags {
+ IFF_RXFH_CONFIGURED = 1<<25,
+ IFF_PHONY_HEADROOM = 1<<26,
+ IFF_MACSEC = 1<<27,
++ IFF_VFNETDEV = 1<<28,
+ };
+
+ #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..b0609f5
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,52 @@
+#!/bin/sh
+# Only supported driver for now is r8169, make it configurable in the future
+driver='r8169'
+sys_drv_name="$driver"'-vfnetdev'
+intf='enp4s0'
+
+usage() {
+ echo "$0 create/destroy <interface name(optional)>. Default $intf"
+ exit 1
+}
+
+[ $# -lt 1 ] && usage
+[ -n "$2" ] && intf="$2"
+
+echo "Checking for interface $intf"
+if [ ! -e "/sys/class/net/$intf/device/mdev_supported_types/$sys_drv_name/create" ]; then
+ echo "interface $intf has no vfio-mdev support"
+ exit 1
+fi
+
+vf_create() {
+ dev_uuid=$(uuidgen)
+ sudo sh -c "echo $dev_uuid > /sys/class/net/$intf/device/mdev_supported_types/$sys_drv_name/create"
+ #sudo sh -c "echo $dev_uuid > /sys/class/net/$intf/mdev_supported_types/net-vfnetdev/create"
+	# the newly created mdev is not tied to any port of the parent device yet
+ echo "Bind $intf to the newly created mdevice $dev_uuid"
+ sudo sh -c "echo $intf > /sys/bus/mdev/devices/$dev_uuid/vfnetdev/netdev"
+	# ensure the IOMMU group is readable by non-root programs
+ vfio_group=$(basename $(readlink /sys/bus/mdev/devices/$dev_uuid/iommu_group))
+ user=$(whoami)
+ grp=$(id -g -n $user)
+ sudo chown "$user":"$grp" /dev/vfio/$vfio_group
+ echo "created $dev_uuid"
+ echo "Run ./r8169 $vfio_group $dev_uuid"
+}
+
+vf_destroy() {
+ # FIXME only one mdev per ethernet supported for now
+ echo 1 > /sys/class/mdev_bus/$intf/*/remove > /dev/null
+}
+
+case "$1" in
+ create)
+ vf_create
+ ;;
+ destroy)
+ vf_destroy
+ ;;
+ *)
+ usage
+ ;;
+esac