From 1446c4f775dfd242edcc1120e26baddde2a09f73 Mon Sep 17 00:00:00 2001 From: Ilias Apalodimas Date: Wed, 11 Oct 2017 22:03:28 +0300 Subject: initial import Signed-off-by: Ilias Apalodimas --- .gitignore | 4 + Makefile | 8 + README | 3 + TODO | 4 + aa | 7 + api/vfio_api.c | 231 +++++++++++++++++ drivers/a.out | Bin 0 -> 13624 bytes drivers/r8169-orig.c | 567 +++++++++++++++++++++++++++++++++++++++++ drivers/r8169.c | 312 +++++++++++++++++++++++ include/drivers/r8169.h | 235 +++++++++++++++++ include/vfio_api.h | 11 + patches/vf-netmdev.patch | 642 +++++++++++++++++++++++++++++++++++++++++++++++ run.sh | 52 ++++ 13 files changed, 2076 insertions(+) create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 README create mode 100644 TODO create mode 100644 aa create mode 100644 api/vfio_api.c create mode 100755 drivers/a.out create mode 100644 drivers/r8169-orig.c create mode 100644 drivers/r8169.c create mode 100644 include/drivers/r8169.h create mode 100644 include/vfio_api.h create mode 100644 patches/vf-netmdev.patch create mode 100755 run.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6802ab9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +*.o +.tmp_versions/ +tags +r8169 diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..118be80 --- /dev/null +++ b/Makefile @@ -0,0 +1,8 @@ +CC=gcc +CFLAGS=-Iinclude/ -Wall -Werror -Wunused + +all: drivers/r8169.o api/vfio_api.o + #$(CC) drivers/r8169.c -o r8169 $(CFLAGS) + $(CC) drivers/r8169.o api/vfio_api.o -o r8169 $(CFLAGS) +clean: + rm -f r8169 && rm `find -name *.o` -f diff --git a/README b/README new file mode 100644 index 0000000..8893e83 --- /dev/null +++ b/README @@ -0,0 +1,3 @@ +- apply kernel patch and recompile, should support every upstream kernel > + 4.10.x +- diff --git a/TODO b/TODO new file mode 100644 index 0000000..400e5cf --- /dev/null +++ b/TODO @@ -0,0 +1,4 @@ +- Measure streaming vs uncached performance? 
+- Prefer cached memory for streaming DMA to userspace? https://aelseb.wordpress.com/2015/04/11/contiguous-memory-on-arm-and-cache-coherency/ +- Check IOMMU existence and use vmalloc instead of kmalloc for dma_map_*()? +- Invalidate caches? dma_sync_single_*() not used. diff --git a/aa b/aa new file mode 100644 index 0000000..06f6ba6 --- /dev/null +++ b/aa @@ -0,0 +1,7 @@ +# ff +sudo sh -c "echo ad28d022-ae90-11e7-b712-2bdaf6e1af1c > /sys/class/net/enp4s0/device/mdev_supported_types/r8169-vfnetdev/create" +sudo sh -c "echo enp4s0 > /sys/bus/mdev/devices/ad28d022-ae90-11e7-b712-2bdaf6e1af1c/vfnetdev/netdev" + +#apalos +sudo sh -c "echo 83b8f4f2-509f-382f-3c1e-e6bfe0fa1001 > /sys/class/net/enp4s0/mdev_supported_types/net-vfnetdev/create" + diff --git a/api/vfio_api.c b/api/vfio_api.c new file mode 100644 index 0000000..f6674fd --- /dev/null +++ b/api/vfio_api.c @@ -0,0 +1,231 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +static const char *vfio_fail_str[] = { + [VFIO_CHECK_EXTENSION] = "Doesn't support the IOMMU driver we want", + [VFIO_GROUP_GET_STATUS] = "Can't get status", + [VFIO_GROUP_SET_CONTAINER] = "Failed to set container", + [VFIO_SET_IOMMU] "Failed to set IOMMU", + [VFIO_IOMMU_GET_INFO] = "Failed to get IOMMU info", + [VFIO_GROUP_GET_DEVICE_FD] = "Failed to get device FD", + [VFIO_DEVICE_GET_INFO] = "Failed to get device info", + [VFIO_DEVICE_GET_REGION_INFO] = "Failed to get PCI region info", +}; + +static void vfio_print_fail(int reason) +{ + if (reason > ARRAY_SIZE(vfio_fail_str)) + printf("Unknown\n"); + else + printf("%s\n", vfio_fail_str[reason]); +} + +/* + * returns a valid container + * fd must be close by caller + */ +int get_container(void) +{ + int ret; + int container; + /* Create a new container */ + container = open("/dev/vfio/vfio", O_RDWR); + + if (container < 0) + return container; + + ret = ioctl(container, VFIO_GET_API_VERSION); + if 
(ret != VFIO_API_VERSION) { + printf("Unknown API version\n"); + goto out; + } + + if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) { + printf("Doesn't support the IOMMU driver we want\n"); + goto out; + } + + return container; +out: + close(container); + container = -1; + return ret; + +} + +/* + * returns a valid group + * fd must be close by caller + */ +int get_group(int grp_id) +{ + char path[64]; + int ret; + int group; + struct vfio_group_status group_status = { .argsz = sizeof(group_status) }; + + snprintf(path, sizeof(path), "/dev/vfio/%d", grp_id); + group = open(path, O_RDWR); + if (group < 0) { + printf("Failed to open %s, %d (%s)\n", + path, group, strerror(errno)); + return group; + } + + ret = ioctl(group, VFIO_GROUP_GET_STATUS, &group_status); + + if (ret) { + printf("ioctl(VFIO_GROUP_GET_STATUS) failed\n"); + goto out; + } + + /* Test the group is viable and available */ + if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { + printf("Group is not viable\n"); + goto out; + } + + return group; +out: + close(group); + group = -1; + return ret; +} + +/* + * @fd: container fd + * @sz: requested size + * @vaddr: virtual address + */ +int dma_map_type1(int fd, unsigned long sz, void **vaddr, uint64_t iova) +{ + int ret; + struct vfio_iommu_type1_dma_map dma_map; + + /* Allocate some space and setup a DMA mapping */ + *vaddr = mmap(NULL, (size_t)sz, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (*vaddr == MAP_FAILED) { + printf("Failed to map memory\n"); + return -ENOMEM; + } + + memset(&dma_map, 0, sizeof(dma_map)); + dma_map.argsz = sizeof(dma_map); + dma_map.vaddr = (unsigned long)*vaddr; + dma_map.size = sz; + dma_map.iova = iova; + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + + ret = ioctl(fd, VFIO_IOMMU_MAP_DMA, &dma_map); + if (ret) + printf("Failed to map DMA memory (%s)\n", strerror(errno)); + + return ret; +} + +int dma_unmap_type1(int fd, unsigned long sz, void *vaddr, uint64_t iova) 
+{ + int ret; + struct vfio_iommu_type1_dma_unmap dma_unmap; + + memset(&dma_unmap, 0, sizeof(dma_unmap)); + dma_unmap.argsz = sizeof(dma_unmap); + dma_unmap.size = sz; + dma_unmap.iova = iova; + ret = ioctl(fd, VFIO_IOMMU_UNMAP_DMA, &dma_unmap); + if (ret) + printf("Failed to unmap DMA memory (%s)\n", strerror(errno)); + + ret = munmap(vaddr, (size_t)sz); + if (vaddr == MAP_FAILED) { + printf("Failed to unmap memory\n"); + return -ENOMEM; + } + + return ret; +} + +int vfio_init_dev(int grp, int container, struct vfio_group_status *grp_status, + struct vfio_iommu_type1_info *iommu_info, + struct vfio_device_info *dev_info, + struct vfio_region_info *reg_info, char *grp_uuid) +{ + int ret; + int device; + + ret = ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU); + if (!ret) { + vfio_print_fail(VFIO_CHECK_EXTENSION); + goto out; + } + + /* Test the group is viable and available */ + ret = ioctl(grp, VFIO_GROUP_GET_STATUS, grp_status); + if (ret || !(grp_status->flags & VFIO_GROUP_FLAGS_VIABLE)) { + vfio_print_fail(VFIO_GROUP_GET_STATUS); + goto out; + + } + + ret = ioctl(grp, VFIO_GROUP_SET_CONTAINER, &container); + if (ret) { + vfio_print_fail(VFIO_GROUP_SET_CONTAINER); + printf("Failed to set group container\n"); + goto out; + } + + ret = ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU); + if (ret) { + vfio_print_fail(VFIO_SET_IOMMU); + goto out; + } + + ret = ioctl(container, VFIO_IOMMU_GET_INFO, iommu_info); + if (ret) { + vfio_print_fail(VFIO_IOMMU_GET_INFO); + goto out; + } + + printf("iova_pgsizes bitmask=0x%llx\n", iommu_info->iova_pgsizes); + /* Get a file descriptor for the device */ + device = ioctl(grp, VFIO_GROUP_GET_DEVICE_FD, grp_uuid); + printf("device=%d\n", device); + if (device < 0) { + vfio_print_fail(VFIO_GROUP_GET_DEVICE_FD); + goto out; + } + + /* Test and setup the device */ + ret = ioctl(device, VFIO_DEVICE_GET_INFO, dev_info); + if (ret) { + vfio_print_fail(VFIO_DEVICE_GET_INFO); + goto out; + } + + printf("regions=%d irqs=%d\n", 
dev_info->num_regions, dev_info->num_irqs); + + /* Test and setup the device */ + reg_info->index = VFIO_PCI_NUM_REGIONS + 1; + ret = ioctl(device, VFIO_DEVICE_GET_REGION_INFO, reg_info); + if (ret) { + vfio_print_fail(VFIO_DEVICE_GET_REGION_INFO); + goto out; + } + + //if (!reg_info->size) { + //printf("Region:%d unimplemented PCI BAR\n", i); + //goto out; + //} + +out: + return device; +} diff --git a/drivers/a.out b/drivers/a.out new file mode 100755 index 0000000..24991fb Binary files /dev/null and b/drivers/a.out differ diff --git a/drivers/r8169-orig.c b/drivers/r8169-orig.c new file mode 100644 index 0000000..0aa0b7f --- /dev/null +++ b/drivers/r8169-orig.c @@ -0,0 +1,567 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define COMPILER_BARRIER() asm volatile("" ::: "memory") +#define MEMORY_BARRIER() asm volatile ("mfence" ::: "memory") +#define STORE_BARRIER() asm volatile ("sfence" ::: "memory") +#define LOAD_BARRIER() asm volatile ("lfence" ::: "memory") +#define dma_wmb() STORE_BARRIER() +#define dma_rmb() LOAD_BARRIER() +#define unlikely(x) (x) + +#define cpu_to_le32(x) htole32(x) +#define cpu_to_le64(x) htole64(x) +#define le32_to_cpu(x) le32toh(x) + +typedef unsigned long long u64; +typedef unsigned short u16; + + + + +enum rtl_register_content { + /* InterruptStatusBits */ + SYSErr = 0x8000, + PCSTimeout = 0x4000, + SWInt = 0x0100, + TxDescUnavail = 0x0080, + RxFIFOOver = 0x0040, + LinkChg = 0x0020, + RxOverflow = 0x0010, + TxErr = 0x0008, + TxOK = 0x0004, + RxErr = 0x0002, + RxOK = 0x0001, + + /* RxStatusDesc */ + RxBOVF = (1 << 24), + RxFOVF = (1 << 23), + RxRWT = (1 << 22), + RxRES = (1 << 21), + RxRUNT = (1 << 20), + RxCRC = (1 << 19), + + /* ChipCmdBits */ + StopReq = 0x80, + CmdReset = 0x10, + CmdRxEnb = 0x08, + CmdTxEnb = 0x04, + RxBufEmpty = 0x01, + + /* TXPoll register p.5 */ + HPQ = 0x80, /* Poll cmd on the high prio queue */ + NPQ = 0x40, /* Poll cmd on the low prio queue 
*/ + FSWInt = 0x01, /* Forced software interrupt */ + + /* Cfg9346Bits */ + Cfg9346_Lock = 0x00, + Cfg9346_Unlock = 0xc0, + + /* rx_mode_bits */ + AcceptErr = 0x20, + AcceptRunt = 0x10, + AcceptBroadcast = 0x08, + AcceptMulticast = 0x04, + AcceptMyPhys = 0x02, + AcceptAllPhys = 0x01, +#define RX_CONFIG_ACCEPT_MASK 0x3f + + /* TxConfigBits */ + TxInterFrameGapShift = 24, + TxDMAShift = 8, /* DMA burst value (0-7) is shift this many bits */ + + /* Config1 register p.24 */ + LEDS1 = (1 << 7), + LEDS0 = (1 << 6), + Speed_down = (1 << 4), + MEMMAP = (1 << 3), + IOMAP = (1 << 2), + VPD = (1 << 1), + PMEnable = (1 << 0), /* Power Management Enable */ + + /* Config2 register p. 25 */ + ClkReqEn = (1 << 7), /* Clock Request Enable */ + MSIEnable = (1 << 5), /* 8169 only. Reserved in the 8168. */ + PCI_Clock_66MHz = 0x01, + PCI_Clock_33MHz = 0x00, + + /* Config3 register p.25 */ + MagicPacket = (1 << 5), /* Wake up when receives a Magic Packet */ + LinkUp = (1 << 4), /* Wake up when the cable connection is re-established */ + Jumbo_En0 = (1 << 2), /* 8168 only. Reserved in the 8168b */ + Rdy_to_L23 = (1 << 1), /* L23 Enable */ + Beacon_en = (1 << 0), /* 8168 only. Reserved in the 8168b */ + + /* Config4 register */ + Jumbo_En1 = (1 << 1), /* 8168 only. 
Reserved in the 8168b */ + + /* Config5 register p.27 */ + BWF = (1 << 6), /* Accept Broadcast wakeup frame */ + MWF = (1 << 5), /* Accept Multicast wakeup frame */ + UWF = (1 << 4), /* Accept Unicast wakeup frame */ + Spi_en = (1 << 3), + LanWake = (1 << 1), /* LanWake enable/disable */ + PMEStatus = (1 << 0), /* PME status can be reset by PCI RST# */ + ASPM_en = (1 << 0), /* ASPM enable */ + + /* TBICSR p.28 */ + TBIReset = 0x80000000, + TBILoopback = 0x40000000, + TBINwEnable = 0x20000000, + TBINwRestart = 0x10000000, + TBILinkOk = 0x02000000, + TBINwComplete = 0x01000000, + + /* CPlusCmd p.31 */ + EnableBist = (1 << 15), // 8168 8101 + Mac_dbgo_oe = (1 << 14), // 8168 8101 + Normal_mode = (1 << 13), // unused + Force_half_dup = (1 << 12), // 8168 8101 + Force_rxflow_en = (1 << 11), // 8168 8101 + Force_txflow_en = (1 << 10), // 8168 8101 + Cxpl_dbg_sel = (1 << 9), // 8168 8101 + ASF = (1 << 8), // 8168 8101 + PktCntrDisable = (1 << 7), // 8168 8101 + Mac_dbgo_sel = 0x001c, // 8168 + RxVlan = (1 << 6), + RxChkSum = (1 << 5), + PCIDAC = (1 << 4), + PCIMulRW = (1 << 3), + INTT_0 = 0x0000, // 8168 + INTT_1 = 0x0001, // 8168 + INTT_2 = 0x0002, // 8168 + INTT_3 = 0x0003, // 8168 + + /* rtl8169_PHYstatus */ + TBI_Enable = 0x80, + TxFlowCtrl = 0x40, + RxFlowCtrl = 0x20, + _1000bpsF = 0x10, + _100bps = 0x08, + _10bps = 0x04, + LinkStatus = 0x02, + FullDup = 0x01, + + /* _TBICSRBit */ + TBILinkOK = 0x02000000, + + /* ResetCounterCommand */ + CounterReset = 0x1, + + /* DumpCounterCommand */ + CounterDump = 0x8, + + /* magic enable v2 */ + MagicPacket_v2 = (1 << 16), /* Wake up when receives a Magic Packet */ +}; + +enum rtl_desc_bit { + /* First doubleword. */ + DescOwn = (1 << 31), /* Descriptor is owned by NIC */ + RingEnd = (1 << 30), /* End of descriptor ring */ + FirstFrag = (1 << 29), /* First segment of a packet */ + LastFrag = (1 << 28), /* Final segment of a packet */ +}; + +/* Generic case. */ +enum rtl_tx_desc_bit { + /* First doubleword. 
*/ + TD_LSO = (1 << 27), /* Large Send Offload */ +#define TD_MSS_MAX 0x07ffu /* MSS value */ + + /* Second doubleword. */ + TxVlanTag = (1 << 17), /* Add VLAN tag */ +}; + +/* 8169, 8168b and 810x except 8102e. */ +enum rtl_tx_desc_bit_0 { + /* First doubleword. */ +#define TD0_MSS_SHIFT 16 /* MSS position (11 bits) */ + TD0_TCP_CS = (1 << 16), /* Calculate TCP/IP checksum */ + TD0_UDP_CS = (1 << 17), /* Calculate UDP/IP checksum */ + TD0_IP_CS = (1 << 18), /* Calculate IP checksum */ +}; + +/* 8102e, 8168c and beyond. */ +enum rtl_tx_desc_bit_1 { + /* First doubleword. */ + TD1_GTSENV4 = (1 << 26), /* Giant Send for IPv4 */ + TD1_GTSENV6 = (1 << 25), /* Giant Send for IPv6 */ +#define GTTCPHO_SHIFT 18 +#define GTTCPHO_MAX 0x7fU + + /* Second doubleword. */ +#define TCPHO_SHIFT 18 +#define TCPHO_MAX 0x3ffU +#define TD1_MSS_SHIFT 18 /* MSS position (11 bits) */ + TD1_IPv6_CS = (1 << 28), /* Calculate IPv6 checksum */ + TD1_IPv4_CS = (1 << 29), /* Calculate IPv4 checksum */ + TD1_TCP_CS = (1 << 30), /* Calculate TCP/IP checksum */ + TD1_UDP_CS = (1 << 31), /* Calculate UDP/IP checksum */ +}; + +enum rtl_rx_desc_bit { + /* Rx private */ + PID1 = (1 << 18), /* Protocol ID bit 1/2 */ + PID0 = (1 << 17), /* Protocol ID bit 0/2 */ + +#define RxProtoUDP (PID1) +#define RxProtoTCP (PID0) +#define RxProtoIP (PID1 | PID0) +#define RxProtoMask RxProtoIP + + IPFail = (1 << 16), /* IP checksum failed */ + UDPFail = (1 << 15), /* UDP/IP checksum failed */ + TCPFail = (1 << 14), /* TCP/IP checksum failed */ + RxVlanTag = (1 << 16), /* VLAN tag available */ +}; + +#define RsvdMask 0x3fffc000 + +struct TxDesc { + __le32 opts1; + __le32 opts2; + __le64 addr; +}; + +struct RxDesc { + __le32 opts1; + __le32 opts2; + __le64 addr; +}; + +char* buffers[256]; +typedef unsigned long dma_addr_t; +typedef unsigned int u32; + +int print_packet(unsigned char* buffer) +{ + int i; + //unsigned int* b = (unsigned int*)buffer; + printf("%02x:%02x:%02x:%02x:%02x:%02x -> 
%02x:%02x:%02x:%02x:%02x:%02x [%04x]: ", + buffer[6], buffer[7], buffer[8], buffer[9], buffer[10], buffer[11], + buffer[0], buffer[1], buffer[2], buffer[3], buffer[4], buffer[5], + be16toh(*((u16*)(&buffer[12]))) + ); + for (i = 14; i < 32; i++) { + printf("%02x", buffer[i]); + } +} + + +typedef struct iomem { + u64 vaddr; + u64 iova; + u64 size; +} iomem; + +#define IOMEM_CHUNKS 4096 +iomem iomemArray[IOMEM_CHUNKS]; +int iomem_count; +u64 iomem_base; +u64 iomem_current; + +int iomem_init(void) +{ + void* tmp; + iomem_count = 0; + memset(iomemArray, 0, sizeof(iomemArray)); + iomem_base = 1 * 1024ULL * 1024ULL * 1024ULL * 1024ULL; + + iomem_current = iomem_base; + + /* reserve a 4GB contiguous address space and position it, if possible at 8GB */ + /* no pages are actually allocated and mapped into this address space */ + /* it is just making sure that overtime, we'll have 4GB contiguous */ + tmp = mmap( + (void*)iomem_base, + 4 * 1024ULL * 1024ULL * 1024ULL, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS | MAP_NORESERVE, + -1, 0 + ); + if (tmp == NULL) { + printf("Could not reserve a contiguous 4GB address space\n"); + return -1; + } + iomem_base = (u64)tmp; + iomem_current = iomem_base; +} + +struct iomem* iomem_alloc(int device, unsigned int size) +{ + void* tmp; + int ret; + u64 location; + struct vfio_iommu_type1_dma_map dma_map = { .argsz = sizeof(dma_map) }; + + if (size >= 32 * 1024 * 1024) return NULL; + if ((size & 0xFFF) != 0) return NULL; /* size should be a 4K aligned quantity */ + if (iomem_count >= IOMEM_CHUNKS) return NULL; + + /* get a portion of the 4GB window created at init time */ + tmp = mmap( + (void*)iomem_current, + size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS | MAP_NORESERVE | MAP_FIXED, + -1, 0 + ); + if (tmp == NULL) return NULL; + if (iomem_base == 0) { + iomem_base = (u64)tmp; + iomem_current = iomem_base + size; + location = iomem_base; + } + else { + location = iomem_current; + iomem_current += size; + } + + 
iomemArray[iomem_count].vaddr = location; + iomemArray[iomem_count].size = size; + + dma_map.vaddr = iomemArray[iomem_count].vaddr; + dma_map.size = iomemArray[iomem_count].size; + + ret = ioctl(device, VFIO_IOMMU_MAP_DMA, &dma_map); + if (ret != 0) return NULL; + /* the kernel has filled dma_map.iova with the corresponding allocated IOVA */ + iomemArray[iomem_count].iova = dma_map.iova; + + printf("iomem_alloc: VA(%llx) -> physmem(%dKB) <- IOVA(%llx)\n", + iomemArray[iomem_count].vaddr, size/1024, iomemArray[iomem_count].iova + ); + + return &iomemArray[iomem_count++]; +} + + +static inline void rtl8169_mark_to_asic(struct RxDesc *desc, u32 rx_buf_sz) +{ + u32 eor = le32_to_cpu(desc->opts1) & RingEnd; + + /* Force memory writes to complete before releasing descriptor */ + dma_wmb(); + + desc->opts1 = cpu_to_le32(DescOwn | eor | rx_buf_sz); +} + +static inline void rtl8169_map_to_asic(struct RxDesc *desc, dma_addr_t mapping, + u32 rx_buf_sz) +{ + desc->addr = cpu_to_le64(mapping); + rtl8169_mark_to_asic(desc, rx_buf_sz); +} + +char* rxBuffers[256]; + +static inline void rtl8169_mark_as_last_descriptor(struct RxDesc *desc) +{ + desc->opts1 |= cpu_to_le32(RingEnd); +} + +int rtl8169_rx_fill(int device, struct RxDesc* rxRing) +{ + int i; + struct iomem* packetArea; + + packetArea = iomem_alloc(device, 2 * 1024 * 1024); + if (packetArea == NULL) return -1; + + for (i = 0; i < 256; i++) + { + rtl8169_map_to_asic(&rxRing[i], packetArea->iova + i * 2048, 2048); + rxBuffers[i] = (char*)(packetArea->vaddr + i * 2048); + } + rtl8169_mark_as_last_descriptor(&rxRing[255]); + return 0; +} + +int main(int argc, char* argv[]) +{ + int container, group, parent, device, i; + + struct vfio_group_status group_status = { .argsz = sizeof(group_status) }; + struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) }; + struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; + struct vfio_region_info region_info = { .argsz = sizeof(region_info) }; + struct 
RxDesc* rxRing; + + iomem_init(); + + /* Create a new container */ + container = open("/dev/vfio/vfio", O_RDWR); + + if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) { + printf("Unknown API version\n"); + return -1; + } + + if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) { + printf("Doesn't support the IOMMU driver we want\n"); + return -1; + } + + group = open(argv[1], O_RDWR); + /* Test the group is viable and available */ + ioctl(group, VFIO_GROUP_GET_STATUS, &group_status); + + if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { + printf(" Group is not viable\n"); + return -1; + } + + ioctl(group, VFIO_GROUP_SET_CONTAINER, &container); + + ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU); + + ioctl(container, VFIO_IOMMU_GET_INFO, &iommu_info); + + printf("iova_pgsizes bitmask=0x%llx\n", iommu_info.iova_pgsizes); + /* Get a file descriptor for the device */ + printf("AAAA %s\n", argv[2]); + device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, argv[2]); + + printf("device=%d\n", device); + if (device <= 0) return -3; + + /* Test and setup the device */ + ioctl(device, VFIO_DEVICE_GET_INFO, &device_info); + + printf("regions=%d irqs=%d\n", device_info.num_regions, device_info.num_irqs); + + /* Test and setup the device */ + ioctl(device, VFIO_DEVICE_GET_INFO, &device_info); + + region_info.index = VFIO_PCI_NUM_REGIONS + 1; + if (ioctl(device, VFIO_DEVICE_GET_REGION_INFO, ®ion_info)) + return -1; + if (!region_info.size) { + //printf("Region:%d unimplemented PCI BAR\n", i); + return -2; + } + + printf("Region:%d size %llu, offset 0x%llx, flags 0x%x\n", i, + region_info.size, + region_info.offset, region_info.flags); + + rxRing = mmap(NULL, region_info.size, PROT_READ | PROT_WRITE, + MAP_SHARED, device, region_info.offset); + + if (rtl8169_rx_fill(device, rxRing) != 0) { + printf("Could not fill ring\n"); + return -1; + } + + /* signal ready */ + ioctl(device, 500, NULL); + + i = 0; + while (1) + { + if (i >= 256) i = 0; + for (; i < 
256; i++) + { + u32 status; + + status = le32_to_cpu(rxRing[i].opts1) & ~0; /// either ~(RxBOVF | RxFOVF) or ~0; + + if (status & DescOwn) { + usleep(100*1000); + break; + } + + /* This barrier is needed to keep us from reading + * any other fields out of the Rx descriptor until + * we know the status of DescOwn + */ + dma_rmb(); + + if (unlikely(status & RxRES)) { + printf("Rx ERROR. status = %08x\n",status); + /* + dev->stats.rx_errors++; + if (status & (RxRWT | RxRUNT)) + dev->stats.rx_length_errors++; + if (status & RxCRC) + dev->stats.rx_crc_errors++; + if (status & RxFOVF) { + rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING); + dev->stats.rx_fifo_errors++; + } + */ + if ((status & (RxRUNT | RxCRC)) && + !(status & (RxRWT | RxFOVF)) + /* && (dev->features & NETIF_F_RXALL) */ + ) + goto process_pkt; + } + else { + //dma_addr_t addr; + int pkt_size; + + process_pkt: + //addr = le64_to_cpu(rxRing[i].addr); + if (1) // likely(!(dev->features & NETIF_F_RXFCS))) + pkt_size = (status & 0x00003fff) - 4; + else + pkt_size = status & 0x00003fff; + + /* + * The driver does not support incoming fragmented + * frames. They are seen as a symptom of over-mtu + * sized frames. 
+ */ + /* + if (unlikely(rtl8169_fragmented_frame(status))) { + dev->stats.rx_dropped++; + dev->stats.rx_length_errors++; + goto release_descriptor; + } + + skb = rtl8169_try_rx_copy(tp->Rx_databuff[entry], + tp, pkt_size, addr); + if (!skb) { + dev->stats.rx_dropped++; + goto release_descriptor; + } + + rtl8169_rx_csum(skb, status); + skb_put(skb, pkt_size); + skb->protocol = eth_type_trans(skb, dev); + + rtl8169_rx_vlan_tag(desc, skb); + + if (skb->pkt_type == PACKET_MULTICAST) + dev->stats.multicast++; + + napi_gro_receive(&tp->napi, skb); + + u64_stats_update_begin(&tp->rx_stats.syncp); + tp->rx_stats.packets++; + tp->rx_stats.bytes += pkt_size; + u64_stats_update_end(&tp->rx_stats.syncp); + */ + printf("desc[%03d]: size=%5d ", i, pkt_size); + print_packet(rxBuffers[i]); + printf("\n"); + } + release_descriptor: + rxRing[i].opts2 = 0; + rtl8169_mark_to_asic(&rxRing[i], 2048); + } + + } + + +} diff --git a/drivers/r8169.c b/drivers/r8169.c new file mode 100644 index 0000000..64b1d3a --- /dev/null +++ b/drivers/r8169.c @@ -0,0 +1,312 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include + +/* Us */ +#include +#include + +typedef unsigned long dma_addr_t; +char *rxBuffers[256]; + +static void print_packet(unsigned char *buffer) +{ + int i; + //unsigned int* b = (unsigned int*)buffer; + printf("%02x:%02x:%02x:%02x:%02x:%02x -> %02x:%02x:%02x:%02x:%02x:%02x [%04x]:", + buffer[6], buffer[7], buffer[8], buffer[9], buffer[10], buffer[11], + buffer[0], buffer[1], buffer[2], buffer[3], buffer[4], buffer[5], + be16toh(*((__u16*)(&buffer[12]))) + ); + + for (i = 14; i < 32; i++) { + printf("%02x", buffer[i]); + } +} + +#define IOMEM_CHUNKS 4096 +iomem iomem_array[IOMEM_CHUNKS]; +int iomem_count; +__u64 iomem_base; +__u64 iomem_current; + +int iomem_init(void) +{ + void *tmp; + iomem_count = 0; + memset(iomem_array, 0, sizeof(iomem_array)); + iomem_base = 1 * 1024ULL * 1024ULL * 1024ULL * 1024ULL; + + 
iomem_current = iomem_base; + /* reserve a 4GB contiguous address space and position it, if possible at 8GB */ + /* no pages are actually allocated and mapped into this address space */ + /* it is just making sure that overtime, we'll have 4GB contiguous */ + tmp = mmap((void*)iomem_base, 4 * 1024ULL * 1024ULL * 1024ULL, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS | + MAP_NORESERVE, -1, 0); + if (tmp == MAP_FAILED) { + printf("Could not reserve a contiguous 4GB address space\n"); + return -1; + } + iomem_base = (__u64)tmp; + iomem_current = iomem_base; + + return 0; +} + +struct iomem *iomem_alloc(int device, unsigned int size) +{ + void *tmp; + int ret; + __u64 location; + struct vfio_iommu_type1_dma_map dma_map = { .argsz = sizeof(dma_map) }; + + if (size >= 32 * 1024 * 1024) + return NULL; + if ((size & 0xFFF) != 0) + return NULL; /* size should be a 4K aligned quantity */ + if (iomem_count >= IOMEM_CHUNKS) + return NULL; + + /* get a portion of the 4GB window created at init time */ + tmp = mmap((void*)iomem_current, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS | MAP_NORESERVE | MAP_FIXED, -1, + 0); + if (tmp == MAP_FAILED) + return NULL; + if (iomem_base == 0) { + iomem_base = (__u64)tmp; + iomem_current = iomem_base + size; + location = iomem_base; + } else { + location = iomem_current; + iomem_current += size; + } + + iomem_array[iomem_count].vaddr = location; + iomem_array[iomem_count].size = size; + + dma_map.vaddr = iomem_array[iomem_count].vaddr; + dma_map.size = iomem_array[iomem_count].size; + + /* kernel driver fills dma_map.iova with the proper allocated IOVA */ + ret = ioctl(device, VFIO_IOMMU_MAP_DMA, &dma_map); + if (ret != 0) + return NULL; + iomem_array[iomem_count].iova = dma_map.iova; + + printf("iomem_alloc: VA(%llx) -> physmem(%dKB) <- IOVA(%llx)\n", + iomem_array[iomem_count].vaddr, size/1024, + iomem_array[iomem_count].iova); + + return &iomem_array[iomem_count++]; +} + +static inline void rtl8169_mark_to_asic(struct 
RxDesc *desc, __u32 rx_buf_sz) +{ + __u32 eor = le32_to_cpu(desc->opts1) & RingEnd; + + /* Force memory writes to complete before releasing descriptor */ + dma_wmb(); + + desc->opts1 = cpu_to_le32(DescOwn | eor | rx_buf_sz); +} + +static inline void rtl8169_map_to_asic(struct RxDesc *desc, dma_addr_t mapping, + __u32 rx_buf_sz) +{ + desc->addr = cpu_to_le64(mapping); + rtl8169_mark_to_asic(desc, rx_buf_sz); +} + + +static inline void rtl8169_mark_as_last_descriptor(struct RxDesc *desc) +{ + desc->opts1 |= cpu_to_le32(RingEnd); +} + +int rtl8169_rx_fill(int device, struct RxDesc* rxRing) +{ + int i; + struct iomem *packetArea; + + packetArea = iomem_alloc(device, 2 * 1024 * 1024); + if (packetArea == NULL) + return -1; + + for (i = 0; i < NUM_RX_DESC; i++) { + rtl8169_map_to_asic(&rxRing[i], packetArea->iova + i * 2048, + 2048); + rxBuffers[i] = (char*)(packetArea->vaddr + i * 2048); + } + rtl8169_mark_as_last_descriptor(&rxRing[255]); + + return 0; +} + +void usage(char *name) +{ + printf("usage: %s \n", name); +} + +int main(int argc, char* argv[]) +{ + int container, group, device, i = 0; + int group_id; + char group_uuid[128]; + struct vfio_group_status group_status = { .argsz = sizeof(group_status) }; + struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) }; + struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; + struct vfio_region_info region_info = { .argsz = sizeof(region_info) }; + struct RxDesc* rxRing; + + if (argc != 3) { + usage(argv[0]); + return -1; + } + iomem_init(); + + /* Create a new container */ + container = get_container(); + + if (container < 0) + goto out; + + group_id = atoi(argv[1]); + group = get_group(group_id); + if (group < 0) + goto out; + + strncpy(group_uuid, argv[2], sizeof(group_uuid)); + device = vfio_init_dev(group, container, &group_status, &iommu_info, + &device_info, ®ion_info, group_uuid); + + printf("Region:%d size %llu, offset 0x%llx, flags 0x%x\n", i, + region_info.size, 
region_info.offset, region_info.flags); + + rxRing = mmap(NULL, region_info.size, PROT_READ | PROT_WRITE, + MAP_SHARED, device, region_info.offset); + if (rxRing == MAP_FAILED) { + printf("Could not reserve on contiguous 4GB address space\n"); + return -1; + } + + if (rtl8169_rx_fill(device, rxRing) != 0) { + printf("Could not fill ring\n"); + return -1; + } + + /* signal ready */ + ioctl(device, 500, NULL); + i = 0; + while (1) + { + if (i >= NUM_RX_DESC) + i = 0; + for (; i < NUM_RX_DESC; i++) + { + __u32 status; + + status = le32_to_cpu(rxRing[i].opts1) & ~0; /// either ~(RxBOVF | RxFOVF) or ~0; + + if (status & DescOwn) { + usleep(100*1000); + break; + } + + /* This barrier is needed to keep us from reading + * any other fields out of the Rx descriptor until + * we know the status of DescOwn + */ + dma_rmb(); + + if (unlikely(status & RxRES)) { + printf("Rx ERROR. status = %08x\n",status); + /* + dev->stats.rx_errors++; + if (status & (RxRWT | RxRUNT)) + dev->stats.rx_length_errors++; + if (status & RxCRC) + dev->stats.rx_crc_errors++; + if (status & RxFOVF) { + rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING); + dev->stats.rx_fifo_errors++; + } + */ + if ((status & (RxRUNT | RxCRC)) && + !(status & (RxRWT | RxFOVF)) + /* && (dev->features & NETIF_F_RXALL) */ + ) + goto process_pkt; + } + else { + //dma_addr_t addr; + int pkt_size; + process_pkt: + //addr = le64_to_cpu(rxRing[i].addr); + if (1) // likely(!(dev->features & NETIF_F_RXFCS))) + pkt_size = (status & 0x00003fff) - 4; + else + pkt_size = status & 0x00003fff; + + /* + * The driver does not support incoming fragmented + * frames. They are seen as a symptom of over-mtu + * sized frames. 
+ */ + /* + if (unlikely(rtl8169_fragmented_frame(status))) { + dev->stats.rx_dropped++; + dev->stats.rx_length_errors++; + goto release_descriptor; + } + + skb = rtl8169_try_rx_copy(tp->Rx_databuff[entry], + tp, pkt_size, addr); + if (!skb) { + dev->stats.rx_dropped++; + goto release_descriptor; + } + + rtl8169_rx_csum(skb, status); + skb_put(skb, pkt_size); + skb->protocol = eth_type_trans(skb, dev); + + rtl8169_rx_vlan_tag(desc, skb); + + if (skb->pkt_type == PACKET_MULTICAST) + dev->stats.multicast++; + + napi_gro_receive(&tp->napi, skb); + + __u64_stats_update_begin(&tp->rx_stats.syncp); + tp->rx_stats.packets++; + tp->rx_stats.bytes += pkt_size; + __u64_stats_update_end(&tp->rx_stats.syncp); + */ + printf("desc[%03d]: size= %5d ", i, pkt_size); + print_packet((unsigned char *)rxBuffers[i]); + printf("\n"); + } + /*release_descriptor: */ + rxRing[i].opts2 = 0; + rtl8169_mark_to_asic(&rxRing[i], 2048); + } + + } + +out: + close(group); + close(container); + return -1; + +} diff --git a/include/drivers/r8169.h b/include/drivers/r8169.h new file mode 100644 index 0000000..1b125f2 --- /dev/null +++ b/include/drivers/r8169.h @@ -0,0 +1,235 @@ +#ifndef _R8169_H_ +#define _R8169_H_ +#define COMPILER_BARRIER() asm volatile("" ::: "memory") +#define MEMORY_BARRIER() asm volatile ("mfence" ::: "memory") +#define STORE_BARRIER() asm volatile ("sfence" ::: "memory") +#define LOAD_BARRIER() asm volatile ("lfence" ::: "memory") +#define dma_wmb() STORE_BARRIER() +#define dma_rmb() LOAD_BARRIER() +#define unlikely(x) (x) + +#define cpu_to_le32(x) htole32(x) +#define cpu_to_le64(x) htole64(x) +#define le32_to_cpu(x) le32toh(x) + +#define NUM_TX_DESC 64 /* Number of Tx descriptor registers */ +#define NUM_RX_DESC 256U /* Number of Rx descriptor registers */ +#define R8169_TX_RING_BYTES (NUM_TX_DESC * sizeof(struct TxDesc)) +#define R8169_RX_RING_BYTES (NUM_RX_DESC * sizeof(struct RxDesc)) + +typedef struct iomem { + __u64 vaddr; + __u64 iova; + __u64 size; +} iomem; + +/* 
drivers/ethernet/realtek/r8169.c */ +enum rtl_register_content { + /* InterruptStatusBits */ + SYSErr = 0x8000, + PCSTimeout = 0x4000, + SWInt = 0x0100, + TxDescUnavail = 0x0080, + RxFIFOOver = 0x0040, + LinkChg = 0x0020, + RxOverflow = 0x0010, + TxErr = 0x0008, + TxOK = 0x0004, + RxErr = 0x0002, + RxOK = 0x0001, + + /* RxStatusDesc */ + RxBOVF = (1 << 24), + RxFOVF = (1 << 23), + RxRWT = (1 << 22), + RxRES = (1 << 21), + RxRUNT = (1 << 20), + RxCRC = (1 << 19), + + /* ChipCmdBits */ + StopReq = 0x80, + CmdReset = 0x10, + CmdRxEnb = 0x08, + CmdTxEnb = 0x04, + RxBufEmpty = 0x01, + + /* TXPoll register p.5 */ + HPQ = 0x80, /* Poll cmd on the high prio queue */ + NPQ = 0x40, /* Poll cmd on the low prio queue */ + FSWInt = 0x01, /* Forced software interrupt */ + + /* Cfg9346Bits */ + Cfg9346_Lock = 0x00, + Cfg9346_Unlock = 0xc0, + + /* rx_mode_bits */ + AcceptErr = 0x20, + AcceptRunt = 0x10, + AcceptBroadcast = 0x08, + AcceptMulticast = 0x04, + AcceptMyPhys = 0x02, + AcceptAllPhys = 0x01, +#define RX_CONFIG_ACCEPT_MASK 0x3f + + /* TxConfigBits */ + TxInterFrameGapShift = 24, + TxDMAShift = 8, /* DMA burst value (0-7) is shift this many bits */ + + /* Config1 register p.24 */ + LEDS1 = (1 << 7), + LEDS0 = (1 << 6), + Speed_down = (1 << 4), + MEMMAP = (1 << 3), + IOMAP = (1 << 2), + VPD = (1 << 1), + PMEnable = (1 << 0), /* Power Management Enable */ + + /* Config2 register p. 25 */ + ClkReqEn = (1 << 7), /* Clock Request Enable */ + MSIEnable = (1 << 5), /* 8169 only. Reserved in the 8168. */ + PCI_Clock_66MHz = 0x01, + PCI_Clock_33MHz = 0x00, + + /* Config3 register p.25 */ + MagicPacket = (1 << 5), /* Wake up when receives a Magic Packet */ + LinkUp = (1 << 4), /* Wake up when the cable connection is re-established */ + Jumbo_En0 = (1 << 2), /* 8168 only. Reserved in the 8168b */ + Rdy_to_L23 = (1 << 1), /* L23 Enable */ + Beacon_en = (1 << 0), /* 8168 only. Reserved in the 8168b */ + + /* Config4 register */ + Jumbo_En1 = (1 << 1), /* 8168 only. 
Reserved in the 8168b */ + + /* Config5 register p.27 */ + BWF = (1 << 6), /* Accept Broadcast wakeup frame */ + MWF = (1 << 5), /* Accept Multicast wakeup frame */ + UWF = (1 << 4), /* Accept Unicast wakeup frame */ + Spi_en = (1 << 3), + LanWake = (1 << 1), /* LanWake enable/disable */ + PMEStatus = (1 << 0), /* PME status can be reset by PCI RST# */ + ASPM_en = (1 << 0), /* ASPM enable */ + + /* TBICSR p.28 */ + TBIReset = 0x80000000, + TBILoopback = 0x40000000, + TBINwEnable = 0x20000000, + TBINwRestart = 0x10000000, + TBILinkOk = 0x02000000, + TBINwComplete = 0x01000000, + + /* CPlusCmd p.31 */ + EnableBist = (1 << 15), // 8168 8101 + Mac_dbgo_oe = (1 << 14), // 8168 8101 + Normal_mode = (1 << 13), // unused + Force_half_dup = (1 << 12), // 8168 8101 + Force_rxflow_en = (1 << 11), // 8168 8101 + Force_txflow_en = (1 << 10), // 8168 8101 + Cxpl_dbg_sel = (1 << 9), // 8168 8101 + ASF = (1 << 8), // 8168 8101 + PktCntrDisable = (1 << 7), // 8168 8101 + Mac_dbgo_sel = 0x001c, // 8168 + RxVlan = (1 << 6), + RxChkSum = (1 << 5), + PCIDAC = (1 << 4), + PCIMulRW = (1 << 3), + INTT_0 = 0x0000, // 8168 + INTT_1 = 0x0001, // 8168 + INTT_2 = 0x0002, // 8168 + INTT_3 = 0x0003, // 8168 + + /* rtl8169_PHYstatus */ + TBI_Enable = 0x80, + TxFlowCtrl = 0x40, + RxFlowCtrl = 0x20, + _1000bpsF = 0x10, + _100bps = 0x08, + _10bps = 0x04, + LinkStatus = 0x02, + FullDup = 0x01, + + /* _TBICSRBit */ + TBILinkOK = 0x02000000, + + /* ResetCounterCommand */ + CounterReset = 0x1, + + /* DumpCounterCommand */ + CounterDump = 0x8, + + /* magic enable v2 */ + MagicPacket_v2 = (1 << 16), /* Wake up when receives a Magic Packet */ +}; + +enum rtl_desc_bit { + /* First doubleword. */ + DescOwn = (1 << 31), /* Descriptor is owned by NIC */ + RingEnd = (1 << 30), /* End of descriptor ring */ + FirstFrag = (1 << 29), /* First segment of a packet */ + LastFrag = (1 << 28), /* Final segment of a packet */ +}; + +/* Generic case. */ +enum rtl_tx_desc_bit { + /* First doubleword. 
*/ + TD_LSO = (1 << 27), /* Large Send Offload */ +#define TD_MSS_MAX 0x07ffu /* MSS value */ + + /* Second doubleword. */ + TxVlanTag = (1 << 17), /* Add VLAN tag */ +}; + +/* 8169, 8168b and 810x except 8102e. */ +enum rtl_tx_desc_bit_0 { + /* First doubleword. */ +#define TD0_MSS_SHIFT 16 /* MSS position (11 bits) */ + TD0_TCP_CS = (1 << 16), /* Calculate TCP/IP checksum */ + TD0_UDP_CS = (1 << 17), /* Calculate UDP/IP checksum */ + TD0_IP_CS = (1 << 18), /* Calculate IP checksum */ +}; + +/* 8102e, 8168c and beyond. */ +enum rtl_tx_desc_bit_1 { + /* First doubleword. */ + TD1_GTSENV4 = (1 << 26), /* Giant Send for IPv4 */ + TD1_GTSENV6 = (1 << 25), /* Giant Send for IPv6 */ +#define GTTCPHO_SHIFT 18 +#define GTTCPHO_MAX 0x7fU + + /* Second doubleword. */ +#define TCPHO_SHIFT 18 +#define TCPHO_MAX 0x3ffU +#define TD1_MSS_SHIFT 18 /* MSS position (11 bits) */ + TD1_IPv6_CS = (1 << 28), /* Calculate IPv6 checksum */ + TD1_IPv4_CS = (1 << 29), /* Calculate IPv4 checksum */ + TD1_TCP_CS = (1 << 30), /* Calculate TCP/IP checksum */ + TD1_UDP_CS = (1 << 31), /* Calculate UDP/IP checksum */ +}; + +enum rtl_rx_desc_bit { + /* Rx private */ + PID1 = (1 << 18), /* Protocol ID bit 1/2 */ + PID0 = (1 << 17), /* Protocol ID bit 0/2 */ +#define RxProtoUDP (PID1) +#define RxProtoTCP (PID0) +#define RxProtoIP (PID1 | PID0) +#define RxProtoMask RxProtoIP + IPFail = (1 << 16), /* IP checksum failed */ + UDPFail = (1 << 15), /* UDP/IP checksum failed */ + TCPFail = (1 << 14), /* TCP/IP checksum failed */ + RxVlanTag = (1 << 16), /* VLAN tag available */ +}; + +#define RsvdMask 0x3fffc000 + +struct TxDesc { + __le32 opts1; + __le32 opts2; + __le64 addr; +}; + +struct RxDesc { + __le32 opts1; + __le32 opts2; + __le64 addr; +}; +#endif diff --git a/include/vfio_api.h b/include/vfio_api.h new file mode 100644 index 0000000..7485e67 --- /dev/null +++ b/include/vfio_api.h @@ -0,0 +1,11 @@ +#ifndef VFIO_API_H +#define VFIO_API_H +int dma_map_type1(int fd, unsigned long sz, void **vaddr, 
uint64_t iova); +int dma_unmap_type1(int fd, unsigned long sz, void *vaddr, uint64_t iova); +int get_group(int grp_id); +int get_container(void); +int vfio_init_dev(int grp, int container, struct vfio_group_status *grp_status, + struct vfio_iommu_type1_info *iommu_info, + struct vfio_device_info *dev_info, + struct vfio_region_info *reg_info, char *group_uuid); +#endif diff --git a/patches/vf-netmdev.patch b/patches/vf-netmdev.patch new file mode 100644 index 0000000..6a2760b --- /dev/null +++ b/patches/vf-netmdev.patch @@ -0,0 +1,642 @@ +diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c +index e03fcf9..1d37643 100644 +--- a/drivers/net/ethernet/realtek/r8169.c ++++ b/drivers/net/ethernet/realtek/r8169.c +@@ -33,6 +33,13 @@ + #include + #include + ++#if defined(VFIO_MDEV_DEVICE) || defined(VFIO_MDEV_DEVICE_MODULE) ++#include ++#include ++#include ++#include ++#endif ++ + #define RTL8169_VERSION "2.3LK-NAPI" + #define MODULENAME "r8169" + #define PFX MODULENAME ": " +@@ -7393,6 +7400,11 @@ static int rtl_rx(struct net_device *dev, struct rtl8169_private *tp, u32 budget + unsigned int cur_rx, rx_left; + unsigned int count; + ++#if defined(VFIO_MDEV_DEVICE) || defined(VFIO_MDEV_DEVICE_MODULE) ++ if (dev->priv_flags & IFF_VFNETDEV) ++ return budget ; ++#endif ++ + cur_rx = tp->cur_rx; + + for (rx_left = min(budget, NUM_RX_DESC); rx_left > 0; rx_left--, cur_rx++) { +@@ -7577,6 +7589,11 @@ static int rtl8169_poll(struct napi_struct *napi, int budget) + int work_done= 0; + u16 status; + ++#if defined(VFIO_MDEV_DEVICE) || defined(VFIO_MDEV_DEVICE_MODULE) ++ if (dev->priv_flags & IFF_VFNETDEV) ++ return budget; ++#endif ++ + status = rtl_get_events(tp); + rtl_ack_events(tp, status & ~tp->event_slow); + +@@ -7700,11 +7717,19 @@ static int rtl_open(struct net_device *dev) + if (!tp->TxDescArray) + goto err_pm_runtime_put; + ++#if defined(VFIO_MDEV_DEVICE) || defined(VFIO_MDEV_DEVICE_MODULE) ++ printk(KERN_INFO"TxDescArray @%p (%llx), 
virt_tophys=%llx\n", ++ tp->TxDescArray, tp->TxPhyAddr, virt_to_phys(tp->TxDescArray)); ++#endif + tp->RxDescArray = dma_alloc_coherent(&pdev->dev, R8169_RX_RING_BYTES, + &tp->RxPhyAddr, GFP_KERNEL); + if (!tp->RxDescArray) + goto err_free_tx_0; + ++#if defined(VFIO_MDEV_DEVICE) || defined(VFIO_MDEV_DEVICE_MODULE) ++ printk(KERN_INFO"RxDescArray KVA(@%p) -> PA(%llx) <- IOVA(%llx)\n", ++ tp->RxDescArray, virt_to_phys(tp->RxDescArray), tp->RxPhyAddr); ++#endif + retval = rtl8169_init_ring(dev); + if (retval < 0) + goto err_free_rx_1; +@@ -8008,6 +8033,10 @@ static void rtl_remove_one(struct pci_dev *pdev) + struct net_device *dev = pci_get_drvdata(pdev); + struct rtl8169_private *tp = netdev_priv(dev); + ++#if defined(VFIO_MDEV_DEVICE) || defined(VFIO_MDEV_DEVICE_MODULE) ++ mdev_unregister_device(&pdev->dev); ++#endif ++ + if ((tp->mac_version == RTL_GIGA_MAC_VER_27 || + tp->mac_version == RTL_GIGA_MAC_VER_28 || + tp->mac_version == RTL_GIGA_MAC_VER_31 || +@@ -8191,6 +8220,542 @@ static void rtl_hw_initialize(struct rtl8169_private *tp) + } + } + ++ ++#if defined(VFIO_MDEV_DEVICE) || defined(VFIO_MDEV_DEVICE_MODULE) ++struct iovamap { ++ u64 iova; ++ void *vaddr; ++ struct device *dev; ++ u32 size:25; /* maximum of 32MB */ ++ u32 direction:2; /* DMA_FROM_DEVICE... 
*/ ++}; ++ ++typedef struct netmdev { ++ union { ++ char page0[4096]; ++ struct { ++ struct net_device* netdev; ++ /* FIXME USE A LINKED LIST */ ++ int mappings_count; ++ struct iovamap mappings[128]; /* 3.5KB */ ++ }; ++ }; ++ union { ++ /* shadow features & statistics page */ ++ char page1[4096]; ++ struct { ++ netdev_features_t features; ++ netdev_features_t hw_features; ++ netdev_features_t wanted_features; ++ netdev_features_t vlan_features; ++ netdev_features_t hw_enc_features; ++ netdev_features_t mpls_features; ++ netdev_features_t gso_partial_features; ++ struct net_device_stats stats; ++ atomic_long_t rx_dropped; ++ atomic_long_t tx_dropped; ++ atomic_long_t rx_nohandler; ++ }; ++ }; ++} netmdev; ++ ++/* ++SYSFS structure for the controlling device ++*/ ++ ++static ssize_t available_instances_show(struct kobject *kobj, struct device *dev, ++ char *buf) ++{ ++ return scnprintf(buf, PAGE_SIZE, "%d\n", 1); ++} ++static MDEV_TYPE_ATTR_RO(available_instances); ++ ++static ssize_t device_api_show(struct kobject *kobj, struct device *dev, ++ char *buf) ++{ ++ return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); ++} ++static MDEV_TYPE_ATTR_RO(device_api); ++ ++static struct attribute *sysfs_vfnetdev_attributes[] = { ++ &mdev_type_attr_device_api.attr, ++ &mdev_type_attr_available_instances.attr, ++ NULL, ++}; ++ ++static struct attribute_group sysfs_vfnetdev_type = { ++ .name = "vfnetdev", ++ .attrs = sysfs_vfnetdev_attributes, ++}; ++ ++/* Only 1 supported for now */ ++static struct attribute_group *sysfs_type_list[] = { ++ &sysfs_vfnetdev_type, ++ NULL ++}; ++ ++/* ++ * libraries ++ */ ++static struct net_device *netmdev_get_netdev(struct mdev_device* mdev) ++{ ++ struct netmdev *netmdev; ++ ++ netmdev = mdev_get_drvdata(mdev); ++ if (!netmdev) ++ return NULL; ++ ++ return netmdev->netdev; ++} ++ ++static void r8169_pause_datapath(struct net_device* netdev) ++{ ++ //void __iomem *ioaddr; ++ struct rtl8169_private *tp; ++ ++ if (!netdev) ++ return; ++ tp = 
netdev_priv(netdev); ++ if (!tp) ++ return; ++ ++ //ioaddr = tp->mmio_addr; ++ RTL_W8(ChipCmd, RTL_R8(ChipCmd) & ~(CmdTxEnb | CmdRxEnb)); ++} ++ ++static void r8169_resume_datapath(struct net_device* netdev) ++{ ++ //void __iomem *ioaddr; ++ struct rtl8169_private *tp; ++ ++ if (!netdev) ++ return; ++ ++ tp = netdev_priv(netdev); ++ ++ if (!tp) ++ return; ++ ++ //ioaddr = tp->mmio_addr; ++ RTL_W8(ChipCmd, CmdTxEnb | CmdRxEnb); ++} ++ ++static int r8169_get_region(struct net_device* netdev, struct vfio_region_info* info) ++{ ++ struct rtl8169_private *tp; ++ ++ if (!netdev) ++ return -EINVAL; ++ ++ tp = netdev_priv(netdev); ++ if (!tp) ++ return -EFAULT; ++ ++ switch (info->index) { ++ case VFIO_PCI_NUM_REGIONS + 1: ++ case VFIO_PCI_NUM_REGIONS + 2: ++ { ++ if (info->index == VFIO_PCI_NUM_REGIONS + 1) { ++ info->offset = (__u64)(tp->RxDescArray); ++ info->size = R8169_RX_RING_BYTES; ++ } ++ else if (info->index == VFIO_PCI_NUM_REGIONS + 2) { ++ info->offset = (__u64)(tp->TxDescArray); ++ info->size = R8169_TX_RING_BYTES; ++ } ++ else return -EINVAL; ++ ++ info->flags = VFIO_REGION_INFO_FLAG_MMAP; ++ break; ++ } ++ ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++ ++} ++ ++/* ++ * SYSFS structure for created mdevices ++ */ ++static ssize_t netdev_show(struct device *dev, struct device_attribute *attr, ++ char *buf) ++{ ++ struct mdev_device* mdev; ++ struct net_device* netdev; ++ ++ mdev = mdev_from_dev(dev); ++ if (!mdev) ++ return scnprintf(buf, PAGE_SIZE, "mdev not found\n"); ++ ++ netdev = netmdev_get_netdev(mdev); ++ if (!netdev) ++ return scnprintf(buf, PAGE_SIZE, "ndev-mdev not found\n"); ++ ++ return scnprintf(buf, PAGE_SIZE, "%.16s\n", netdev->name); ++} ++ ++static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct mdev_device *mdev; ++ struct net_device *port; ++ struct netmdev *netmdev; ++ char name[IFNAMSIZ+1]; ++ ++ if (count < 2) ++ return -EINVAL; ++ ++ mdev = 
mdev_from_dev(dev); ++ if (!mdev) ++ return -ENODEV; ++ ++ netmdev = mdev_get_drvdata(mdev); ++ if (netmdev) ++ return -ENODEV; ++ ++ netmdev = kzalloc(sizeof(*netmdev), GFP_KERNEL); ++ if (!netmdev) ++ return -ENOMEM; ++ mdev_set_drvdata(mdev, netmdev); ++ ++ if (count > IFNAMSIZ) ++ return -ENODEV; ++ ++ memset(name, 0, sizeof(name)); ++ scnprintf(name, IFNAMSIZ + 1, "%.*s", (int)count - 1, buf); ++ port = dev_get_by_name(&init_net, name); ++ if (!port) ++ return -ENODEV; ++ ++ /* FIXME find a way to check if this is the parent device */ ++ //if (&port->dev != mdev_parent_dev(mdev)) return -1; ++ ++ netmdev->netdev = port; ++ ++ return count; ++} ++ ++static DEVICE_ATTR_RW(netdev); ++static struct attribute *sysfs_mdev_vfnetdev_attributes[] = { ++ &dev_attr_netdev.attr, ++ NULL, ++}; ++ ++static struct attribute_group sysfs_mdev_vfnetdev_group = { ++ .name = "vfnetdev", ++ .attrs = sysfs_mdev_vfnetdev_attributes, ++}; ++ ++static const struct attribute_group *sysfs_mdev_groups[] = { ++ &sysfs_mdev_vfnetdev_group, ++ NULL, ++}; ++ ++ ++static int vf_netdev_create(struct kobject *kobj, struct mdev_device *mdev) ++{ ++ return 0; ++} ++ ++static int vf_netdev_remove(struct mdev_device *mdev) ++{ ++ struct netmdev* netmdev = mdev_get_drvdata(mdev); ++ struct net_device* port; ++ ++ printk(KERN_INFO"%s %d\n", __func__, __LINE__); ++ port = netmdev_get_netdev(mdev); ++ dev_put(port); ++ kfree(netmdev); ++ mdev_set_drvdata(mdev, NULL); ++ ++ return 0; ++} ++ ++static int vf_netdev_open(struct mdev_device *mdev) ++{ ++ struct netmdev* netmdev = mdev_get_drvdata(mdev); ++ struct net_device* port; ++ ++ printk(KERN_INFO"%s %d\n", __func__, __LINE__); ++ /* TODO shadow stats to netmdev */ ++ port = netmdev_get_netdev(mdev); ++ r8169_pause_datapath(port); ++ /* barrier required? 
*/ ++ port->priv_flags |= IFF_VFNETDEV; ++ /* deallocate kernel buffers from ring */ ++ rtl8169_rx_clear(netdev_priv(port)); ++ ++ return 0; ++} ++ ++static void vf_netdev_release(struct mdev_device *mdev) ++{ ++ struct netmdev *nd = mdev_get_drvdata(mdev); ++ struct net_device *port; ++ int i; ++ ++ if (!nd) ++ return; ++ /* TODO export shadow stats to net_device */ ++ printk(KERN_INFO"%s %d\n", __func__, __LINE__); ++ for (i = 0; i < nd->mappings_count; i++, nd->mappings_count--) { ++ dma_unmap_single(nd->mappings[i].dev, ++ nd->mappings[i].iova, nd->mappings[i].size, ++ nd->mappings[i].direction); ++ kfree(nd->mappings[i].vaddr); ++ } ++ port = netmdev_get_netdev(mdev); ++ if (port) { ++ struct rtl8169_private *tp; ++ tp = netdev_priv(port); ++ ++ /* replenish the rings with kernel buffers */ ++ rtl8169_rx_fill(tp); ++ ++ port->priv_flags &= ~IFF_VFNETDEV; ++ /* barrier required? */ ++ r8169_resume_datapath(port); ++ } ++ ++ return; ++} ++ ++static long vf_netdev_ioctl(struct mdev_device *mdev, unsigned int cmd, ++ unsigned long arg) ++{ ++ unsigned long minsz; ++ struct net_device *netdev; ++ struct netmdev* netmdev; ++ ++ if (!mdev) ++ return -EINVAL; ++ ++ netdev = netmdev_get_netdev(mdev); ++ netmdev = mdev_get_drvdata(mdev); ++ ++ if (!netdev || !netmdev) ++ return -ENODEV; ++ ++ switch (cmd) { ++ case VFIO_DEVICE_GET_INFO: ++ { ++ struct vfio_device_info info; ++ ++ minsz = offsetofend(struct vfio_device_info, num_irqs); ++ if (copy_from_user(&info, (void __user *)arg, minsz)) ++ return -EFAULT; ++ ++ if (info.argsz < minsz) ++ return -EINVAL; ++ ++ info.flags = VFIO_DEVICE_FLAGS_PCI; ++ /* ++ * FIXME - find the number of rx queues when not having ++ * CONFIG_SYSFS if not possible to do it in a generic way, plan ++ * for a callback ++ */ ++ /* rx_ring and tx_ring*/ ++ info.num_regions = VFIO_PCI_NUM_REGIONS + netdev->num_tx_queues + 1; ++ info.num_irqs = 1; ++ ++ if (copy_to_user((void __user *)arg, &info, minsz)) ++ return -EFAULT; ++ ++ return 0; ++ } ++ 
case VFIO_DEVICE_GET_REGION_INFO: ++ { ++ struct vfio_region_info info; ++ int ret; ++ ++ minsz = offsetofend(struct vfio_region_info, offset); ++ ++ if (copy_from_user(&info, (void __user *)arg, minsz)) ++ return -EFAULT; ++ ++ if (info.argsz < minsz) ++ return -EINVAL; ++ ++ ret = r8169_get_region(netdev, &info); ++ ++ if (ret < 0) return ret; ++ ++ if (copy_to_user((void __user *)arg, &info, minsz)) ++ return -EFAULT; ++ ++ return 0; ++ } ++ case VFIO_IOMMU_MAP_DMA: ++ { ++ struct vfio_iommu_type1_dma_map map; ++ struct vm_area_struct *vma; ++ void *data; ++ struct device* parent_dev; ++ int node; ++ dma_addr_t mapping; ++ int ret = -EINVAL; ++ ++ /* allocate DMA area and map it where the userland asks ++ * userland need to mmap an area WITHOUT allocating pages: ++ * mmap(vaddr,size, PROT_READ | PROT_WRITE, MAP_SHARED | ++ * MAP_ANONYMOUS | MAP_NORESERVE | MAP_FIXED, -1, 0 ++ * MAP_NORESERVE ensures only VA space is booked, no pages are ++ * mapped * the mapping must be the entire area, not partial on ++ * the vma ++ */ ++ ++ if (netmdev->mappings_count >= 128) ++ return -EFAULT; ++ ++ minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); ++ ++ if (copy_from_user(&map, (void __user *)arg, minsz)) { ++ ret = -EFAULT; ++ goto out; ++ } ++ ++ if (map.argsz < minsz) ++ goto out; ++ ++ printk(KERN_INFO"VFIO_IOMMU_MAP_DMA: find_vma(%llx)\n", map.vaddr); ++ /* ++ * locates the containing vma for the required map.vaddr ++ * the vma must point to the entire zone allocated by mmap in ++ * userland ++ */ ++ vma = find_vma(current->mm, map.vaddr); ++ if (!vma) ++ return -EFAULT; ++ if (map.vaddr >= vma->vm_end) ++ return -EFAULT; ++ ++ printk(KERN_INFO"VFIO_IOMMU_MAP_DMA: found vma(%llx) -> start=%lx end=%lx pg_off=%lx\n", ++ map.vaddr, vma->vm_start, vma->vm_end, vma->vm_pgoff); ++ /* the iova will be returned as part of the ioctl to the userland */ ++ //parent_dev = &tp->pci_dev->dev; ++ parent_dev = mdev_parent_dev(mdev); ++ ++ node = netdev->dev.parent ? 
dev_to_node(netdev->dev.parent) : -1; ++ data = kmalloc_node(map.size, GFP_KERNEL, node); ++ if (!data) ++ /* return ret? */ ++ return -ENOMEM; ++ ++ printk(KERN_INFO"VFIO_IOMMU_MAP_DMA: about to dma_map_single(%p, %p, %lld, DMA_FROM_DEVICE)\n", ++ parent_dev, data, map.size); ++ mapping = dma_map_single(parent_dev, data, map.size, ++ DMA_FROM_DEVICE); ++ if (unlikely(dma_mapping_error(parent_dev, mapping))) { ++ if (net_ratelimit()) ++ printk(KERN_ERR"Failed to dma_map_single buffer for userland!\n"); ++ kfree(data); ++ goto out; ++ } ++ map.iova = mapping; ++ ret = io_remap_pfn_range(vma, map.vaddr, ++ virt_to_phys(data) >> PAGE_SHIFT, ++ map.size, vma->vm_page_prot); ++ printk(KERN_INFO"VFIO_IOMMU_MAP_DMA: io_remap_pfn_range %llx -> physmem <- @%llx, %lld:%d\n", ++ map.vaddr, map.iova, map.size, ret); ++ if (ret != 0) { ++ dma_unmap_single(parent_dev, mapping, map.size, ++ DMA_FROM_DEVICE); ++ kfree(data); ++ printk(KERN_ERR"VFIO_IOMMU_MAP_DMA: io_remap_pfn_range failed\n"); ++ return -EFAULT; ++ } ++ ++ printk(KERN_INFO"VFIO_IOMMU_MAP_DMA: recording the mapping %d\n", ++ netmdev->mappings_count); ++ netmdev->mappings[netmdev->mappings_count].dev = parent_dev; ++ netmdev->mappings[netmdev->mappings_count].vaddr = data; ++ netmdev->mappings[netmdev->mappings_count].iova = mapping; ++ netmdev->mappings[netmdev->mappings_count].size = map.size; ++ netmdev->mappings_count++; ++ ++ printk(KERN_INFO"VFIO_IOMMU_MAP_DMA: preparing response back to user\n"); ++ if (copy_to_user((void __user *)arg, &map, minsz)) ++ return -EFAULT; ++ ++ ret = 0; ++out: ++ return ret; ++ } ++ ++ case 500: { ++ r8169_resume_datapath(netdev); ++ return 0; ++ } ++ ++ } /* switch */ ++ ++ return -EINVAL; ++ ++} ++ ++static int vf_netdev_mmap(struct mdev_device *mdev, ++ struct vm_area_struct *vma) ++{ ++ struct net_device* netdev; ++ struct rtl8169_private *tp; ++ u64 req_len; ++ int ret = 0; ++ /* userland wants to access ring descriptors that were pre-allocated ++ * by the kernel ++ * note:
userland needs to use IOCTL MAP to CREATE packet buffers ++ */ ++ netdev = netmdev_get_netdev(mdev); ++ tp = netdev_priv(netdev); ++ ++ /* check that we try to map only authorized areas ++ * FIXME is there a way to check all the transmit and receive rings ++ * from an abstract netdev? ++ */ ++ if (vma->vm_pgoff != ((__u64)tp->RxDescArray >> PAGE_SHIFT) && ++ vma->vm_pgoff != ((__u64)tp->TxDescArray >> PAGE_SHIFT)) { ++ printk(KERN_INFO"invalid address\n"); ++ return -EINVAL; ++ } ++ ++ req_len = PAGE_ALIGN(vma->vm_end - vma->vm_start); ++ ++ vma->vm_private_data = NULL; ++ /* FIXME this should be uncached memory but it seems the driver does ++ * not map it as non-cached. strange... ++ */ ++ //vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); ++ ++ ret = remap_pfn_range(vma, vma->vm_start, ++ virt_to_phys((void*)(vma->vm_pgoff << PAGE_SHIFT)) >> PAGE_SHIFT, ++ req_len, vma->vm_page_prot); ++ ++ printk(KERN_INFO"vfnetdev_map %lx, @%llx, %lld:%d\n", ++ vma->vm_start, virt_to_phys((void*)(vma->vm_pgoff << PAGE_SHIFT)), ++ req_len, ret); ++ ++ return ret; ++} ++ ++static const struct mdev_parent_ops vf_netdev_ops = { ++ .supported_type_groups = sysfs_type_list, ++ .mdev_attr_groups = sysfs_mdev_groups, ++ .create = vf_netdev_create, ++ .remove = vf_netdev_remove, ++ ++ .open = vf_netdev_open, ++ .release = vf_netdev_release, ++ ++ .read = NULL, ++ .write = NULL, ++ .mmap = vf_netdev_mmap, ++ .ioctl = vf_netdev_ioctl, ++}; ++ ++#endif ++ + static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) + { + const struct rtl_cfg_info *cfg = rtl_cfg_infos + ent->driver_data; +@@ -8207,6 +8772,13 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) + MODULENAME, RTL8169_VERSION); + } + ++#if defined(VFIO_MDEV_DEVICE) || defined(VFIO_MDEV_DEVICE_MODULE) ++ if (mdev_register_device(&pdev->dev, &vf_netdev_ops) < 0) ++ printk(KERN_ERR"Could not register device\n"); ++ else ++ printk(KERN_INFO"Successfully registered vf-netdev
device\n"); ++#endif ++ + dev = alloc_etherdev(sizeof (*tp)); + if (!dev) { + rc = -ENOMEM; +diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h +index f535779..8deea1c 100644 +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -1386,6 +1386,7 @@ enum netdev_priv_flags { + IFF_RXFH_CONFIGURED = 1<<25, + IFF_PHONY_HEADROOM = 1<<26, + IFF_MACSEC = 1<<27, ++ IFF_VFNETDEV = 1<<28, + }; + + #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN diff --git a/run.sh b/run.sh new file mode 100755 index 0000000..b0609f5 --- /dev/null +++ b/run.sh @@ -0,0 +1,52 @@ +#!/bin/sh +# Only supported driver for now is r8169, make it configurable in the future +driver='r8169' +sys_drv_name="$driver"'-vfnetdev' +intf='enp4s0' + +usage() { + echo "$0 create/destroy . Default $intf" + exit 1 +} + +[ $# -lt 1 ] && usage +[ -n "$2" ] && intf="$2" + +echo "Checking for interface $intf" +if [ ! -e "/sys/class/net/$intf/device/mdev_supported_types/$sys_drv_name/create" ]; then + echo "interface $intf has no vfio-mdev support" + exit 1 +fi + +vf_create() { + dev_uuid=$(uuidgen) + sudo sh -c "echo $dev_uuid > /sys/class/net/$intf/device/mdev_supported_types/$sys_drv_name/create" + #sudo sh -c "echo $dev_uuid > /sys/class/net/$intf/mdev_supported_types/net-vfnetdev/create" + #the newly created mdev is not tied to any port of the parent device yet + echo "Bind $intf to the newly created mdevice $dev_uuid" + sudo sh -c "echo $intf > /sys/bus/mdev/devices/$dev_uuid/vfnetdev/netdev" + #ensure the IOMMU group is readable by non-root programs + vfio_group=$(basename $(readlink /sys/bus/mdev/devices/$dev_uuid/iommu_group)) + user=$(whoami) + grp=$(id -g -n $user) + sudo chown "$user":"$grp" /dev/vfio/$vfio_group + echo "created $dev_uuid" + echo "Run ./r8169 $vfio_group $dev_uuid" +} + +vf_destroy() { + # FIXME only one mdev per ethernet supported for now + echo 1 > /sys/class/mdev_bus/$intf/*/remove > /dev/null +} + +case "$1" in + create) + vf_create + ;; + destroy) + vf_destroy + ;; + 
*) + usage + ;; +esac -- cgit v1.2.3