
I want to write a simple test program to receive Ethernet frames using the ibverbs API.

The code below compiles and runs but never receives any packets. I'm using Mellanox ConnectX-3 hardware on Ubuntu 18.

Questions:

  1. If, while running this RX program, I ping the InfiniBand interface from another machine, ping still receives responses. I would not expect that: the ping requests should be grabbed by the RX program, so the Linux IP stack should never see them and therefore should not respond. What should happen?

  2. Is there anything obvious wrong with my code?

  3. Do I need a steering rule? If I remove the call to ibv_create_flow(), should I just receive all packets the interface sees?

#include <infiniband/verbs.h>

#include <errno.h>  // For errno (used in the error messages below).
#include <stdint.h> // For uint64_t.
#include <stdio.h>
#include <stdlib.h>
#include <string.h> // For memset().


#define PORT_NUM 1

#define MAX_MSG_SIZE 1500 // The maximum size of each received packet.
#define RQ_NUM_DESC 512 // Max packets that can be received without processing.

// The MAC of the interface we are listening on.
#define DEST_MAC { 0x00, 0x0d, 0x3a, 0x47, 0x1c, 0x2e }

#define FATAL_ERROR(msg, ...) { fprintf(stderr, "ERROR: " msg "\n", ##__VA_ARGS__); exit(-1); }


int main() {
    // Get the list of devices.
    int num_devices = 0;
    struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
    if (!dev_list)
        FATAL_ERROR("Failed to get IB devices list.");

    // Choose the first device.
    struct ibv_device *ib_dev = dev_list[0];
    if (!ib_dev)
        FATAL_ERROR("IB device not found.");
    printf("Found %i Infiniband device(s).\n", num_devices);
    printf("Using device '%s'.\n", ibv_get_device_name(ib_dev));

    // Get the device context.
    struct ibv_context *context = ibv_open_device(ib_dev);
    if (!context)
        FATAL_ERROR("Couldn't get context for device.");

    // Allocate a protection domain (PD) that will group memory
    // regions (MR) and rings.
    struct ibv_pd *pd = ibv_alloc_pd(context);
    if (!pd)
        FATAL_ERROR("Couldn't allocate protection domain.");

    // Create Completion Queue (CQ).
    struct ibv_cq *cq = ibv_create_cq(context, RQ_NUM_DESC, NULL, NULL, 0);
    if (!cq)
        FATAL_ERROR("Couldn't create completion queue. errno = %d.", errno);

    // Create Queue Pair (QP).
    struct ibv_qp_init_attr qp_init_attr = {
        .qp_context = NULL,
        .send_cq = cq,
        .recv_cq = cq, // Report receive completions to this CQ.

        .cap = {
            .max_send_wr = 0, // No send ring.
            .max_recv_wr = RQ_NUM_DESC, // Max num packets in ring.
            .max_recv_sge = 1, // Only one pointer per descriptor.
         },
        .qp_type = IBV_QPT_RAW_PACKET, // Use Ethernet packets.
    };
    struct ibv_qp *qp = ibv_create_qp(pd, &qp_init_attr);
    if (!qp)
        FATAL_ERROR("Couldn't create queue pair.");

    // Initialize the QP (receive ring) and assign a port.
    struct ibv_qp_attr qp_attr = { 0 };
    qp_attr.qp_state = IBV_QPS_INIT;
    qp_attr.port_num = PORT_NUM;
    int qp_flags = IBV_QP_STATE | IBV_QP_PORT;
    // Note: ibv_modify_qp() returns 0 on success or a positive errno
    // value on failure, so test for non-zero rather than negative.
    if (ibv_modify_qp(qp, &qp_attr, qp_flags) != 0)
        FATAL_ERROR("Failed to initialize queue pair.");

    // Move ring state to ready-to-receive. This is needed in
    // order to be able to receive packets.
    memset(&qp_attr, 0, sizeof(qp_attr));
    qp_flags = IBV_QP_STATE;
    qp_attr.qp_state = IBV_QPS_RTR;
    if (ibv_modify_qp(qp, &qp_attr, qp_flags) != 0)
        FATAL_ERROR("Failed to put queue pair into ready-to-receive state.");

    // Allocate memory for packet buffer.
    int buf_size = MAX_MSG_SIZE * RQ_NUM_DESC; // Maximum size of data to be accessed by hardware.
    void *buf = malloc(buf_size);
    if (!buf)
        FATAL_ERROR("Couldn't allocate memory.");

    // Register the user memory so it can be accessed by the HW directly.
    struct ibv_mr *mr = ibv_reg_mr(pd, buf, buf_size, IBV_ACCESS_LOCAL_WRITE);
    if (!mr)
        FATAL_ERROR("Couldn't register memory region.");

    // Create a scatter/gather entry. The buffer address (sg_entry.addr)
    // is filled in per-descriptor in the posting loop below.
    struct ibv_sge sg_entry;
    sg_entry.length = MAX_MSG_SIZE;
    sg_entry.lkey = mr->lkey;

    // Create a receive work request.
    struct ibv_recv_wr wr;
    wr.num_sge = 1;
    wr.sg_list = &sg_entry;
    wr.next = NULL;

    // Post a load of receive work requests onto the receive queue.
    struct ibv_recv_wr *bad_wr;
    for (int n = 0; n < RQ_NUM_DESC; n++) {
        // Each descriptor points to max MTU size buffer.
        sg_entry.addr = (uint64_t)buf + MAX_MSG_SIZE * n;

        // When a packet is received, a work completion will be created
        // corresponding to this work request. It will contain this field.
        wr.wr_id = n;

        // Post the receive buffer to the ring.
        int rv = ibv_post_recv(qp, &wr, &bad_wr);
        if (rv != 0) {
            FATAL_ERROR("Posting recv failed with error code %i.", rv);
        }
    }

    // Create steering rule.
    struct raw_eth_flow_attr {
        struct ibv_flow_attr attr;
        struct ibv_flow_spec_eth spec_eth;
    } __attribute__((packed)) flow_attr = {
        .attr = {
            .comp_mask = 0,
            .type = IBV_FLOW_ATTR_NORMAL,
            .size = sizeof(flow_attr),
            .priority = 0,
            .num_of_specs = 1,
            .port = PORT_NUM,
            .flags = 0,
        },
        .spec_eth = {
            .type = IBV_FLOW_SPEC_ETH,
            .size = sizeof(struct ibv_flow_spec_eth),
            .val = {
                .dst_mac = DEST_MAC,
                .src_mac = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
                .ether_type = 0,
                .vlan_tag = 0,
            },
            .mask = {
                .dst_mac = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF },
                .src_mac = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF },
                .ether_type = 0,
                .vlan_tag = 0,
            }
        }
    };

    // Register steering rule to intercept packet to DEST_MAC and place packet in
    // ring pointed by qp.
    struct ibv_flow *eth_flow = ibv_create_flow(qp, &flow_attr.attr);
    if (!eth_flow)
        FATAL_ERROR("Couldn't attach steering flow. Does DEST_MAC match that of the local NIC?");

    printf("Receiving.\n");
    while (1) {
        // Busy-poll the CQ for a receive completion and print a message
        // for each one. (No completion channel is used, so this spins.)
        struct ibv_wc wc;
        int msgs_completed = ibv_poll_cq(cq, 1, &wc);
        if (msgs_completed > 0) {
            printf("Message %ld received size %d\n", wc.wr_id, wc.byte_len);
            sg_entry.addr = (uint64_t)buf + wc.wr_id * MAX_MSG_SIZE;
            wr.wr_id = wc.wr_id;

            // After processed need to post back the buffer.
            int rv = ibv_post_recv(qp, &wr, &bad_wr);
            if (rv != 0) {
                FATAL_ERROR("Re-posting recv failed with error code %i.", rv);
            }
        }
        else if (msgs_completed < 0) {
            FATAL_ERROR("Polling error.");
        }
    }
}
Andrew Bainbridge
  • What do you mean by Ethernet frames? IB is a different protocol compared to Ethernet. Could you elaborate your question with more details? – Janaka Aug 28 '19 at 08:43
  • I'm not using the IB protocol. The NICs are wired to an Ethernet switch. The only reason I mention infiniband is because the only API I know that can be used with the Mellanox cards is the InfiniBand Verbs API (IBV). It appears IBV was extended to add Ethernet support. For example, DPDK (an Ethernet based system) implements the Mellanox drivers on top of IBV. – Andrew Bainbridge Aug 28 '19 at 09:21
  • If an IB NIC is connected to an Ethernet switch, then your network is capable of RoCE [link](https://en.wikipedia.org/wiki/RDMA_over_Converged_Ethernet). From an application's point of view, RoCE is no different from IB; the only difference is the network transport protocol. Hence you can read the RoCE communication via a receiver-side implementation of IB. – Janaka Aug 28 '19 at 09:46
  • I don't want to do RDMA. I'm trying to implement things like an IP tunnel or a load balancer, hence the need to see the Ethernet frames. – Andrew Bainbridge Aug 28 '19 at 12:08
  • The problem is in the mask for src_mac; see the sketch below. – badr zarhri Aug 15 '21 at 08:15
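
That comment pins it down: in the steering rule above, src_mac.val is all zeros while src_mac.mask is all ones, so the rule only matches frames whose source MAC is 00:00:00:00:00:00, i.e. no real frames at all. A minimal sketch of the fix (only the mask changes; an all-zero mask turns the source address into a wildcard):

            .mask = {
                .dst_mac = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF },
                // All-zero mask: ignore the source MAC entirely.
                .src_mac = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
                .ether_type = 0,
                .vlan_tag = 0,
            }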

2 Answers


Take a look at this example from Nvidia: https://enterprise-support.nvidia.com/s/article/raw-ethernet-programming--basic-introduction---code-example

To receive everything the interface sees, you can use the experimental API from #include <infiniband/verbs_exp.h>: when creating the steering rule, use struct ibv_exp_flow_attr and set its type to IBV_EXP_FLOW_ATTR_SNIFFER.
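
A minimal sketch under those assumptions (the identifiers come from the MLNX_OFED experimental verbs; the struct layout can differ between OFED releases, so treat this as a sketch rather than a drop-in):

#include <infiniband/verbs_exp.h>

// A sniffer rule carries no matching specs, so every frame the port
// sees is steered into the QP's receive ring.
struct ibv_exp_flow_attr sniffer_attr = {
    .type = IBV_EXP_FLOW_ATTR_SNIFFER,
    .size = sizeof(sniffer_attr),
    .num_of_specs = 0, // No specs: match everything.
    .port = PORT_NUM,
};
struct ibv_exp_flow *sniffer_flow = ibv_exp_create_flow(qp, &sniffer_attr);
if (!sniffer_flow)
    FATAL_ERROR("Couldn't attach sniffer flow.");

Newer upstream rdma-core also exposes a non-experimental IBV_FLOW_ATTR_SNIFFER type for the plain ibv_flow_attr/ibv_create_flow path, which avoids the verbs_exp.h dependency (attaching a sniffer rule may require elevated privileges).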

Joe'

Please refer to https://github.com/Mellanox/libvma/wiki/Architecture. VMA implements the native RDMA verbs API. The native RDMA verbs have been extended to Ethernet RDMA-capable NICs, enabling packets to pass directly between the user application and the InfiniBand HCA or Ethernet NIC, bypassing the kernel and its TCP/UDP network stack.
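
For context, libvma is normally consumed by preloading it under an unmodified sockets application, e.g. LD_PRELOAD=libvma.so ./your_app (where ./your_app is a placeholder for your program), rather than by writing verbs code directly; the question's raw-packet verbs approach is the lower-level equivalent of what VMA does internally.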

Phatut