linux 内核协议栈收报流程（一）ixgbe网卡驱动

首先模块加载insmod ixgbe.ko

module_init(ixgbe_init_module);

module_init(ixgbe_init_module);

{

	int ret;

	pr_info("%s - version %s\n", ixgbe_driver_string, ixgbe_driver_version);

	pr_info("%s\n", ixgbe_copyright);

	ixgbe_dbg_init();

 　　　　ret = pci_register_driver(&ixgbe_driver);

	if (ret) {

		ixgbe_dbg_exit();

		return ret;

	}

#ifdef CONFIG_IXGBE_DCA

	dca_register_notify(&dca_notifier);

#endif

	return 0;

}

于是看pci设备的核心结构体

static struct pci_driver ixgbe_driver = {

	.name     = ixgbe_driver_name,

	.id_table = ixgbe_pci_tbl,

	.probe    = ixgbe_probe,

	.remove   = ixgbe_remove,

#ifdef CONFIG_PM

	.suspend  = ixgbe_suspend,

	.resume   = ixgbe_resume,

#endif

	.shutdown = ixgbe_shutdown,

	.sriov_configure = ixgbe_pci_sriov_configure,

	.err_handler = &ixgbe_err_handler

};

当设备加载成功后，会执行ixgbe_probe函数

static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)

{

        /*分配struct net_device *netdev 结构体*/

	netdev = alloc_etherdev_mq(sizeof(struct ixgbe_adapter), indices);

	if (!netdev) {

		err = -ENOMEM;

		goto err_alloc_etherdev;

	}

	SET_NETDEV_DEV(netdev, &pdev->dev);

        /*分配struct ixgbe_adapter *adapter结构体*/

	adapter = netdev_priv(netdev);

        /*分配dev结构体的ops函数指针集合*/

	netdev->netdev_ops = &ixgbe_netdev_ops;

	err = ixgbe_sw_init(adapter);

	err = ixgbe_init_interrupt_scheme(adapter);

        /*设备注册完毕*/
　　　　 err = register_netdev(netdev);

}

重点看ixgbe_init_interrupt_scheme(adapter)函数，该函数里面会初始化adapter结构体以及napi相关的东西

int ixgbe_init_interrupt_scheme(struct ixgbe_adapter *adapter)

{

	err = ixgbe_alloc_q_vectors(adapter);

}

static int ixgbe_alloc_q_vectors(struct ixgbe_adapter *adapter)

{

	if (q_vectors >= (rxr_remaining + txr_remaining)) {

		for (; rxr_remaining; v_idx++) {

			err = ixgbe_alloc_q_vector(adapter, q_vectors, v_idx,

						   0, 0, 1, rxr_idx);

			if (err)

				goto err_out;

			/* update counts and index */

			rxr_remaining--;

			rxr_idx++;

		}

	}

}

static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter,

				int v_count, int v_idx,

				int txr_count, int txr_idx,

				int rxr_count, int rxr_idx)

{

	/* setup affinity mask and node */

	if (cpu != -1)

		cpumask_set_cpu(cpu, &q_vector->affinity_mask);

	q_vector->numa_node = node;

#ifdef CONFIG_IXGBE_DCA

	/* initialize CPU for DCA */

	q_vector->cpu = -1;

#endif

	/* initialize NAPI */

	netif_napi_add(adapter->netdev, &q_vector->napi,

		       ixgbe_poll, 64);

	napi_hash_add(&q_vector->napi);

}

到此为止，网卡设置初始化完毕　　

其中涉及到如下几个结构体

ixgbe_adapter

/* board specific private data structure */

struct ixgbe_adapter {

//发送的rings

struct ixgbe_ring *tx_ring[MAX_TX_QUEUES] ____cacheline_aligned_in_smp;

//接收的rings

struct ixgbe_ring *rx_ring[MAX_RX_QUEUES];

//这个vector里面包含了napi结构

//应该是跟下面的entries一一对应起来做为是一个中断向量的东西吧

struct ixgbe_q_vector *q_vector[MAX_Q_VECTORS];

//这个里面估计是MSIX的多个中断对应的响应接口

struct msix_entry *msix_entries;

}

struct ixgbe_q_vector {

        struct ixgbe_adapter *adapter;

ifdef CONFIG_IXGBE_DCA

        int cpu;            /* CPU for DCA */

#endif

        u16 v_idx;              /* index of q_vector within array, also used for

                                 * finding the bit in EICR and friends that

                                 * represents the vector for this ring */

        u16 itr;                /* Interrupt throttle rate written to EITR */

        struct ixgbe_ring_container rx, tx;

        struct napi_struct napi;/*napi结构体*/

       cpumask_t affinity_mask;

        int numa_node;

        struct rcu_head rcu;    /* to avoid race with update stats on free */

        char name[IFNAMSIZ + 9];

        /* for dynamic allocation of rings associated with this q_vector */

        struct ixgbe_ring ring[0] ____cacheline_internodealigned_in_smp;

};

struct napi_struct {

    /* The poll_list must only be managed by the entity which

     * changes the state of the NAPI_STATE_SCHED bit.  This means

     * whoever atomically sets that bit can add this napi_struct

     * to the per-cpu poll_list, and whoever clears that bit

     * can remove from the list right before clearing the bit.

     */

    struct list_head    poll_list;

    unsigned long       state;

    int         weight;

    unsigned int        gro_count;

    int         (*poll)(struct napi_struct *, int);//poll的接口实现

#ifdef CONFIG_NETPOLL

    spinlock_t      poll_lock;

    int         poll_owner;

#endif

    struct net_device   *dev;

    struct sk_buff      *gro_list;

    struct sk_buff      *skb;

    struct list_head    dev_list;

};

然后当我们ifconfig dev up 时，会执行dev_ops->open函数

static int ixgbe_open(struct net_device *netdev)

{

	/* allocate transmit descriptors */

	err = ixgbe_setup_all_tx_resources(adapter);

	if (err)

		goto err_setup_tx;

	/* allocate receive descriptors */

	err = ixgbe_setup_all_rx_resources(adapter);

       /*注册中断*/

       err = ixgbe_request_irq(adapter);

}

static int ixgbe_request_irq(struct ixgbe_adapter *adapter)

{

	struct net_device *netdev = adapter->netdev;

	int err;

	if (adapter->flags & IXGBE_FLAG_MSIX_ENABLED)

		err = ixgbe_request_msix_irqs(adapter);

	else if (adapter->flags & IXGBE_FLAG_MSI_ENABLED)

		err = request_irq(adapter->pdev->irq, ixgbe_intr, 0,

				  netdev->name, adapter);

	else

		err = request_irq(adapter->pdev->irq, ixgbe_intr, IRQF_SHARED,

				  netdev->name, adapter);

	if (err)

		e_err(probe, "request_irq failed, Error %d\n", err);

	return err;

}

static int ixgbe_request_msix_irqs(struct ixgbe_adapter *adapter)

{

	for (vector = 0; vector < adapter->num_q_vectors; vector++) {

		struct ixgbe_q_vector *q_vector = adapter->q_vector[vector];

		struct msix_entry *entry = &adapter->msix_entries[vector];

		err = request_irq(entry->vector, &ixgbe_msix_clean_rings, 0,

				  q_vector->name, q_vector);

}

从上面的代码流程可以看出，最终注册的中断处理函数为ixgbe_msix_clean_rings

static irqreturn_t ixgbe_msix_clean_rings(int irq, void *data)

{

	struct ixgbe_q_vector *q_vector = data;

	/* EIAM disabled interrupts (on this vector) for us */

	if (q_vector->rx.ring || q_vector->tx.ring)

		napi_schedule(&q_vector->napi);

	return IRQ_HANDLED;

}

从上述代码中可以看，该中断处理函数仅仅作为napi的调度者

当数据包到来时，首先唤醒硬中断执行ixgbe_msix_clean_rings函数，最终napi_schedule会调用__raise_softirq_irqoff去触发一个软中断NET_RX_SOFTIRQ，然后又对应的软中断接口去实现往上的协议栈逻辑

然后看看napi 调度函数都做了些什么工作

static inline void napi_schedule(struct napi_struct *n)

{

	if (napi_schedule_prep(n))

		__napi_schedule(n);

}

void __napi_schedule(struct napi_struct *n)

{

	unsigned long flags;

	local_irq_save(flags);

	____napi_schedule(this_cpu_ptr(&softnet_data), n);

	local_irq_restore(flags);

}

最终可以看出napi调度函数把napi结构体挂到了per cpu的私有数据结构softnet_data上

struct softnet_data {

	struct Qdisc		*output_queue;

	struct Qdisc		**output_queue_tailp;

	struct list_head	poll_list;

	struct sk_buff		*completion_queue;

	struct sk_buff_head	process_queue;

	/* stats */

	unsigned int		processed;

	unsigned int		time_squeeze;

	unsigned int		cpu_collision;

	unsigned int		received_rps;

#ifdef CONFIG_RPS

	struct softnet_data	*rps_ipi_list;

	/* Elements below can be accessed between CPUs for RPS */

	struct call_single_data	csd ____cacheline_aligned_in_smp;

	struct softnet_data	*rps_ipi_next;

	unsigned int		cpu;

	unsigned int		input_queue_head;

	unsigned int		input_queue_tail;

#endif

	unsigned int		dropped;

	struct sk_buff_head	input_pkt_queue;

	struct napi_struct	backlog;/*napi结构体里面的双向链表中*/

};

NET_RX_SOFTIRQ是收到数据包的软中断信号对应的接口是net_rx_action
NET_TX_SOFTIRQ是发送完数据包后的软中断信号对应的接口是net_tx_action　　

static void net_rx_action(struct softirq_action *h)

{

        /* 获取每个cpu的数据*/

	struct softnet_data *sd = this_cpu_ptr(&softnet_data);

        while (!list_empty(&sd->poll_list)) {

		struct napi_struct *n;

                n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);               

		if (test_bit(NAPI_STATE_SCHED, &n->state)) {

			work = n->poll(n, weight);

			trace_napi_poll(n);

		}

}

于是就执行到初始化napi结构体中的poll函数，在这里为ixgbe_poll

int ixgbe_poll(struct napi_struct *napi, int budget)

{

	struct ixgbe_q_vector *q_vector =

				container_of(napi, struct ixgbe_q_vector, napi);

	struct ixgbe_adapter *adapter = q_vector->adapter;

	struct ixgbe_ring *ring;

	int per_ring_budget;

	bool clean_complete = true;

#ifdef CONFIG_IXGBE_DCA

	if (adapter->flags & IXGBE_FLAG_DCA_ENABLED)

		ixgbe_update_dca(q_vector);

#endif

	ixgbe_for_each_ring(ring, q_vector->tx)

		clean_complete &= !!ixgbe_clean_tx_irq(q_vector, ring);

	if (!ixgbe_qv_lock_napi(q_vector))

		return budget;

	/* attempt to distribute budget to each queue fairly, but don't allow

	 * the budget to go below 1 because we'll exit polling */

	if (q_vector->rx.count > 1)

		per_ring_budget = max(budget/q_vector->rx.count, 1);

	else

		per_ring_budget = budget;

	ixgbe_for_each_ring(ring, q_vector->rx)

		clean_complete &= (ixgbe_clean_rx_irq(q_vector, ring,

				   per_ring_budget) < per_ring_budget);

	ixgbe_qv_unlock_napi(q_vector);

	/* If all work not completed, return budget and keep polling */

	if (!clean_complete)

		return budget;

	/* all work done, exit the polling mode */

	napi_complete(napi);

	if (adapter->rx_itr_setting & 1)

		ixgbe_set_itr(q_vector);

	if (!test_bit(__IXGBE_DOWN, &adapter->state))

		ixgbe_irq_enable_queues(adapter, ((u64)1 << q_vector->v_idx));

	return 0;

}

static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,

			       struct ixgbe_ring *rx_ring,

			       const int budget)

{

       ixgbe_rx_skb(q_vector, skb);

}

static void ixgbe_rx_skb(struct ixgbe_q_vector *q_vector,

			 struct sk_buff *skb)

{

	if (ixgbe_qv_busy_polling(q_vector))

		netif_receive_skb(skb);

	else

		napi_gro_receive(&q_vector->napi, skb);

}

int netif_receive_skb(struct sk_buff *skb)

{

	int ret;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	if (skb_defer_rx_timestamp(skb))

		return NET_RX_SUCCESS;

	rcu_read_lock();

#ifdef CONFIG_RPS

	if (static_key_false(&rps_needed)) {

		struct rps_dev_flow voidflow, *rflow = &voidflow;

		int cpu = get_rps_cpu(skb->dev, skb, &rflow);

		if (cpu >= 0) {

			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

			rcu_read_unlock();

			return ret;

		}

	}

#endif

        /*最终协议栈开始收报*/

	ret = __netif_receive_skb(skb);

	rcu_read_unlock();

	return ret;

}

秒客网

linux 内核协议栈收报流程（一）ixgbe网卡驱动

相关文章