linux 自定义模块来缓存skb的意义

Linux 中，管理网卡收发报文的结构是 sk_buff，这个结构比 FreeBSD 中的 mbuf 复杂得多，这也是现在用户态协议栈大多以 BSD 协议栈为基础来实现的原因之一。

struct sk_buff {

    /* These two members must be first. */

    struct sk_buff        *next;-------------有没有人想过为啥不用内核标准的list来，而是单独两个成员？如果你有好的想法，可以跟我讨论。

    struct sk_buff        *prev;

#ifdef __GENKSYMS__

    ktime_t        tstamp;

#else

    union {

        ktime_t        tstamp;

        struct skb_mstamp skb_mstamp;

    };

#endif

    struct sock        *sk;

    struct net_device    *dev;

    /*

     * This is the control buffer. It is free to use for every

     * layer. Please put your private variables there. If you

     * want to keep them across layers you have to do a skb_clone()

     * first. This is owned by whoever has the skb queued ATM.

     */

    char            cb[] __aligned();------------------------------------这个可以看很多private的处理

    unsigned long        _skb_refdst;

#ifdef CONFIG_XFRM

    struct    sec_path    *sp;

#endif

    unsigned int        len,

                data_len;

    __u16            mac_len,

                hdr_len;

    union {

        __wsum        csum;

        struct {

            __u16    csum_start;

            __u16    csum_offset;

        };

    };

    __u32            priority;

    kmemcheck_bitfield_begin(flags1);

    __u8            RH_KABI_RENAME(local_df, ignore_df):,

                cloned:,

                ip_summed:,

                nohdr:,

                nfctinfo:;

    __u8            pkt_type:,

                fclone:,

                ipvs_property:,

                peeked:,

                nf_trace:;

    kmemcheck_bitfield_end(flags1);

    __be16            protocol;

    void            (*destructor)(struct sk_buff *skb);

#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)

    struct nf_conntrack    *nfct;

#endif

#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)

    struct nf_bridge_info    *nf_bridge;

#endif

    /* fields enclosed in headers_start/headers_end are copied

     * using a single memcpy() in __copy_skb_header()

     */

    /* private: */

    RH_KABI_EXTEND(__u32    headers_start[])

    /* public: */

    int            skb_iif;

    RH_KABI_REPLACE(__u32    rxhash,

            __u32    hash)

    __be16            vlan_proto;

    __u16            vlan_tci;

#ifdef CONFIG_NET_SCHED

    __u16            tc_index;    /* traffic control index */

#ifdef CONFIG_NET_CLS_ACT

    __u16            tc_verd;    /* traffic control verdict */

#endif

#endif

    __u16            queue_mapping;

    kmemcheck_bitfield_begin(flags2);

#ifdef CONFIG_IPV6_NDISC_NODETYPE

    __u8            ndisc_nodetype:;

#endif

    __u8            pfmemalloc:;

    __u8            ooo_okay:;

    __u8            RH_KABI_RENAME(l4_rxhash, l4_hash):;

    __u8            wifi_acked_valid:;

    __u8            wifi_acked:;

    __u8            no_fcs:;

    __u8            head_frag:;

    /* Indicates the inner headers are valid in the skbuff. */

    __u8            encapsulation:;

    RH_KABI_EXTEND(__u8            encap_hdr_csum:)-----------------------这个使用在我之前一篇博客中有描述。

    RH_KABI_EXTEND(__u8            csum_valid:)

    RH_KABI_EXTEND(__u8            csum_complete_sw:)

    RH_KABI_EXTEND(__u8            xmit_more:)

    RH_KABI_EXTEND(__u8            inner_protocol_type:)

    RH_KABI_EXTEND(__u8            remcsum_offload:)

    /* 0/2 bit hole (depending on ndisc_nodetype presence) */

    kmemcheck_bitfield_end(flags2);

#if defined CONFIG_NET_DMA_RH_KABI || defined CONFIG_NET_RX_BUSY_POLL || defined CONFIG_XPS

    union {

        unsigned int    napi_id;

        RH_KABI_EXTEND(unsigned int    sender_cpu)

        RH_KABI_DEPRECATE(dma_cookie_t,    dma_cookie)

    };

#endif

#ifdef CONFIG_NETWORK_SECMARK

    __u32            secmark;

#endif

    union {

        __u32        mark;

        __u32        dropcount;

        __u32        reserved_tailroom;

    };

#ifdef __GENKSYMS__

    __be16            inner_protocol;

#else

    union {

        __be16        inner_protocol;

        __u8        inner_ipproto;

    };

#endif

    __u16            inner_transport_header;

    __u16            inner_network_header;

    __u16            inner_mac_header;

    __u16            transport_header;

    __u16            network_header;

    __u16            mac_header;

    RH_KABI_EXTEND(kmemcheck_bitfield_begin(flags3))

    RH_KABI_EXTEND(__u8    csum_level:)

    RH_KABI_EXTEND(__u8    rh_csum_pad:)

    RH_KABI_EXTEND(__u8    csum_bad:)

    RH_KABI_EXTEND(__u8    offload_fwd_mark:)

    RH_KABI_EXTEND(__u8    sw_hash:)

    RH_KABI_EXTEND(__u8     csum_not_inet:)

    RH_KABI_EXTEND(__u8    dst_pending_confirm:)

    /* 8 bit hole */

    RH_KABI_EXTEND(kmemcheck_bitfield_end(flags3))

    /* private: */

    RH_KABI_EXTEND(__u32    headers_end[])

    /* public: */

    /* RHEL SPECIFIC

     *

     * The following padding has been inserted before ABI freeze to

     * allow extending the structure while preserve ABI. Feel free

     * to replace reserved slots with required structure field

     * additions of your backport, eventually moving the replaced slot

     * before headers_end, if it need to be copied by __copy_skb_header()

     */

    u32            rh_reserved1;

    u32            rh_reserved2;

    u32            rh_reserved3;

    u32            rh_reserved4;

    /* These elements must be at the end, see alloc_skb() for details.  */

    sk_buff_data_t        tail;

    sk_buff_data_t        end;

    unsigned char        *head,

                *data;

    unsigned int        truesize;

    atomic_t        users;

};

skb是管理结构，目前linux 3.10是使用slub的方式来管理skb的缓存，但这个管理有没有什么问题？

据说曾经有人测试过，在2GHz主频的cpu上从slab中分配一个skb需要耗时4us，这个我自己没有测试过，我想如果获取的slab对象位于percpu缓存上，应该不需要这么长时间。但是只要看过slab的管理的话，应该也明白这个消耗虽然比从buddy系统中分配要小很多，但是从绝对值来说也不会太少。对于需要大量消耗skb的网络服务器来说，申请skb和释放skb的消耗就显得比较重了。

解决方法：

1.做一个自己的skb的缓存池，这样申请的时候，不走slab，释放的时候，也只是减少引用计数，不需要还给slab。

2.显然应该实现为percpu的模式，避免查找skb的时候，锁的消耗。

3.各个percpu缓存池的消耗并不均衡，所以要有一个cpu node级别的二级缓存池，保证各个核上缓存池的增长和消耗达到阈值之后能够重新平衡。如果没有做绑定，对于某个流来说，完全可能在A cpu上申请、在B cpu上释放。不要问我为什么不绑定：假设你用docker的话，就不太适合绑定，因为虚拟网卡一般配置为单队列；而且开启xps的话，是根据当前cpuid而不是根据四元组来选择tx队列，同一个流可能选择到不同的tx队列来发送报文，而各个tx队列明显会有忙闲的区别，docker中程序的发包很可能导致tcp乱序严重。

4.在物理机上，有条件的话尽量做flow的绑定，也就是说，某个flow只在某个cpu上运行，保证局部性。这样flow就可以和一部分skb关联起来，填充二层和三层头的动作都可以节省掉。如果你觉得节省这一点无所谓的话，大可以测试一下省掉这些memcpy之后的效果，从我们的测试结果看，效果很好。如果把skb看做一辆货车的话，对应的二层和三层的头就可以看做司机了，你说我们运点货，犯得着老换货车、老换司机么？肯定有人会说，我每次发送的位置又不一样，所以得拷贝二层和三层的头，那假设你大多数发包的时候长度一样呢？

5.因为是自己管理skb的缓存，所以cb 成员就可以随便怎么处理了，很多控制信息都可以放在这里，方便。

6.对于单个流需要的流量比较多的情况，比如流媒体服务器，高清发送，预占的skb可以保证时延可控。

缺点：

1.预占部分内存，通过drop_caches也释放不掉；如果是缓存自增长和收缩模式，也会有一些消耗。

秒客网

linux 自定义模块来缓存skb的意义

相关文章