struct sk_buff与struct socket及struct sock 结构体分析

时间:2023-02-08 11:00:29

sk_buff是Linux网络协议栈最重要的数据结构之一,该数据结构贯穿于整个数据包处理的流程。由于协议采用分层结构,上层向下层传递数据时需要增加包头,下层向上层传递数据时又需要去掉包头。sk_buff中保存了L2,L3,L4层的头指针,这样在层间传递时只需要对数据缓冲区改变头部信息,并调整sk_buff中的指针,而不需要拷贝数据,这样大大减少了内存拷贝的需要。

/** 
*struct sk_buff - socket buffer
*@next: Next buffer in list
*@prev: Previous buffer in list
*@tstamp: Time we arrived
*@sk: Socket we are owned by
*@dev: Device we arrived on/are leaving by
*@cb: Control buffer. Free for use by every layer. Put private vars here
*@_skb_refdst: destination entry (with norefcount bit)
*@sp: the security path, used for xfrm
*@len: Length of actual data
*@data_len: Data length
*@mac_len: Length of link layer header
*@hdr_len: writable header length of cloned skb
*@csum: Checksum (must include start/offset pair)
*@csum_start: Offset from skb->head where checksumming should start
*@csum_offset: Offset from csum_start where checksum should be stored
*@priority: Packet queueing priority
*@local_df: allow local fragmentation
*@cloned: Head may be cloned (check refcnt to be sure)
*@ip_summed: Driver fed us an IP checksum
*@nohdr: Payload reference only, must not modify header
*@nfctinfo: Relationship of this skb to the connection
*@pkt_type: Packet class
*@fclone: skbuff clone status
*@ipvs_property: skbuff is owned by ipvs
*@peeked: this packet has been seen already, so stats have been
*done for it, don't do them again
*@nf_trace: netfilter packet trace flag
*@protocol: Packet protocol from driver
*@destructor: Destruct function
*@nfct: Associated connection, if any
*@nfct_reasm: netfilter conntrack re-assembly pointer
*@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
*@skb_iif: ifindex of device we arrived on
*@tc_index: Traffic control index
*@tc_verd: traffic control verdict
*@rxhash: the packet hash computed on receive
*@queue_mapping: Queue mapping for multiqueue devices
*@ndisc_nodetype: router type (from link layer)
*@ooo_okay: allow the mapping of a socket to a queue to be changed
*@l4_rxhash: indicate rxhash is a canonical 4-tuple hash over transport
*ports.
*@wifi_acked_valid: wifi_acked was set
*@wifi_acked: whether frame was acked on wifi or not
*@no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS
*@dma_cookie: a cookie to one of several possible DMA operations
*done by skb DMA functions
*@secmark: security marking
*@mark: Generic packet mark
*@dropcount: total number of sk_receive_queue overflows
*@vlan_tci: vlan tag control information
*@inner_transport_header: Inner transport layer header (encapsulation)
*@inner_network_header: Network layer header (encapsulation)
*@transport_header: Transport layer header
*@network_header: Network layer header
*@mac_header: Link layer header
*@tail: Tail pointer
*@end: End pointer
*@head: Head of buffer
*@data: Data head pointer
*@truesize: Buffer size
*@users: User count - see {datagram,tcp}.c
*/

struct sk_buff {
/* These two members must be first. */
struct sk_buff*next;
struct sk_buff*prev;

ktime_ttstamp;

struct sock*sk;
struct net_device*dev;

/*
* This is the control buffer. It is free to use for every
* layer. Please put your private variables there. If you
* want to keep them across layers you have to do a skb_clone()
* first. This is owned by whoever has the skb queued ATM.
*/
charcb[48] __aligned(8);

unsigned long_skb_refdst;
#ifdef CONFIG_XFRM
structsec_path*sp;
#endif
unsigned intlen,
data_len;
__u16mac_len,
hdr_len;
union {
__wsumcsum;
struct {
__u16csum_start;
__u16csum_offset;
};
};
__u32priority;
kmemcheck_bitfield_begin(flags1);
__u8local_df:1,
cloned:1,
ip_summed:2,
nohdr:1,
nfctinfo:3;
__u8pkt_type:3,
fclone:2,
ipvs_property:1,
peeked:1,
nf_trace:1;
kmemcheck_bitfield_end(flags1);
__be16protocol;

void(*destructor)(struct sk_buff *skb);
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct nf_conntrack*nfct;
#endif
#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
struct sk_buff*nfct_reasm;
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
struct nf_bridge_info*nf_bridge;
#endif

intskb_iif;

__u32rxhash;

__u16vlan_tci;

#ifdef CONFIG_NET_SCHED
__u16tc_index;/* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
__u16tc_verd;/* traffic control verdict */
#endif
#endif

__u16queue_mapping;
kmemcheck_bitfield_begin(flags2);
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8ndisc_nodetype:2;
#endif
__u8pfmemalloc:1;
__u8ooo_okay:1;
__u8l4_rxhash:1;
__u8wifi_acked_valid:1;
__u8wifi_acked:1;
__u8no_fcs:1;
__u8head_frag:1;
/* Encapsulation protocol and NIC drivers should use
* this flag to indicate to each other if the skb contains
* encapsulated packet or not and maybe use the inner packet
* headers if needed
*/
__u8encapsulation:1;
/* 7/9 bit hole (depending on ndisc_nodetype presence) */
kmemcheck_bitfield_end(flags2);

#ifdef CONFIG_NET_DMA
dma_cookie_tdma_cookie;
#endif
#ifdef CONFIG_NETWORK_SECMARK
__u32secmark;
#endif
union {
__u32mark;
__u32dropcount;
__u32reserved_tailroom;
};

sk_buff_data_tinner_transport_header;
sk_buff_data_tinner_network_header;
sk_buff_data_ttransport_header;
sk_buff_data_tnetwork_header;
sk_buff_data_tmac_header;
/* These elements must be at the end, see alloc_skb() for details. */
sk_buff_data_ttail;
sk_buff_data_tend;
unsigned char*head,
*data;
unsigned inttruesize;
atomic_tusers;
};

struct sk_buff {//介绍 
    struct sk_buff *next, *prev;//双向链表指针
    ktime_t tstamp ;//时间戳
    struct sock *sk;   //对应于传输层,标示属于哪个socket ?
    struct net_device *dev;    //数据来自或者发送自哪个设备
    char cb[48];//控制信息buffer,在每个层都可以用,并且目前为止足够大
    int len;      实际总长度
    int data_len; 数据的长度 //也许是paged的data 
    __u16 mac_len; 数据链路层头的长度
    __u16 hdr_len; writable header length of cloned skb 
    
     sk_buff_data_t   transport_header;   传输层头指针
    sk_buff_data_t   network_header;    网络层头指针
    sk_buff_data_t   mac_header;        数据链路层头

    unsigned char *head; //buffer 头
    unsigned char *data; 数据头
    sk_buff_data_t tail; 数据结尾
    sk_buff_data_t end;  buffer 结尾
    unsigned int truesize; //buffer 大小

    cloned 是不是cloned
    mark 数据包mark
    destructor 销毁函数指针 
    pkt_type : 根据二层头确定的包信息
    __be16 protocol : 三层协议 IP ARP 等,用于和全局数组ptype_base中的数据对比,该数组可以通过dev_add_pack()注册.
}

由于该结构将用于各个层,内核提供了一系列的sk_buff的操作函数
skb_put()  减小tailroom,buffer向后扩展
skb_push() 减小headroom,buffer向上扩张 
skb_trim() cut buffer到一个长度
skb_pull   从数据头cut一定长度的数据 
skb_reserve 增大headroom,减少tailroom,只能用于buffer为空时
skb_headroom headroom的大小
skb_tailroom tailroom的大小

alloc_skb() 分配一个sk_buff结构及buffer区域
kfree_skb() 引用计数减一,如果不再有引用则释放skb和buffer

dev_alloc_skb() 方便接收数据的sk_buff的分配函数
dev_kfree_skb()  

skb_shinfo() 获得和sk_buff 一块分配的struct skb_shared_info 

skb_clone() //复制sk_buff ,但是buffer不变 
pskb_copy()  //拷贝sk_buff和私有的头部,常用于需要修改sk_buff的头部时
skb_copy() //完全拷贝

skb_queue_head_init()
skb_queue_head()
skb_queue_tail()
skb_dequeue()
skb_dequeue_tail()
skb_queue_purge() //list 清空

skb_queue_walk() //遍历list用

在Linux2.6中,struct sk_buff承担了socket的输入输出的传输缓存的任务。
首先,还是先看struct socket的定义

/**
 * struct socket - general BSD socket
 * @state: socket state (%SS_CONNECTED, etc)
 * @type: socket type (%SOCK_STREAM, etc)
 * @flags: socket flags (%SOCK_ASYNC_NOSPACE, etc)
 * @ops: protocol specific socket operations
 * @file: File back pointer for gc
 * @sk: internal networking protocol agnostic socket representation
 * @wq: wait queue for several uses
 */

/*
 * General BSD socket object. The member descriptions below restate the
 * kernel-doc comment that precedes this definition.
 */
struct socket {
    /* Socket state (SS_CONNECTED, etc). */
    socket_state        state;

    /*
     * Socket type (SOCK_STREAM, etc).
     * NOTE(review): the kmemcheck_bitfield_begin/end pair is presumably a
     * kmemcheck annotation around this member - confirm against its definition.
     */
    kmemcheck_bitfield_begin(type);
    short            type;
    kmemcheck_bitfield_end(type);

    /* Socket flags (SOCK_ASYNC_NOSPACE, etc). */
    unsigned long        flags;

    /* Wait queue for several uses. */
    struct socket_wq    *wq;

    /* File back pointer for gc. */
    struct file        *file;
    /* Internal networking protocol agnostic socket representation. */
    struct sock        *sk;
    /* Protocol specific socket operations. */
    const struct proto_ops    *ops;
};

代码中的注释对于每一个变量说的都很清楚——看到这里,我先感叹一下,linux2.6的结构体的注释比老版本要清楚的多。到目前为止,我所看到的关键的结构体,都有清晰的注释。我们可以看出struct socket中的sk成员(类型为struct sock),是socket变量的工作核心。
那么现在跳转到struct sock的定义处。由于struct sock的定义过长,所以只展示一部分。

/*
 * Abridged excerpt of struct sock: only the members relevant to socket
 * buffering are shown; the "skip some codes" markers stand for omitted
 * members of the full definition.
 */
struct sock {
    /*
     * Now struct inet_timewait_sock also uses sock_common, so please just
     * don't add nothing before this first member (__sk_common) --acme
     */

    struct sock_common    __sk_common;
    /* skip some codes */
    int sk_rcvbuf;                          /* receive buffer size, in bytes */
    /* skip some codes */
    int sk_sndbuf;                          /* send buffer size, in bytes */
    struct sk_buff_head    sk_receive_queue; /* list of sk_buffs holding received data */
    struct sk_buff_head    sk_write_queue;   /* list of sk_buffs holding data to send */

};

其中,sk_rcvbuf和sk_sndbuf分别是接收和发送缓存的字节数。而struct sk_buff_head的定义如下:

/*
 * Head of a list of sk_buffs, linked through next/prev.
 */
struct sk_buff_head {
    /* These two members must be first. */
    /* NOTE(review): presumably so the head can be handled like a list node,
     * since struct sk_buff also begins with next/prev - confirm. */
    struct sk_buff    *next;
    struct sk_buff    *prev;

    /* NOTE(review): presumably the number of buffers on the list - confirm. */
    __u32        qlen;
    /* NOTE(review): presumably guards list manipulation - confirm with users. */
    spinlock_t    lock;
};

可以看出socket的接收和发送缓存是使用一个双链表将sk_buff组织起来的。