Linux 协议栈分析 socket——笔记

通过查看socket的帮助手册可以得到socket的定义形式为：

int socket(int domain, int type, int protocol);

domain的有效值如下:

       AF_UNIX, AF_LOCAL   Local communication              unix(7)

       AF_INET             IPv4 Internet protocols          ip(7)

       AF_INET6            IPv6 Internet protocols          ipv6(7)

       AF_IPX              IPX - Novell protocols

       AF_NETLINK          Kernel user interface device     netlink(7)

       AF_X25              ITU-T X.25 / ISO-8208 protocol   x25(7)

       AF_AX25             Amateur radio AX.25 protocol

       AF_ATMPVC           Access to raw ATM PVCs

       AF_APPLETALK        Appletalk                        ddp(7)

       AF_PACKET           Low level packet interface       packet(7)

而type的取值范围为:

       SOCK_STREAM     Provides sequenced, reliable, two-way, connection-based

                       byte  streams.  An out-of-band data transmission mecha‐

                       nism may be supported.

       SOCK_DGRAM      Supports datagrams (connectionless, unreliable messages

                       of a fixed maximum length).

       SOCK_SEQPACKET  Provides  a  sequenced,  reliable,  two-way connection-

                       based data transmission path  for  datagrams  of  fixed

                       maximum  length;  a  consumer  is  required  to read an

                       entire packet with each input system call.

       SOCK_RAW        Provides raw network protocol access.

       SOCK_RDM        Provides a reliable datagram layer that does not  guar‐

                       antee ordering.

       SOCK_PACKET     Obsolete  and  should  not be used in new programs; see

                       packet(7).

自内核版本2.6.27起，type参数还有第二个用途：除了指定socket的类型之外，还可以通过把相应的二进制位置1来修改socket的行为。也就是说，type在取上述某个值之后，还可以再与以下标志按位或(OR)。这一点可以在socket进入内核后的源代码(sys_socket)中得到证实。

       SOCK_NONBLOCK   Set  the  O_NONBLOCK  file  status flag on the new open

                       file description.  Using this flag saves extra calls to

                       fcntl(2) to achieve the same result.

       SOCK_CLOEXEC    Set the close-on-exec (FD_CLOEXEC) flag on the new file

                       descriptor.  See the description of the O_CLOEXEC  flag

                       in open(2) for reasons why this may be useful.

protocol一般为0，表示由相应的协议族根据type选择一个合适的默认协议。
socket函数经过前述的方式进入内核后会最终由sys_socket(net/socket.c)来完成。

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)

  int retval;

  struct socket *sock;

  int flags;

  /* Check the SOCK_* constants for consistency.  */

  BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);

  BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);

  BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);

  BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

  flags = type & ~SOCK_TYPE_MASK;

  if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))

    return -EINVAL;

  type &= SOCK_TYPE_MASK;

  if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))

    flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

  retval = sock_create(family, type, protocol, &sock);

  if (retval < 0)

    goto out;

  retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));

  if (retval < 0)

    goto out_release;

out:

  /* It may be already another descriptor 8) Not kernel problem. */

  return retval;

out_release:

  sock_release(sock);

  return retval;

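1278~1281行(即从 flags = type & ~SOCK_TYPE_MASK 到 type &= SOCK_TYPE_MASK 这几行)先从type中分离出标志位flags，并检查其是否合法(只允许SOCK_CLOEXEC和SOCK_NONBLOCK)，然后再用SOCK_TYPE_MASK取得真正的socket类型。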
我们知道socket对于用户而言就是一个已经打开的特殊文件，而内核则为插口(socket)定义了一种特殊的文件类型，并形成了一个特殊的文件系统sockfs(net/socket.c)。sys_socket中调用了两个函数sock_create和sock_map_fd，可以看到这两个函数共用同一个sock参数：sock_create创建的struct socket是供内核管理socket用的，而sock_map_fd则把它映射到一个已打开的文件上，并把对应的文件描述符返回给用户。
sockfs的建立过程省略，sockfs的定义如下：

static struct vfsmount *sock_mnt __read_mostly;

static struct file_system_type sock_fs_type = {

  .name =    "sockfs",

  .get_sb =  sockfs_get_sb,

  .kill_sb =  kill_anon_super,

};

而所谓的通过socket函数创建一个插口，就是在sockfs中创建一个特殊文件，或者说是一个结点，并为实现相应的插口功能建立起一整套数据结构。所以首先通过sock_create创建一个struct socket数据结构，然后通过sock_map_fd把它映射到一个已经打开的文件上。在分析sock_create和sock_map_fd之前，先看看struct socket的定义(include/linux/net.h)：

/**

 *  struct socket - general BSD socket

 *  @state: socket state (%SS_CONNECTED, etc)

 *  @type: socket type (%SOCK_STREAM, etc)

 *  @flags: socket flags (%SOCK_ASYNC_NOSPACE, etc)

 *  @ops: protocol specific socket operations

 *  @fasync_list: Asynchronous wake up list

 *  @file: File back pointer for gc

 *  @sk: internal networking protocol agnostic socket representation

 *  @wait: wait queue for several uses

*/

struct socket {

  socket_state    state;

  kmemcheck_bitfield_begin(type);

  short      type;

  kmemcheck_bitfield_end(type);

  unsigned long    flags;

/*

   * Please keep fasync_list & wait fields in the same cache line

*/

  struct fasync_struct  *fasync_list;

  wait_queue_head_t  wait;

  struct file    *file;

  struct sock    *sk;

  const struct proto_ops  *ops;

};

struct proto_ops {

  int    family;

  struct module  *owner;

  int    (*release)   (struct socket *sock);

  int    (*bind)       (struct socket *sock,

              struct sockaddr *myaddr,

              int sockaddr_len);

  int    (*connect)   (struct socket *sock,

              struct sockaddr *vaddr,

              int sockaddr_len, int flags);

  int    (*socketpair)(struct socket *sock1,

              struct socket *sock2);

  int    (*accept)    (struct socket *sock,

              struct socket *newsock, int flags);

  int    (*getname)   (struct socket *sock,

              struct sockaddr *addr,

              int *sockaddr_len, int peer);

  unsigned int  (*poll)       (struct file *file, struct socket *sock,

              struct poll_table_struct *wait);

  int    (*ioctl)     (struct socket *sock, unsigned int cmd,

              unsigned long arg);

  int     (*compat_ioctl) (struct socket *sock, unsigned int cmd,

              unsigned long arg);

  int    (*listen)    (struct socket *sock, int len);

  int    (*shutdown)  (struct socket *sock, int flags);

  int    (*setsockopt)(struct socket *sock, int level,

              int optname, char __user *optval, unsigned int optlen);

  int    (*getsockopt)(struct socket *sock, int level,

              int optname, char __user *optval, int __user *optlen);

  int    (*compat_setsockopt)(struct socket *sock, int level,

              int optname, char __user *optval, unsigned int optlen);

  int    (*compat_getsockopt)(struct socket *sock, int level,

              int optname, char __user *optval, int __user *optlen);

  int    (*sendmsg)   (struct kiocb *iocb, struct socket *sock,

              struct msghdr *m, size_t total_len);

  int    (*recvmsg)   (struct kiocb *iocb, struct socket *sock,

              struct msghdr *m, size_t total_len,

              int flags);

  int    (*mmap)       (struct file *file, struct socket *sock,

              struct vm_area_struct * vma);

  ssize_t    (*sendpage)  (struct socket *sock, struct page *page,

              int offset, size_t size, int flags);

  ssize_t   (*splice_read)(struct socket *sock,  loff_t *ppos,

               struct pipe_inode_info *pipe, size_t len, unsigned int flags);

};

接下来分析sock_create(net/socket.c)，sock_create会调用__sock_create。

static int __sock_create(struct net *net, int family, int type, int protocol,

       struct socket **res, int kern)

  int err;

  struct socket *sock;

  const struct net_proto_family *pf;

/*

   *      Check protocol is in range

*/

  if (family < 0 || family >= NPROTO)

    return -EAFNOSUPPORT;

  if (type < 0 || type >= SOCK_MAX)

    return -EINVAL;

  /* Compatibility.

     This uglymoron is moved from INET layer to here to avoid

     deadlock in module load.

*/

  if (family == PF_INET && type == SOCK_PACKET) {

    static int warned;

    if (!warned) {

      warned = 1;

      printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",

             current->comm);

    family = PF_PACKET;

  err = security_socket_create(family, type, protocol, kern);

  if (err)

    return err;

/*

   *  Allocate the socket and allow the family to set things up. if

   *  the protocol is 0, the family is instructed to select an appropriate

   *  default.

*/

  sock = sock_alloc();

  if (!sock) {

    if (net_ratelimit())

      printk(KERN_WARNING "socket: no more sockets\n");

    return -ENFILE;  /* Not exactly a match, but its the

           closest posix thing */

  sock->type = type;

#ifdef CONFIG_MODULES

  /* Attempt to load a protocol module if the find failed.

   * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user

   * requested real, full-featured networking support upon configuration.

   * Otherwise module support will break!

*/

  if (net_families[family] == NULL)

    request_module("net-pf-%d", family);

#endif

  rcu_read_lock();

  pf = rcu_dereference(net_families[family]);

  err = -EAFNOSUPPORT;

  if (!pf)

    goto out_release;

/*

   * We will call the ->create function, that possibly is in a loadable

   * module, so we have to bump that loadable module refcnt first.

*/

  if (!try_module_get(pf->owner))

    goto out_release;

  /* Now protected by module ref count */

  rcu_read_unlock();

  err = pf->create(net, sock, protocol);

  if (err < 0)

    goto out_module_put;

/*

   * Now to bump the refcnt of the [loadable] module that owns this

   * socket at sock_release time we decrement its refcnt.

*/

  if (!try_module_get(sock->ops->owner))

    goto out_module_busy;

/*

   * Now that we're done with the ->create function, the [loadable]

   * module can have its refcnt decremented

*/

  module_put(pf->owner);

  err = security_socket_post_create(sock, family, type, protocol, kern);

  if (err)

    goto out_sock_release;

  *res = sock;

  return 0;

out_module_busy:

  err = -EAFNOSUPPORT;

out_module_put:

  sock->ops = NULL;

  module_put(pf->owner);

out_sock_release:

  sock_release(sock);

  return err;

out_release:

  rcu_read_unlock();

  goto out_sock_release;

int sock_create(int family, int type, int protocol, struct socket **res)

  return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);

1150~1171行做的事很简单，不过是参数检查，主要是检查family和type是否在合法范围内。
接下来的security_socket_create以及后面的security_socket_post_create，在没有开启CONFIG_SECURITY_NETWORK的情况下，都是在include/linux/security.h中定义的空函数：

static inline int security_socket_create(int family, int type,

           int protocol, int kern)

  return 0;

static inline int security_socket_post_create(struct socket *sock,

                int family,

                int type,

                int protocol, int kern)

  return 0;

1182行的sock_alloc的代码如下：

static struct socket *sock_alloc(void)

  struct inode *inode;

  struct socket *sock;

  inode = new_inode(sock_mnt->mnt_sb);

  if (!inode)

    return NULL;

  sock = SOCKET_I(inode);

  kmemcheck_annotate_bitfield(sock, type);

  inode->i_mode = S_IFSOCK | S_IRWXUGO;

  inode->i_uid = current_fsuid();

  inode->i_gid = current_fsgid();

  percpu_add(sockets_in_use, 1);

  return sock;

其中的new_inode是在/fs/inode.c中定义的，它又会调用同一文件中的alloc_inode：

static struct inode *alloc_inode(struct super_block *sb)

  struct inode *inode;

  if (sb->s_op->alloc_inode)

    inode = sb->s_op->alloc_inode(sb);

  else

    inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);

  if (!inode)

    return NULL;

  if (unlikely(inode_init_always(sb, inode))) {

    if (inode->i_sb->s_op->destroy_inode)

      inode->i_sb->s_op->destroy_inode(inode);

    else

      kmem_cache_free(inode_cachep, inode);

    return NULL;

  return inode;

struct inode *new_inode(struct super_block *sb)

/*

   * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW

   * error if st_ino won't fit in target struct field. Use 32bit counter

   * here to attempt to avoid that.

*/

  static unsigned int last_ino;

  struct inode *inode;

  spin_lock_prefetch(&inode_lock);

  inode = alloc_inode(sb);

  if (inode) {

    spin_lock(&inode_lock);

    __inode_add_to_lists(sb, NULL, inode);

    inode->i_ino = ++last_ino;

    inode->i_state = 0;

    spin_unlock(&inode_lock);

  return inode;

EXPORT_SYMBOL(new_inode);

可以看出new_inode会调用alloc_inode分配inode，而alloc_inode会通过sb->s_op->alloc_inode调用sockfs在VFS中注册的相应函数来处理。那这个函数是什么呢？先来看一看/net/socket.c：

static struct inode *sock_alloc_inode(struct super_block *sb)

  struct socket_alloc *ei;

  ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);

  if (!ei)

    return NULL;

  init_waitqueue_head(&ei->socket.wait);

  ei->socket.fasync_list = NULL;

  ei->socket.state = SS_UNCONNECTED;

  ei->socket.flags = 0;

  ei->socket.ops = NULL;

  ei->socket.sk = NULL;

  ei->socket.file = NULL;

  return &ei->vfs_inode;

static const struct super_operations sockfs_ops = {

  .alloc_inode =  sock_alloc_inode,

  .destroy_inode =sock_destroy_inode,

  .statfs =  simple_statfs,

};

为帮助理解列出struct socket_alloc 结构体的定义。

struct socket_alloc {

  struct socket socket;

  struct inode vfs_inode;

};

static inline struct socket *SOCKET_I(struct inode *inode)

  return &container_of(inode, struct socket_alloc, vfs_inode)->socket;

可以看到这个函数其实就是sock_alloc_inode。该函数分配了一个struct socket_alloc类型的结构体，然后返回这个结构体中成员变量vfs_inode的地址，可以看出来这就是一个inode结构。然后就回到了sock_alloc函数的第489行(sock = SOCKET_I(inode);)，通过SOCKET_I(内部使用container_of)获得与vfs_inode同在socket_alloc结构体中的成员socket的地址。最后程序返回到__sock_create的1190行，继续执行sock_alloc调用之后的代码。

从1192行(#ifdef CONFIG_MODULES)开始的代码说明：如果编译内核时开启了CONFIG_MODULES(即内核模块支持)选项，就先检查net_families[family]是否为NULL，也就是内核当前是否已有支持family(即用户传入的domain)所指定网域的代码；如果没有，则通过request_module("net-pf-%d", family)尝试加载相应的模块。

说到这里就先看看1204行用到的net_families这个数组。很明显它是控制和操作各个网域的控制结构体的集合，由变量pf的定义可以看出，数组中的每个元素都是指向struct net_proto_family(/include/linux/net.h)的指针：

struct net_proto_family {

  int    family;

  int    (*create)(struct net *net, struct socket *sock, int protocol);

  struct module  *owner;

};

然后1219行通过pf调用相应网域的create函数(err = pf->create(net, sock, protocol);)。很容易看出，AF_UNIX、AF_INET、AF_INET6、AF_PACKET这些网域所对应的create函数肯定各不相同。接下来我们以AF_INET为例说明，在/net/ipv4/af_inet.c中有如下定义：

static struct net_proto_family inet_family_ops = {

  .family = PF_INET,

  .create = inet_create,

  .owner  = THIS_MODULE,

};

由936行(.create = inet_create)可以得出，对于AF_INET，其create函数为inet_create，它定义于同一文件中。

static int inet_create(struct net *net, struct socket *sock, int protocol)

  struct sock *sk;

  struct inet_protosw *answer;

  struct inet_sock *inet;

  struct proto *answer_prot;

  unsigned char answer_flags;

  char answer_no_check;

  int try_loading_module = 0;

  int err;

  if (unlikely(!inet_ehash_secret))

    if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)

      build_ehash_secret();

  sock->state = SS_UNCONNECTED;

  /* Look for the requested type/protocol pair. */

lookup_protocol:

  err = -ESOCKTNOSUPPORT;

  rcu_read_lock();

  list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

    err = 0;

    /* Check the non-wild match. */

    if (protocol == answer->protocol) {

      if (protocol != IPPROTO_IP)

        break;

    } else {

      /* Check for the two wild cases. */

      if (IPPROTO_IP == protocol) {

        protocol = answer->protocol;

        break;

      if (IPPROTO_IP == answer->protocol)

        break;

    err = -EPROTONOSUPPORT;

  if (unlikely(err)) {

    if (try_loading_module < 2) {

      rcu_read_unlock();

/*

       * Be more specific, e.g. net-pf-2-proto-132-type-1

       * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)

*/

      if (++try_loading_module == 1)

        request_module("net-pf-%d-proto-%d-type-%d",

                 PF_INET, protocol, sock->type);

/*

       * Fall back to generic, e.g. net-pf-2-proto-132

       * (net-pf-PF_INET-proto-IPPROTO_SCTP)

*/

      else

        request_module("net-pf-%d-proto-%d",

                 PF_INET, protocol);

      goto lookup_protocol;

    } else

      goto out_rcu_unlock;

  err = -EPERM;

  if (answer->capability > 0 && !capable(answer->capability))

    goto out_rcu_unlock;

  err = -EAFNOSUPPORT;

  if (!inet_netns_ok(net, protocol))

    goto out_rcu_unlock;

  sock->ops = answer->ops;

  answer_prot = answer->prot;

  answer_no_check = answer->no_check;

  answer_flags = answer->flags;

  rcu_read_unlock();

  WARN_ON(answer_prot->slab == NULL);

  err = -ENOBUFS;

  sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);

  if (sk == NULL)

    goto out;

  err = 0;

  sk->sk_no_check = answer_no_check;

  if (INET_PROTOSW_REUSE & answer_flags)

    sk->sk_reuse = 1;

  inet = inet_sk(sk);

  inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

  if (SOCK_RAW == sock->type) {

    inet->num = protocol;

    if (IPPROTO_RAW == protocol)

      inet->hdrincl = 1;

  if (ipv4_config.no_pmtu_disc)

    inet->pmtudisc = IP_PMTUDISC_DONT;

  else

    inet->pmtudisc = IP_PMTUDISC_WANT;

  inet->id = 0;

  sock_init_data(sock, sk);

  sk->sk_destruct     = inet_sock_destruct;

  sk->sk_protocol     = protocol;

  sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

  inet->uc_ttl  = -1;

  inet->mc_loop  = 1;

  inet->mc_ttl  = 1;

  inet->mc_all  = 1;

  inet->mc_index  = 0;

  inet->mc_list  = NULL;

  sk_refcnt_debug_inc(sk);

  if (inet->num) {

    /* It assumes that any protocol which allows

     * the user to assign a number at socket

     * creation time automatically

     * shares.

*/

    inet->sport = htons(inet->num);

    /* Add to protocol hash chains. */

    sk->sk_prot->hash(sk);

  if (sk->sk_prot->init) {

    err = sk->sk_prot->init(sk);

    if (err)

      sk_common_release(sk);

out:

  return err;

out_rcu_unlock:

  rcu_read_unlock();

  goto out;

第283到325行(即lookup_protocol标号之后的查找过程)就是通过type和protocol从inetsw中找出对应的struct inet_protosw结构体。inetsw是在net/ipv4/af_inet.c中定义的：

/* The inetsw table contains everything that inet_create needs to

 * build a new socket.

*/

static struct list_head inetsw[SOCK_MAX];

static DEFINE_SPINLOCK(inetsw_lock);

而struct inet_protosw则是在/include/net/protocol.h中定义的：

/* This is used to register socket interfaces for IP protocols.  */

struct inet_protosw {

  struct list_head list;

        /* These two fields form the lookup key.  */

  unsigned short   type;     /* This is the 2nd argument to socket(2). */

  unsigned short   protocol; /* This is the L4 protocol number.  */

  struct proto   *prot;

  const struct proto_ops *ops;

  int              capability; /* Which (if any) capability do

              * we need to use this socket

              * interface?

*/

  char             no_check;   /* checksum on rcv/xmit/none? */

  unsigned char   flags;      /* See INET_PROTOSW_* below.  */

};

inetsw其实就是一个由链表头组成的数组，采用的是Linux内核中典型的链表组织方式：数组按type索引，每个type对应一条链表。inetsw中的各条链表是通过inet_register_protosw填充(初始化)的：

void inet_register_protosw(struct inet_protosw *p)

  struct list_head *lh;

  struct inet_protosw *answer;

  int protocol = p->protocol;

  struct list_head *last_perm;

  spin_lock_bh(&inetsw_lock);

  if (p->type >= SOCK_MAX)

    goto out_illegal;

  /* If we are trying to override a permanent protocol, bail. */

  answer = NULL;

  last_perm = &inetsw[p->type];

  list_for_each(lh, &inetsw[p->type]) {

    answer = list_entry(lh, struct inet_protosw, list);

    /* Check only the non-wild match. */

    if (INET_PROTOSW_PERMANENT & answer->flags) {

      if (protocol == answer->protocol)

        break;

      last_perm = lh;

    answer = NULL;

  if (answer)

    goto out_permanent;

  /* Add the new entry after the last permanent entry if any, so that

   * the new entry does not override a permanent entry when matched with

   * a wild-card protocol. But it is allowed to override any existing

   * non-permanent entry.  This means that when we remove this entry, the

   * system automatically returns to the old behavior.

*/

  list_add_rcu(&p->list, last_perm);

out:

  spin_unlock_bh(&inetsw_lock);

  return;

out_permanent:

  printk(KERN_ERR "Attempt to override permanent protocol %d.\n",

         protocol);

  goto out;

out_illegal:

  printk(KERN_ERR

         "Ignoring attempt to register invalid socket type %d.\n",

         p->type);

  goto out;

EXPORT_SYMBOL(inet_register_protosw);

对于inet_register_protosw的调用是在inet_init中的第1593行进行的。

static int __init inet_init(void)

  struct sk_buff *dummy_skb;

  struct inet_protosw *q;

  struct list_head *r;

  int rc = -EINVAL;

  BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));

  rc = proto_register(&tcp_prot, 1);

  if (rc)

    goto out;

  rc = proto_register(&udp_prot, 1);

  if (rc)

    goto out_unregister_tcp_proto;

  rc = proto_register(&raw_prot, 1);

  if (rc)

    goto out_unregister_udp_proto;

/*

   *  Tell SOCKET that we are alive...

*/

  (void)sock_register(&inet_family_ops);

#ifdef CONFIG_SYSCTL

  ip_static_sysctl_init();

#endif

/*

   *  Add all the base protocols.

*/

  if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)

    printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");

  if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)

    printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");

  if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)

    printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");

#ifdef CONFIG_IP_MULTICAST

  if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)

    printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");

#endif

  /* Register the socket-side information for inet_create. */

  for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)

    INIT_LIST_HEAD(r);

  for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)

    inet_register_protosw(q);

/*

   *  Set the ARP module up

*/

  arp_init();

/*

   *  Set the IP module up

*/

  ip_init();

  tcp_v4_init();

  /* Setup TCP slab cache for open requests. */

  tcp_init();

  /* Setup UDP memory threshold */

  udp_init();

  /* Add UDP-Lite (RFC 3828) */

  udplite4_register();

/*

   *  Set the ICMP layer up

*/

  if (icmp_init() < 0)

    panic("Failed to create the ICMP control socket.\n");

/*

   *  Initialise the multicast router

*/

#if defined(CONFIG_IP_MROUTE)

  if (ip_mr_init())

    printk(KERN_CRIT "inet_init: Cannot init ipv4 mroute\n");

#endif

/*

   *  Initialise per-cpu ipv4 mibs

*/

  if (init_ipv4_mibs())

    printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n");

  ipv4_proc_init();

  ipfrag_init();

  dev_add_pack(&ip_packet_type);

  rc = 0;

out:

  return rc;

out_unregister_udp_proto:

  proto_unregister(&udp_prot);

out_unregister_tcp_proto:

  proto_unregister(&tcp_prot);

  goto out;

fs_initcall(inet_init);

从1592行可以看出初始化inetsw是用的inetsw_array数组，再看看inetsw_array数组。

const struct proto_ops inet_stream_ops = {

  .family       = PF_INET,

  .owner       = THIS_MODULE,

  .release     = inet_release,

  .bind       = inet_bind,

  .connect     = inet_stream_connect,

  .socketpair     = sock_no_socketpair,

  .accept       = inet_accept,

  .getname     = inet_getname,

  .poll       = tcp_poll,

  .ioctl       = inet_ioctl,

  .listen       = inet_listen,

  .shutdown     = inet_shutdown,

  .setsockopt     = sock_common_setsockopt,

  .getsockopt     = sock_common_getsockopt,

  .sendmsg     = tcp_sendmsg,

  .recvmsg     = sock_common_recvmsg,

  .mmap       = sock_no_mmap,

  .sendpage     = tcp_sendpage,

  .splice_read     = tcp_splice_read,

#ifdef CONFIG_COMPAT

  .compat_setsockopt = compat_sock_common_setsockopt,

  .compat_getsockopt = compat_sock_common_getsockopt,

#endif

};

EXPORT_SYMBOL(inet_stream_ops);

const struct proto_ops inet_dgram_ops = {

  .family       = PF_INET,

  .owner       = THIS_MODULE,

  .release     = inet_release,

  .bind       = inet_bind,

  .connect     = inet_dgram_connect,

  .socketpair     = sock_no_socketpair,

  .accept       = sock_no_accept,

  .getname     = inet_getname,

  .poll       = udp_poll,

  .ioctl       = inet_ioctl,

  .listen       = sock_no_listen,

  .shutdown     = inet_shutdown,

  .setsockopt     = sock_common_setsockopt,

  .getsockopt     = sock_common_getsockopt,

  .sendmsg     = inet_sendmsg,

  .recvmsg     = sock_common_recvmsg,

  .mmap       = sock_no_mmap,

  .sendpage     = inet_sendpage,

#ifdef CONFIG_COMPAT

  .compat_setsockopt = compat_sock_common_setsockopt,

  .compat_getsockopt = compat_sock_common_getsockopt,

#endif

};

EXPORT_SYMBOL(inet_dgram_ops);

/*

 * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without

 * udp_poll

*/

static const struct proto_ops inet_sockraw_ops = {

  .family       = PF_INET,

  .owner       = THIS_MODULE,

  .release     = inet_release,

  .bind       = inet_bind,

  .connect     = inet_dgram_connect,

  .socketpair     = sock_no_socketpair,

  .accept       = sock_no_accept,

  .getname     = inet_getname,

  .poll       = datagram_poll,

  .ioctl       = inet_ioctl,

  .listen       = sock_no_listen,

  .shutdown     = inet_shutdown,

  .setsockopt     = sock_common_setsockopt,

  .getsockopt     = sock_common_getsockopt,

  .sendmsg     = inet_sendmsg,

  .recvmsg     = sock_common_recvmsg,

  .mmap       = sock_no_mmap,

  .sendpage     = inet_sendpage,

#ifdef CONFIG_COMPAT

  .compat_setsockopt = compat_sock_common_setsockopt,

  .compat_getsockopt = compat_sock_common_getsockopt,

#endif

};

static struct net_proto_family inet_family_ops = {

  .family = PF_INET,

  .create = inet_create,

  .owner  = THIS_MODULE,

};

/* Upon startup we insert all the elements in inetsw_array[] into

 * the linked list inetsw.

*/

static struct inet_protosw inetsw_array[] =

    .type =       SOCK_STREAM,

    .protocol =   IPPROTO_TCP,

    .prot =       &tcp_prot,

    .ops =        &inet_stream_ops,

    .capability = -1,

    .no_check =   0,

    .flags =      INET_PROTOSW_PERMANENT |

            INET_PROTOSW_ICSK,

},

    .type =       SOCK_DGRAM,

    .protocol =   IPPROTO_UDP,

    .prot =       &udp_prot,

    .ops =        &inet_dgram_ops,

    .capability = -1,

    .no_check =   UDP_CSUM_DEFAULT,

    .flags =      INET_PROTOSW_PERMANENT,

},

         .type =       SOCK_RAW,

         .protocol =   IPPROTO_IP,  /* wild card */

         .prot =       &raw_prot,

         .ops =        &inet_sockraw_ops,

         .capability = CAP_NET_RAW,

         .no_check =   UDP_CSUM_DEFAULT,

         .flags =      INET_PROTOSW_REUSE,

};

#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)

假设我们分析ipv4中的TCP协议，其它协议也可以参照分析。现在回到inet_create函数，这个函数最重要的一行就是335行(sock->ops = answer->ops;)，这一行的作用就是初始化套接口socket所对应的操作函数集。例如，如果用socket(AF_INET, SOCK_STREAM, 0);创建套接字，则内核就会在这里为这个套接字关联上相应的TCP操作函数集inet_stream_ops，以后在这个套接字上的各种操作，如accept、listen、bind、send、recv，都会通过这些函数完成。
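接下来在inet_create中的344行之后就是通过sk_alloc分配一个struct sock结构体，这个sock结构和socket结构是一一对应的，两个结构各有一个成员指向对方(socket中的sk和sock中的sk_socket)。struct sock是在include/net/sock.h中定义的，它有两个非常重要的成员sk_receive_queue和sk_write_queue，分别是接收队列和发送队列。还有两个成员sk_rcvbuf、sk_sndbuf分别代表接收和发送缓冲区的大小，默认是32767字节，是在sock_init_data(net/core/sock.c)中初始化的(其初值应取自sysctl_rmem_default和sysctl_wmem_default，具体数值随内核版本和配置而异，有待核对)。另外，对于有连接模式可能要求超时重传，所以还有一个定时器成员sk_timer(类型为struct timer_list)。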

/**

  *  struct sock - network layer representation of sockets

  *  @__sk_common: shared layout with inet_timewait_sock

  *  @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN

  *  @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings

  *  @sk_lock:  synchronizer

  *  @sk_rcvbuf: size of receive buffer in bytes

  *  @sk_sleep: sock wait queue

  *  @sk_dst_cache: destination cache

  *  @sk_dst_lock: destination cache lock

  *  @sk_policy: flow policy

  *  @sk_rmem_alloc: receive queue bytes committed

  *  @sk_receive_queue: incoming packets

  *  @sk_wmem_alloc: transmit queue bytes committed

  *  @sk_write_queue: Packet sending queue

  *  @sk_async_wait_queue: DMA copied packets

  *  @sk_omem_alloc: "o" is "option" or "other"

  *  @sk_wmem_queued: persistent queue size

  *  @sk_forward_alloc: space allocated forward

  *  @sk_allocation: allocation mode

  *  @sk_sndbuf: size of send buffer in bytes

  *  @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,

  *       %SO_OOBINLINE settings, %SO_TIMESTAMPING settings

  *  @sk_no_check: %SO_NO_CHECK setting, wether or not checkup packets

  *  @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)

  *  @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)

  *  @sk_gso_max_size: Maximum GSO segment size to build

  *  @sk_lingertime: %SO_LINGER l_linger setting

  *  @sk_backlog: always used with the per-socket spinlock held

  *  @sk_callback_lock: used with the callbacks in the end of this struct

  *  @sk_error_queue: rarely used

  *  @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,

  *        IPV6_ADDRFORM for instance)

  *  @sk_err: last error

  *  @sk_err_soft: errors that don't cause failure but are the cause of a

  *          persistent failure not just 'timed out'

  *  @sk_drops: raw/udp drops counter

  *  @sk_ack_backlog: current listen backlog

  *  @sk_max_ack_backlog: listen backlog set in listen()

  *  @sk_priority: %SO_PRIORITY setting

  *  @sk_type: socket type (%SOCK_STREAM, etc)

  *  @sk_protocol: which protocol this socket belongs in this network family

  *  @sk_peercred: %SO_PEERCRED setting

  *  @sk_rcvlowat: %SO_RCVLOWAT setting

  *  @sk_rcvtimeo: %SO_RCVTIMEO setting

  *  @sk_sndtimeo: %SO_SNDTIMEO setting

  *  @sk_filter: socket filtering instructions

  *  @sk_protinfo: private area, net family specific, when not using slab

  *  @sk_timer: sock cleanup timer

  *  @sk_stamp: time stamp of last packet received

  *  @sk_socket: Identd and reporting IO signals

  *  @sk_user_data: RPC layer private data

  *  @sk_sndmsg_page: cached page for sendmsg

  *  @sk_sndmsg_off: cached offset for sendmsg

  *  @sk_send_head: front of stuff to transmit

  *  @sk_security: used by security modules

  *  @sk_mark: generic packet mark

  *  @sk_write_pending: a write to stream socket waits to start

  *  @sk_state_change: callback to indicate change in the state of the sock

  *  @sk_data_ready: callback to indicate there is data to be processed

  *  @sk_write_space: callback to indicate there is bf sending space available

  *  @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)

  *  @sk_backlog_rcv: callback to process the backlog

  *  @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0

*/

struct sock {

/*

   * Now struct inet_timewait_sock also uses sock_common, so please just

   * don't add nothing before this first member (__sk_common) --acme

*/

  struct sock_common  __sk_common;

#define sk_node      __sk_common.skc_node

#define sk_nulls_node    __sk_common.skc_nulls_node

#define sk_refcnt    __sk_common.skc_refcnt

#define sk_copy_start    __sk_common.skc_hash

#define sk_hash      __sk_common.skc_hash

#define sk_family    __sk_common.skc_family

#define sk_state    __sk_common.skc_state

#define sk_reuse    __sk_common.skc_reuse

#define sk_bound_dev_if    __sk_common.skc_bound_dev_if

#define sk_bind_node    __sk_common.skc_bind_node

#define sk_prot      __sk_common.skc_prot

#define sk_net      __sk_common.skc_net

  kmemcheck_bitfield_begin(flags);

  unsigned int    sk_shutdown  : 2,

        sk_no_check  : 2,

        sk_userlocks : 4,

        sk_protocol  : 8,

        sk_type      : 16;

  kmemcheck_bitfield_end(flags);

  int      sk_rcvbuf;

  socket_lock_t    sk_lock;

/*

   * The backlog queue is special, it is always used with

   * the per-socket spinlock held and requires low latency

   * access. Therefore we special case it's implementation.

*/

  struct {

    struct sk_buff *head;

    struct sk_buff *tail;

  } sk_backlog;

  wait_queue_head_t  *sk_sleep;

  struct dst_entry  *sk_dst_cache;

#ifdef CONFIG_XFRM

  struct xfrm_policy  *sk_policy[2];

#endif

  rwlock_t    sk_dst_lock;

  atomic_t    sk_rmem_alloc;

  atomic_t    sk_wmem_alloc;

  atomic_t    sk_omem_alloc;

  int      sk_sndbuf;

  struct sk_buff_head  sk_receive_queue;

  struct sk_buff_head  sk_write_queue;

#ifdef CONFIG_NET_DMA

  struct sk_buff_head  sk_async_wait_queue;

#endif

  int      sk_wmem_queued;

  int      sk_forward_alloc;

  gfp_t      sk_allocation;

  int      sk_route_caps;

  int      sk_gso_type;

  unsigned int    sk_gso_max_size;

  int      sk_rcvlowat;

  unsigned long     sk_flags;

  unsigned long          sk_lingertime;

  struct sk_buff_head  sk_error_queue;

  struct proto    *sk_prot_creator;

  rwlock_t    sk_callback_lock;

  int      sk_err,

        sk_err_soft;

  atomic_t    sk_drops;

  unsigned short    sk_ack_backlog;

  unsigned short    sk_max_ack_backlog;

  __u32      sk_priority;

  struct ucred    sk_peercred;

  long      sk_rcvtimeo;

  long      sk_sndtimeo;

  struct sk_filter        *sk_filter;

  void      *sk_protinfo;

  struct timer_list  sk_timer;

  ktime_t      sk_stamp;

  struct socket    *sk_socket;

  void      *sk_user_data;

  struct page    *sk_sndmsg_page;

  struct sk_buff    *sk_send_head;

  __u32      sk_sndmsg_off;

  int      sk_write_pending;

#ifdef CONFIG_SECURITY

  void      *sk_security;

#endif

  __u32      sk_mark;

  /* XXX 4 bytes hole on 64 bit */

  void      (*sk_state_change)(struct sock *sk);

  void      (*sk_data_ready)(struct sock *sk, int bytes);

  void      (*sk_write_space)(struct sock *sk);

  void      (*sk_error_report)(struct sock *sk);

    int      (*sk_backlog_rcv)(struct sock *sk,

              struct sk_buff *skb);

  void                    (*sk_destruct)(struct sock *sk);

};

在分析sk_alloc之前先分析一下answer_prot. answer_prot是struct proto类型(include/net/sock.h)

/* Networking protocol blocks we attach to sockets.

 * socket layer -> transport layer interface

 * transport -> network interface is defined by struct inet_proto

*/

struct proto {

  void      (*close)(struct sock *sk,

          long timeout);

  int      (*connect)(struct sock *sk,

                struct sockaddr *uaddr,

          int addr_len);

  int      (*disconnect)(struct sock *sk, int flags);

  struct sock *    (*accept) (struct sock *sk, int flags, int *err);

  int      (*ioctl)(struct sock *sk, int cmd,

           unsigned long arg);

  int      (*init)(struct sock *sk);

  void      (*destroy)(struct sock *sk);

  void      (*shutdown)(struct sock *sk, int how);

  int      (*setsockopt)(struct sock *sk, int level,

          int optname, char __user *optval,

          unsigned int optlen);

  int      (*getsockopt)(struct sock *sk, int level,

          int optname, char __user *optval,

          int __user *option);

#ifdef CONFIG_COMPAT

  int      (*compat_setsockopt)(struct sock *sk,

          int level,

          int optname, char __user *optval,

          unsigned int optlen);

  int      (*compat_getsockopt)(struct sock *sk,

          int level,

          int optname, char __user *optval,

          int __user *option);

#endif

  int      (*sendmsg)(struct kiocb *iocb, struct sock *sk,

             struct msghdr *msg, size_t len);

  int      (*recvmsg)(struct kiocb *iocb, struct sock *sk,

             struct msghdr *msg,

          size_t len, int noblock, int flags,

          int *addr_len);

  int      (*sendpage)(struct sock *sk, struct page *page,

          int offset, size_t size, int flags);

  int      (*bind)(struct sock *sk,

          struct sockaddr *uaddr, int addr_len);

  int      (*backlog_rcv) (struct sock *sk,

            struct sk_buff *skb);

  /* Keeping track of sk's, looking them up, and port selection methods. */

  void      (*hash)(struct sock *sk);

  void      (*unhash)(struct sock *sk);

  int      (*get_port)(struct sock *sk, unsigned short snum);

  /* Keeping track of sockets in use */

#ifdef CONFIG_PROC_FS

  unsigned int    inuse_idx;

#endif

  /* Memory pressure */

  void      (*enter_memory_pressure)(struct sock *sk);

  atomic_t    *memory_allocated;  /* Current allocated memory. */

  struct percpu_counter  *sockets_allocated;  /* Current number of sockets. */

/*

   * Pressure flag: try to collapse.

   * Technical note: it is used by multiple contexts non atomically.

   * All the __sk_mem_schedule() is of this nature: accounting

   * is strict, actions are advisory and have some latency.

*/

  int      *memory_pressure;

  int      *sysctl_mem;

  int      *sysctl_wmem;

  int      *sysctl_rmem;

  int      max_header;

  struct kmem_cache  *slab;

  unsigned int    obj_size;

  int      slab_flags;

  struct percpu_counter  *orphan_count;

  struct request_sock_ops  *rsk_prot;

  struct timewait_sock_ops *twsk_prot;

  union {

    struct inet_hashinfo  *hashinfo;

    struct udp_table  *udp_table;

    struct raw_hashinfo  *raw_hash;

  } h;

  struct module    *owner;

  char      name[32];

  struct list_head  node;

#ifdef SOCK_REFCNT_DEBUG

  atomic_t    socks;

#endif

};

假设分析的是TCP协议，则通过336行的赋值从inetsw_array找到其prot成员变量为tcp_prot(net/ipv4/tcp_ipv4.c)。

struct proto tcp_prot = {

  .name      = "TCP",

  .owner      = THIS_MODULE,

  .close      = tcp_close,

  .connect    = tcp_v4_connect,

  .disconnect    = tcp_disconnect,

  .accept      = inet_csk_accept,

  .ioctl      = tcp_ioctl,

  .init      = tcp_v4_init_sock,

  .destroy    = tcp_v4_destroy_sock,

  .shutdown    = tcp_shutdown,

  .setsockopt    = tcp_setsockopt,

  .getsockopt    = tcp_getsockopt,

  .recvmsg    = tcp_recvmsg,

  .backlog_rcv    = tcp_v4_do_rcv,

  .hash      = inet_hash,

  .unhash      = inet_unhash,

  .get_port    = inet_csk_get_port,

  .enter_memory_pressure  = tcp_enter_memory_pressure,

  .sockets_allocated  = &tcp_sockets_allocated,

  .orphan_count    = &tcp_orphan_count,

  .memory_allocated  = &tcp_memory_allocated,

  .memory_pressure  = &tcp_memory_pressure,

  .sysctl_mem    = sysctl_tcp_mem,

  .sysctl_wmem    = sysctl_tcp_wmem,

  .sysctl_rmem    = sysctl_tcp_rmem,

  .max_header    = MAX_TCP_HEADER,

  .obj_size    = sizeof(struct tcp_sock),

  .slab_flags    = SLAB_DESTROY_BY_RCU,

  .twsk_prot    = &tcp_timewait_sock_ops,

  .rsk_prot    = &tcp_request_sock_ops,

  .h.hashinfo    = &tcp_hashinfo,

#ifdef CONFIG_COMPAT

  .compat_setsockopt  = compat_tcp_setsockopt,

  .compat_getsockopt  = compat_tcp_getsockopt,

#endif

};

通过tcp_prot结构体对各成员的赋值可以发现，slab成员并没有被显式初始化，而obj_size被初始化为sizeof(struct tcp_sock)，这一点在后面的分析中会用到。接下来看inet_create(net/ipv4/af_inet.c)的344行所调用的sk_alloc，它以及它调用的sk_prot_alloc都定义在net/core/sock.c中。

/*
 * sk_prot_alloc - allocate the memory for one sock of protocol @prot
 * @prot:     protocol block; supplies ->slab, ->obj_size and ->owner
 * @priority: allocation flags; __GFP_ZERO requests a zeroed object
 * @family:   protocol family, passed on to security_sk_alloc()
 *
 * The object is taken from prot->slab when that cache is set, otherwise
 * it is kmalloc()ed with size prot->obj_size. After a successful
 * allocation the security hook is run and a reference on prot->owner is
 * taken; if either step fails the object is released again.
 *
 * Returns the new sock, or NULL on failure.
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
                                  int family)
{
  struct sock *sk;
  struct kmem_cache *slab;

  slab = prot->slab;
  if (slab != NULL) {
    /* Zeroing is done by hand below, so mask __GFP_ZERO out here. */
    sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
    if (!sk)
      return sk;
    if (priority & __GFP_ZERO) {
      /*
       * caches using SLAB_DESTROY_BY_RCU should let
       * sk_node.next un-modified. Special care is taken
       * when initializing object to zero.
       */
      if (offsetof(struct sock, sk_node.next) != 0)
        memset(sk, 0, offsetof(struct sock, sk_node.next));
      memset(&sk->sk_node.pprev, 0,
             prot->obj_size - offsetof(struct sock, sk_node.pprev));
    }
  } else {
    sk = kmalloc(prot->obj_size, priority);
  }

  if (sk != NULL) {
    kmemcheck_annotate_bitfield(sk, flags);

    if (security_sk_alloc(sk, family, priority))
      goto out_free;

    if (!try_module_get(prot->owner))
      goto out_free_sec;
  }

  return sk;

out_free_sec:
  security_sk_free(sk);
out_free:
  /* Release through the same allocator the object came from. */
  if (slab != NULL)
    kmem_cache_free(slab, sk);
  else
    kfree(sk);
  return NULL;
}

/**
 *  sk_alloc - All socket objects are allocated here
 *  @net: the applicable net namespace
 *  @family: protocol family
 *  @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *  @prot: struct proto associated with this new sock instance
 *
 *  Requests a zeroed object from sk_prot_alloc() and, on success, fills
 *  in the family, the protocol pointers, the socket lock, the net
 *  namespace and sets sk_wmem_alloc to 1.
 *
 *  Returns the new sock, or NULL if sk_prot_alloc() failed.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                      struct proto *prot)
{
  struct sock *sk;

  sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
  if (sk) {
    sk->sk_family = family;
    /*
     * See comment in struct sock definition to understand
     * why we need sk_prot_creator -acme
     */
    sk->sk_prot = sk->sk_prot_creator = prot;
    sock_lock_init(sk);
    sock_net_set(sk, get_net(net));
    atomic_set(&sk->sk_wmem_alloc, 1);
  }

  return sk;
}
EXPORT_SYMBOL(sk_alloc);

很明显在sk_alloc中直接调用sk_prot_alloc来分配sock结构。在sk_prot_alloc中先判定slab是否为空(如前提示)：按照tcp_prot的静态初始化，slab没有被赋值，于是直接用kmalloc分配obj_size大小即sizeof(struct tcp_sock)的空间（注：实际运行时inet_init会调用proto_register(&tcp_prot, 1)为tcp_prot创建slab缓存，其对象大小同样是obj_size，所以无论走哪个分支，分配的大小都是sizeof(struct tcp_sock)）。函数返回的指针类型为struct sock *，而这块空间的大小却是sizeof(struct tcp_sock)，这说明只有两种可能：一、sizeof(struct tcp_sock) == sizeof(struct sock)；二、sizeof(struct tcp_sock) > sizeof(struct sock)。实际是第二种情况，通过列出下面一系列数据结构可以很明显地看出。
先来看struct tcp_sock结构的定义(include/linux/tcp.h)

struct tcp_sock {

  /* inet_connection_sock has to be the first member of tcp_sock */

  struct inet_connection_sock  inet_conn;

  u16  tcp_header_len;  /* Bytes of tcp header to send    */

  u16  xmit_size_goal_segs; /* Goal for segmenting output packets */

/*

 *  Header prediction flags

 *  0x5?10 << 16 + snd_wnd in net byte order

*/

  __be32  pred_flags;

/*

 *  RFC793 variables by their proper names. This means you can

 *  read the code and the spec side by side (and laugh ...)

 *  See RFC793 and RFC1122. The RFC writes these in capitals.

*/

   u32  rcv_nxt;  /* What we want to receive next   */

  u32  copied_seq;  /* Head of yet unread data    */

  u32  rcv_wup;  /* rcv_nxt on last window update sent  */

   u32  snd_nxt;  /* Next sequence we send    */

   u32  snd_una;  /* First byte we want an ack for  */

   u32  snd_sml;  /* Last byte of the most recently transmitted small packet */

  u32  rcv_tstamp;  /* timestamp of last received ACK (for keepalives) */

  u32  lsndtime;  /* timestamp of last sent data packet (for restart window) */

  /* Data for direct copy to user */

  struct {

    struct sk_buff_head  prequeue;

    struct task_struct  *task;

    struct iovec    *iov;

    int      memory;

    int      len;

#ifdef CONFIG_NET_DMA

    /* members for async copy */

    struct dma_chan    *dma_chan;

    int      wakeup;

    struct dma_pinned_list  *pinned_list;

    dma_cookie_t    dma_cookie;

#endif

  } ucopy;

  u32  snd_wl1;  /* Sequence for window update    */

  u32  snd_wnd;  /* The window we expect to receive  */

  u32  max_window;  /* Maximal window ever seen from peer  */

  u32  mss_cache;  /* Cached effective mss, not including SACKS */

  u32  window_clamp;  /* Maximal window to advertise    */

  u32  rcv_ssthresh;  /* Current window clamp      */

  u32  frto_highmark;  /* snd_nxt when RTO occurred */

  u16  advmss;    /* Advertised MSS      */

  u8  frto_counter;  /* Number of new acks after RTO */

  u8  nonagle;  /* Disable Nagle algorithm?             */

/* RTT measurement */

  u32  srtt;    /* smoothed round trip time << 3  */

  u32  mdev;    /* medium deviation      */

  u32  mdev_max;  /* maximal mdev for the last rtt period  */

  u32  rttvar;    /* smoothed mdev_max      */

  u32  rtt_seq;  /* sequence number to update rttvar  */

  u32  packets_out;  /* Packets which are "in flight"  */

  u32  retrans_out;  /* Retransmitted packets out    */

  u16  urg_data;  /* Saved octet of OOB data and control flags */

  u8  ecn_flags;  /* ECN status bits.      */

  u8  reordering;  /* Packet reordering metric.    */

  u32  snd_up;    /* Urgent pointer    */

  u8  keepalive_probes; /* num of allowed keep alive probes  */

/*

 *      Options received (usually on last packet, some only on SYN packets).

*/

  struct tcp_options_received rx_opt;

/*

 *  Slow start and congestion control (see also Nagle, and Karn & Partridge)

*/

   u32  snd_ssthresh;  /* Slow start size threshold    */

   u32  snd_cwnd;  /* Sending congestion window    */

  u32  snd_cwnd_cnt;  /* Linear increase counter    */

  u32  snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */

  u32  snd_cwnd_used;

  u32  snd_cwnd_stamp;

   u32  rcv_wnd;  /* Current receiver window    */

  u32  write_seq;  /* Tail(+1) of data held in tcp send buffer */

  u32  pushed_seq;  /* Last pushed seq, required to talk to windows */

  u32  lost_out;  /* Lost packets      */

  u32  sacked_out;  /* SACK'd packets      */

  u32  fackets_out;  /* FACK'd packets      */

  u32  tso_deferred;

  u32  bytes_acked;  /* Appropriate Byte Counting - RFC3465 */

  /* from STCP, retrans queue hinting */

  struct sk_buff* lost_skb_hint;

  struct sk_buff *scoreboard_skb_hint;

  struct sk_buff *retransmit_skb_hint;

  struct sk_buff_head  out_of_order_queue; /* Out of order segments go here */

  /* SACKs data, these 2 need to be together (see tcp_build_and_update_options) */

  struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */

  struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/

  struct tcp_sack_block recv_sack_cache[4];

  struct sk_buff *highest_sack;   /* highest skb with SACK received

           * (validity guaranteed only if

           * sacked_out > 0)

*/

  int     lost_cnt_hint;

  u32     retransmit_high;  /* L-bits may be on up to this seqno */

  u32  lost_retrans_low;  /* Sent seq after any rxmit (lowest) */

  u32  prior_ssthresh; /* ssthresh saved at recovery start  */

  u32  high_seq;  /* snd_nxt at onset of congestion  */

  u32  retrans_stamp;  /* Timestamp of the last retransmit,

         * also used in SYN-SENT to remember stamp of

         * the first SYN. */

  u32  undo_marker;  /* tracking retrans started here. */

  int  undo_retrans;  /* number of undoable retransmissions. */

  u32  total_retrans;  /* Total retransmits for entire connection */

  u32  urg_seq;  /* Seq of received urgent pointer */

  unsigned int    keepalive_time;    /* time before keep alive takes place */

  unsigned int    keepalive_intvl;  /* time interval between keep alive probes */

  int      linger2;

/* Receiver side RTT estimation */

  struct {

    u32  rtt;

    u32  seq;

    u32  time;

  } rcv_rtt_est;

/* Receiver queue space */

  struct {

    int  space;

    u32  seq;

    u32  time;

  } rcvq_space;

/* TCP-specific MTU probe information. */

  struct {

    u32      probe_seq_start;

    u32      probe_seq_end;

  } mtu_probe;

#ifdef CONFIG_TCP_MD5SIG

/* TCP AF-Specific parts; only used by MD5 Signature support so far */

  const struct tcp_sock_af_ops  *af_specific;

/* TCP MD5 Signature Option information */

  struct tcp_md5sig_info  *md5sig_info;

#endif

};

在tcp_sock的结构体的第一个成员变量类型为struct inet_connection_sock(include/net/inet_connection_sock.h)

/** inet_connection_sock - INET connection oriented sock

 * @icsk_accept_queue:     FIFO of established children

 * @icsk_bind_hash:     Bind node

 * @icsk_timeout:     Timeout

 * @icsk_retransmit_timer: Resend (no ack)

 * @icsk_rto:       Retransmit timeout

 * @icsk_pmtu_cookie     Last pmtu seen by socket

 * @icsk_ca_ops       Pluggable congestion control hook

 * @icsk_af_ops       Operations which are AF_INET{4,6} specific

 * @icsk_ca_state:     Congestion control state

 * @icsk_retransmits:     Number of unrecovered [RTO] timeouts

 * @icsk_pending:     Scheduled timer event

 * @icsk_backoff:     Backoff

 * @icsk_syn_retries:      Number of allowed SYN (or equivalent) retries

 * @icsk_probes_out:     unanswered 0 window probes

 * @icsk_ext_hdr_len:     Network protocol overhead (IP/IPv6 options)

 * @icsk_ack:       Delayed ACK control data

 * @icsk_mtup;       MTU probing control data

*/

struct inet_connection_sock {
  /* inet_sock has to be the first member! */
  struct inet_sock icsk_inet;
  struct request_sock_queue icsk_accept_queue;
  struct inet_bind_bucket *icsk_bind_hash;
  unsigned long icsk_timeout;
  struct timer_list icsk_retransmit_timer;
  struct timer_list icsk_delack_timer;
  __u32 icsk_rto;
  __u32 icsk_pmtu_cookie;
  const struct tcp_congestion_ops *icsk_ca_ops;
  const struct inet_connection_sock_af_ops *icsk_af_ops;
  unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
  __u8 icsk_ca_state;
  __u8 icsk_retransmits;
  __u8 icsk_pending;
  __u8 icsk_backoff;
  __u8 icsk_syn_retries;
  __u8 icsk_probes_out;
  __u16 icsk_ext_hdr_len;

  /* Delayed ACK control data */
  struct {
    __u8 pending;           /* ACK is pending */
    __u8 quick;             /* Scheduled number of quick acks */
    __u8 pingpong;          /* The session is interactive */
    __u8 blocked;           /* Delayed ACK was blocked by socket lock */
    __u32 ato;              /* Predicted tick of soft clock */
    unsigned long timeout;  /* Currently scheduled timeout */
    __u32 lrcvtime;         /* timestamp of last received data packet */
    __u16 last_seg_size;    /* Size of last incoming segment */
    __u16 rcv_mss;          /* MSS used for delayed ACK decisions */
  } icsk_ack;

  /* MTU probing control data */
  struct {
    int enabled;

    /* Range of MTUs to search */
    int search_high;
    int search_low;

    /* Information on the current probe. */
    int probe_size;
  } icsk_mtup;

  /* The macro below spells out the byte size of this 16-element array. */
  u32 icsk_ca_priv[16];
#define ICSK_CA_PRIV_SIZE  (16 * sizeof(u32))
};

在 inet_connection_sock结构体中第一个成员变量类型为struct inet_sock(include/net/inet_sock.h)

/** struct inet_sock - representation of INET sockets

 * @sk - ancestor class

 * @pinet6 - pointer to IPv6 control block

 * @daddr - Foreign IPv4 addr

 * @rcv_saddr - Bound local IPv4 addr

 * @dport - Destination port

 * @num - Local port

 * @saddr - Sending source

 * @uc_ttl - Unicast TTL

 * @sport - Source port

 * @id - ID counter for DF pkts

 * @tos - TOS

 * @mc_ttl - Multicasting TTL

 * @is_icsk - is this an inet_connection_sock?

 * @mc_index - Multicast device index

 * @mc_list - Group array

 * @cork - info to build ip hdr on each ip frag while socket is corked

*/

struct inet_sock {

  /* sk and pinet6 has to be the first two members of inet_sock */

  struct sock    sk;

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)

  struct ipv6_pinfo  *pinet6;

#endif

  /* Socket demultiplex comparisons on incoming packets. */

  __be32      daddr;

  __be32      rcv_saddr;

  __be16      dport;

  __u16      num;

  __be32      saddr;

  __s16      uc_ttl;

  __u16      cmsg_flags;

  struct ip_options  *opt;

  __be16      sport;

  __u16      id;

  __u8      tos;

  __u8      mc_ttl;

  __u8      pmtudisc;

  __u8      recverr:1,

        is_icsk:1,

        freebind:1,

        hdrincl:1,

        mc_loop:1,

        transparent:1,

        mc_all:1;

  int      mc_index;

  __be32      mc_addr;

  struct ip_mc_socklist  *mc_list;

  struct {

    unsigned int    flags;

    unsigned int    fragsize;

    struct ip_options  *opt;

    struct dst_entry  *dst;

    int      length; /* Total length of all frames */

    __be32      addr;

    struct flowi    fl;

  } cork;

};

而inet_sock的第一个成员正是struct sock类型，所以sk_prot_alloc直接返回struct sock *类型指针是没有问题的，接下来执行inet_create中的353行用inet_sk通过sk获得inet指针的值，inet_sk函数其实就相当于强制类型转换，返回的就是sk的指针。
接下来程序就一路返回到__sock_create，接着再返回到sys_socket中。在sys_socket中调用了最后一个函数sock_map_fd(net/socket.c)，它将socket指针sock与一个新分配的文件描述符关联起来，并把该文件描述符返回给用户程序。

/*

 *  Obtains the first available file descriptor and sets it up for use.

 *  These functions create file structures and maps them to fd space

 *  of the current process. On success it returns file descriptor

 *  and file struct implicitly stored in sock->file.

 *  Note that another thread may close file descriptor before we return

 *  from this function. We use the fact that now we do not refer

 *  to socket after mapping. If one day we will need it, this

 *  function will increment ref. count on file by 1.

 *  In any case returned fd MAY BE not valid!

 *  This race condition is unavoidable

 *  with shared fd spaces, we cannot solve it inside kernel,

 *  but we take care of internal coherence yet.

*/

static int sock_alloc_fd(struct file **filep, int flags)

  int fd;

  fd = get_unused_fd_flags(flags);

  if (likely(fd >= 0)) {

    struct file *file = get_empty_filp();

    *filep = file;

    if (unlikely(!file)) {

      put_unused_fd(fd);

      return -ENFILE;

  } else

    *filep = NULL;

  return fd;

static int sock_attach_fd(struct socket *sock, struct file *file, int flags)

  struct dentry *dentry;

  struct qstr name = { .name = "" };

  dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name);

  if (unlikely(!dentry))

    return -ENOMEM;

  dentry->d_op = &sockfs_dentry_operations;

/*

   * We dont want to push this dentry into global dentry hash table.

   * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED

   * This permits a working /proc/$pid/fd/XXX on sockets

*/

  dentry->d_flags &= ~DCACHE_UNHASHED;

  d_instantiate(dentry, SOCK_INODE(sock));

  sock->file = file;

  init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE,

      &socket_file_ops);

  SOCK_INODE(sock)->i_fop = &socket_file_ops;

  file->f_flags = O_RDWR | (flags & O_NONBLOCK);

  file->f_pos = 0;

  file->private_data = sock;

  return 0;

/*
 * sock_map_fd - give a socket a file descriptor
 * @sock:  socket to expose
 * @flags: forwarded to sock_alloc_fd() and sock_attach_fd()
 *
 * Reserves a descriptor and file with sock_alloc_fd(), attaches the
 * socket with sock_attach_fd(), and installs the file in the descriptor
 * table with fd_install(). If attaching fails, both the file and the
 * descriptor are released.
 *
 * Returns the descriptor on success or a negative error code.
 */
int sock_map_fd(struct socket *sock, int flags)
{
  struct file *newfile;
  int fd = sock_alloc_fd(&newfile, flags);

  if (likely(fd >= 0)) {
    int err = sock_attach_fd(sock, newfile, flags);

    if (unlikely(err < 0)) {
      put_filp(newfile);
      put_unused_fd(fd);
      return err;
    }
    fd_install(fd, newfile);
  }
  return fd;
}

fs/dcache.c

/* the caller must hold dcache_lock */

static void __d_instantiate(struct dentry *dentry, struct inode *inode)

  if (inode)

    list_add(&dentry->d_alias, &inode->i_dentry);

  dentry->d_inode = inode;

  fsnotify_d_instantiate(dentry, inode);

/**

 * d_instantiate - fill in inode information for a dentry

 * @entry: dentry to complete

 * @inode: inode to attach to this dentry

 * Fill in inode information in the entry.

 * This turns negative dentries into productive full members

 * of society.

 * NOTE! This assumes that the inode count has been incremented

 * (or otherwise set) by the caller to indicate that it is now

 * in use by the dcache.

*/

void d_instantiate(struct dentry *entry, struct inode * inode)

  BUG_ON(!list_empty(&entry->d_alias));

  spin_lock(&dcache_lock);

  __d_instantiate(entry, inode);

  spin_unlock(&dcache_lock);

  security_d_instantiate(entry, inode);

/net/socket.c

/*

 *  Socket files have a set of 'special' operations as well as the generic file ones. These don't appear

 *  in the operation structures but are done directly via the socketcall() multiplexor.

*/

static const struct file_operations socket_file_ops = {

  .owner =  THIS_MODULE,

  .llseek =  no_llseek,

  .aio_read =  sock_aio_read,

  .aio_write =  sock_aio_write,

  .poll =    sock_poll,

  .unlocked_ioctl = sock_ioctl,

#ifdef CONFIG_COMPAT

  .compat_ioctl = compat_sock_ioctl,

#endif

  .mmap =    sock_mmap,

  .open =    sock_no_open,  /* special open code to disallow open via /proc */

  .release =  sock_close,

  .fasync =  sock_fasync,

  .sendpage =  sock_sendpage,

  .splice_write = generic_splice_sendpage,

  .splice_read =  sock_splice_read,

};

在sock_map_fd中先通过402行获得一个未使用的文件号以及file结构，然后通过405行调用sock_attach_fd将文件号与sock关联起来。在sock_attach_fd中先通过375行从sockfs中分配一个dentry，其中sock_mnt就是在描述sockfs时提到的；d_instantiate的作用就是将dentry与socket的inode关联起来，然后388行又将sock->file与file关联起来。389～390行将socket文件上的操作初始化为socket_file_ops。这样，通过send/recv进入内核将调用inet_stream_ops中的函数，而通过read/write调用将调用socket_file_ops中的函数。然后返回至sys_socket函数中，再经过系统调用切换到用户态，socket函数的整个调用过程完成。

秒客网

Linux 协议栈分析 socket——笔记

相关文章