I have recently been reading the kernel network protocol stack sources. Taking ipv4/tcp as an example, this post gives a brief walkthrough of the flow of the socket-related system calls. Although the internal details of these system calls differ, they all follow essentially the same dispatch flow.
Call flow:
(1) system call -> (2) look up the socket -> (3) run the socket's corresponding operation function -> (4) run the transport-layer protocol's corresponding operation function
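To make the entry point concrete, here is a minimal user-space sketch (ordinary POSIX calls, not kernel code; the loopback address and port 80 are placeholders chosen for illustration). Each call below enters the kernel at step (1) and is eventually dispatched to the inet/tcp handlers discussed in the rest of this post:
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in peer;

	/* (1) the socket() system call: selects the SOCK_STREAM/IPPROTO_TCP
	 * entry of inetsw_array[] shown below */
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&peer, 0, sizeof(peer));
	peer.sin_family = AF_INET;
	peer.sin_port = htons(80);
	inet_pton(AF_INET, "127.0.0.1", &peer.sin_addr);

	/* (2)-(4) connect(): the kernel looks up the socket by fd, runs the
	 * socket-level handler inet_stream_connect, which in turn calls the
	 * TCP-specific handler (tcp_v4_connect) through sk->sk_prot */
	if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0)
		perror("connect");

	close(fd);
	return 0;
}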
The core data structure in the middle is inetsw_array[], defined in af_inet.c. Take its first element (type=SOCK_STREAM, protocol=IPPROTO_TCP) as the example: this is the entry used for the TCP protocol, so when a TCP socket is created its socket operations socket->ops are set to &inet_stream_ops, and the corresponding transport-control-block operations sk->sk_prot are set to &tcp_prot.
/* Upon startup we insert all the elements in inetsw_array[] into
* the linked list inetsw.
*/
static struct inet_protosw inetsw_array[] =
{
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
}, {
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.flags = INET_PROTOSW_PERMANENT,
}, {
.type = SOCK_DGRAM,
.protocol = IPPROTO_ICMP,
.prot = &ping_prot,
.ops = &inet_sockraw_ops,
.flags = INET_PROTOSW_REUSE,
}, {
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.flags = INET_PROTOSW_REUSE,
}
};
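Where socket->ops and sk->sk_prot actually pick up these values is in inet_create() (also in af_inet.c), which runs when socket() is called. The fragment below is abridged from the 4.x sources, with error handling and unrelated setup dropped, so read it as a sketch of the lookup rather than the complete function:
	/* Walk the inetsw[type] list built from inetsw_array[] and pick the
	 * entry whose protocol matches (IPPROTO_IP acts as a wildcard). */
	list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
		err = 0;
		if (protocol == answer->protocol) {
			if (protocol != IPPROTO_IP)
				break;
		} else {
			if (IPPROTO_IP == protocol) {
				protocol = answer->protocol;
				break;
			}
			if (IPPROTO_IP == answer->protocol)
				break;
		}
		err = -EPROTONOSUPPORT;
	}
	/* ... */
	sock->ops = answer->ops;	/* &inet_stream_ops for SOCK_STREAM/TCP */
	answer_prot = answer->prot;	/* &tcp_prot */
	/* ... */
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);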
Looking at the inet_stream_ops structure, you can see that it holds the handler corresponding to each socket system call:
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_stream_connect,
.socketpair = sock_no_socketpair,
.accept = inet_accept,
.getname = inet_getname,
.poll = tcp_poll,
.ioctl = inet_ioctl,
.listen = inet_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
.read_sock = tcp_read_sock,
.peek_len = tcp_peek_len,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
.compat_ioctl = inet_compat_ioctl,
#endif
};
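The transport-layer counterpart that sk->sk_prot points to looks similar. The excerpt below is an abridged version of tcp_prot from net/ipv4/tcp_ipv4.c (the exact field list varies a little between kernel versions); it is shown here mainly because its get_port member is what the bind path analysed later ends up calling:
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,	/* used by inet_bind() below */
	/* ... remaining fields omitted ... */
};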
As a concrete example, take the bind system call on a TCP socket:
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
struct socket *sock;
struct sockaddr_storage address;
	int err, fput_needed;

	/* Look up the struct socket by fd; fput_needed records whether the
	 * file reference count must be dropped again afterwards. */
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (sock) {
		/* Copy the user-space address into kernel space */
		err = move_addr_to_kernel(umyaddr, addrlen, &address);
		if (err >= 0) {
			/* Security-module (LSM) bind check */
err = security_socket_bind(sock,
(struct sockaddr *)&address,
addrlen);
if (!err)
				/* Invoke the socket-level bind operation */
err = sock->ops->bind(sock,
(struct sockaddr *)
&address, addrlen);
		}
		/* Drop the file reference according to fput_needed */
fput_light(sock->file, fput_needed);
}
return err;
}
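Step (2) of the flow, looking up the socket, is what sockfd_lookup_light() does above. The sketch below shows roughly how that lookup works in the 4.x kernels: the fd is resolved to a struct file and then to the struct socket. It is a simplified rewrite for illustration (the function name lookup_socket_by_fd is invented here), not the verbatim kernel implementation, although fdget()/fdput()/sock_from_file() are the real helpers it builds on:
/* Simplified sketch of the fd -> struct socket lookup (step (2)). */
static struct socket *lookup_socket_by_fd(int fd, int *err, int *fput_needed)
{
	struct fd f = fdget(fd);	/* take a (possibly light) reference on the fd */
	struct socket *sock;

	*err = -EBADF;
	if (f.file) {
		sock = sock_from_file(f.file, err);	/* struct file -> struct socket */
		if (sock) {
			*fput_needed = f.flags & FDPUT_FPUT;
			return sock;
		}
		fdput(f);	/* not a socket: drop the reference again */
	}
	return NULL;
}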
The sock->ops->bind call above actually invokes inet_stream_ops.bind, i.e. inet_bind:
/* Address binding */
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	/* unrelated code omitted */
/* If the socket has its own bind function then use it. (RAW) */
	/*
	 * If the transport control block has its own bind operation, call it;
	 * currently only raw sockets implement their own bind.
	 */
if (sk->sk_prot->bind) {
err = sk->sk_prot->bind(sk, uaddr, addr_len);
goto out;
	}
	/* unrelated code omitted */

	/*
	 * If an explicit port was given, or port 0 was given but binding
	 * without a port was not requested, reserve a local port now via the
	 * protocol's get_port function.
	 */
if ((snum || !inet->bind_address_no_port) &&
	    sk->sk_prot->get_port(sk, snum)) {
		/* Binding failed: the port is already in use */
		inet->inet_saddr = inet->inet_rcv_saddr = 0;
		err = -EADDRINUSE;
goto out_release_sock;
	}
	/* unrelated code omitted */
out_release_sock:
release_sock(sk);
out:
return err;
}
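The bind_address_no_port flag tested above is controlled from user space by the IP_BIND_ADDRESS_NO_PORT socket option (Linux 4.2+, and the libc headers must define it). A small illustrative helper (the function name bind_addr_only is invented for this example): binding with port 0 and this option set fixes only the source address and leaves local-port selection to a later connect():
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

/* Bind only the source address; defer port allocation to connect(). */
static int bind_addr_only(int fd, const char *src_ip)
{
	int one = 1;
	struct sockaddr_in addr;

	/* With this option set and sin_port == 0, inet_bind() skips
	 * sk->sk_prot->get_port(), so no local port is reserved here. */
	if (setsockopt(fd, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT,
		       &one, sizeof(one)) < 0)
		return -1;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = 0;			/* no explicit port */
	inet_pton(AF_INET, src_ip, &addr.sin_addr);

	return bind(fd, (struct sockaddr *)&addr, sizeof(addr));
}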
The sk->sk_prot->bind and sk->sk_prot->get_port calls above are the operation functions implemented by the concrete transport layer. Only raw sockets implement their own bind operation, so we will not look at it here. Taking TCP's get_port operation as the example: it is simply tcp_prot.get_port, which for TCP is implemented by inet_csk_get_port (this function has not been analysed in detail yet; to be filled in later).
/* Obtain a reference to a local port for the given sock,
* if snum is zero it means select any available local port.
* We try to allocate an odd port (and leave even ports for connect())
*/
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
	int ret = 1, port = snum;
struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk);
struct inet_bind_bucket *tb = NULL;
	kuid_t uid = sock_i_uid(sk);

	/* Port 0: ask the kernel to pick a free local port */
	if (!port) {
head = inet_csk_find_open_port(sk, &tb, &port);
if (!head)
return ret;
if (!tb)
goto tb_not_found;
goto success;
}
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
spin_lock_bh(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == port)
goto tb_found;
tb_not_found:
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
net, head, port);
if (!tb)
goto fail_unlock;
tb_found:
if (!hlist_empty(&tb->owners)) {
if (sk->sk_reuse == SK_FORCE_REUSE)
			goto success;

		if ((tb->fastreuse > 0 && reuse) ||
sk_reuseport_match(tb, sk))
goto success;
if (inet_csk_bind_conflict(sk, tb, true, true))
goto fail_unlock;
}
success:
	/* This socket is the first owner of the bucket: initialise the
	 * fastreuse/fastreuseport cache from its own settings. */
	if (hlist_empty(&tb->owners)) {
tb->fastreuse = reuse;
if (sk->sk_reuseport) {
tb->fastreuseport = FASTREUSEPORT_ANY;
tb->fastuid = uid;
tb->fast_rcv_saddr = sk->sk_rcv_saddr;
tb->fast_ipv6_only = ipv6_only_sock(sk);
#if IS_ENABLED(CONFIG_IPV6)
tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
#endif
} else {
			tb->fastreuseport = 0;
}
} else {
if (!reuse)
			tb->fastreuse = 0;
if (sk->sk_reuseport) {
/* We didn't match or we don't have fastreuseport set on
* the tb, but we have sk_reuseport set on this socket
* and we know that there are no bind conflicts with
* this socket in this tb, so reset our tb's reuseport
* settings so that any subsequent sockets that match
* our current socket will be put on the fast path.
*
* If we reset we need to set FASTREUSEPORT_STRICT so we
* do extra checking for all subsequent sk_reuseport
* socks.
*/
if (!sk_reuseport_match(tb, sk)) {
tb->fastreuseport = FASTREUSEPORT_STRICT;
tb->fastuid = uid;
tb->fast_rcv_saddr = sk->sk_rcv_saddr;
tb->fast_ipv6_only = ipv6_only_sock(sk);
#if IS_ENABLED(CONFIG_IPV6)
tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
#endif
}
} else {
			tb->fastreuseport = 0;
}
}
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, port);
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
	ret = 0;
fail_unlock:
spin_unlock_bh(&head->lock);
return ret;
}
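From user space, the conflict and fastreuseport logic above is what decides whether bind() succeeds or fails with EADDRINUSE. A small illustration (requires Linux 3.9+ for SO_REUSEPORT; the helper name bound_socket and port 8080 are arbitrary choices for this example): two sockets that both set SO_REUSEPORT and belong to the same user may bind to the same port via the sk_reuseport_match() fast path, while a third socket without the option hits inet_csk_bind_conflict() and gets EADDRINUSE:
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Create a TCP socket bound to *:port. With reuseport set on both sockets,
 * the second bind() is accepted via the fastreuseport path above; otherwise
 * inet_csk_get_port() reports the port as in use. */
static int bound_socket(unsigned short port, int reuseport)
{
	struct sockaddr_in addr;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT,
		       &reuseport, sizeof(reuseport)) < 0)
		goto fail;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(port);
	addr.sin_addr.s_addr = htonl(INADDR_ANY);

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		goto fail;
	return fd;
fail:
	perror("bound_socket");
	close(fd);
	return -1;
}

int main(void)
{
	int a = bound_socket(8080, 1);	/* first owner of the port   */
	int b = bound_socket(8080, 1);	/* allowed: reuseport match   */
	int c = bound_socket(8080, 0);	/* fails with EADDRINUSE      */

	printf("a=%d b=%d c=%d\n", a, b, c);
	return 0;
}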