TCP连接的建立(二)

时间:2023-03-09 04:03:42
TCP连接的建立(二)

被动打开

SYN cookies

TCP协议开辟了一个比較大的内存空间请求连接队列来存储连接请求块,当SYN请求不断添加,请求连接数目到达上限时,会致使系统丢弃SYN连接请求。SYN cookies技术就能够使server在半连接队列已满的情况下仍能处理新的SYN请求。

当半连接队列满时,SYN cookies并不丢弃SYN请求。而是通过加密技术来标识半连接状态。在TCP实现中,当收到client的SYN请求时,server须要回复SYN+ACK包给client,然后client再发送确认包给server。通常,server的初始序列号是由server依照一定的规律计算得到的随机数,而在SYN cookies中,server的初始序列号是由clientIP地址、clientport号、serverIP地址和serverport号、接收到的client初始序列号以及其它一些安全数值进行hash运算,并加密后得到的,称之为cookies。

当server遭受SYN攻击使得请求连接队列满时,server并不拒绝新的SYN请求,而是回复一个初始化序列号为cookies的SYN包给client。假设收到client的ACK段。server将client的ACK序列号减1得到的值。与用上述那些要素hash运算得到的值比較,假设相等。直接完毕三次握手。注意:此时并不比查看此连接是否属于请求连接队列。

启用SYN cookies是通过在启动环境中设置一下命令来完毕:

echo 1 > /proc/sys/net/ipv4/tcp_syncookies

第一次握手:接收SYN段

传输控制块接收处理的段都有tcp_v4_do_rcv()处理,在该函数中再依据不同的状态由不同的函数处理。

/* The socket must have it's spinlock held when we get
* here.
*
* We have a potential double-lock case here, so even when
* doing backlog processing we use the BH locking scheme.
* This is because we cannot sleep with the original spinlock
* held.
*/
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
/*
* We really want to reject the packet as early as possible
* if:
* o We're expecting an MD5'd packet and this is no MD5 tcp option
* o There is an MD5 option and we're not expecting one
*/
if (tcp_v4_inbound_md5_hash(sk, skb))
goto discard;
#endif if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
TCP_CHECK_TIMER(sk);
if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
rsk = sk;
goto reset;
}
TCP_CHECK_TIMER(sk);
return 0;
} if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
goto csum_err; if (sk->sk_state == TCP_LISTEN) {
struct sock *nsk = tcp_v4_hnd_req(sk, skb);
if (!nsk)
goto discard; if (nsk != sk) {
if (tcp_child_process(sk, nsk, skb)) {
rsk = nsk;
goto reset;
}
return 0;
}
} TCP_CHECK_TIMER(sk);
if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
rsk = sk;
goto reset;
}
TCP_CHECK_TIMER(sk);
return 0; reset:
tcp_v4_send_reset(rsk, skb);
discard:
kfree_skb(skb);
/* Be careful here. If this function gets more complicated and
* gcc suffers from register pressure on the x86, sk (in %ebx)
* might be destroyed here. This current version compiles correctly,
* but you have been warned.
*/
return 0; csum_err:
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}

第二次握手:发送SYN+ACK段

tcp_v4_send_synack()用来为服务端构造回应client连接请求SYN段的SYN+ACK段,并将其封装在IP数据报中发送给client。

/*
* Send a SYN-ACK after having received a SYN.
* This still operates on a request_sock only, not on a big
* socket.
*/
static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
struct dst_entry *dst)
{
const struct inet_request_sock *ireq = inet_rsk(req);
int err = -1;
struct sk_buff * skb; /* First, grab a route. */
if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
return -1; skb = tcp_make_synack(sk, dst, req); if (skb) {
struct tcphdr *th = tcp_hdr(skb); th->check = tcp_v4_check(skb->len,
ireq->loc_addr,
ireq->rmt_addr,
csum_partial(th, skb->len,
skb->csum)); err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
ireq->rmt_addr,
ireq->opt);
err = net_xmit_eval(err);
} dst_release(dst);
return err;
}
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
{
return __tcp_v4_send_synack(sk, req, NULL);
}

第三次握手:接收ACK段

服务端接收到SYN段后,会为将建立的连接创建一个连接请求块,同一时候发送SYN+ACK段给client作为回应,然后启动建立连接定时器,等待client最后一次握手的ACK段

connect系统调用的实现

inet_stream_connect()是connect系统调用的套接口层实现,首先校验设置的地址族,然后校验套接口状态,套接口状态为SS_UNCONNECTED时调用传输层接口。TCP中为tcp_v4_connect()。最后,等待连接的完毕或失败。

/*
* Connect to a remote host. There is regrettably still a little
* TCP 'magic' in here.
*/
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
int err;
long timeo; lock_sock(sk); if (uaddr->sa_family == AF_UNSPEC) {
err = sk->sk_prot->disconnect(sk, flags);
sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
goto out;
} switch (sock->state) {
default:
err = -EINVAL;
goto out;
case SS_CONNECTED:
err = -EISCONN;
goto out;
case SS_CONNECTING:
err = -EALREADY;
/* Fall out of switch with err, set for this state */
break;
case SS_UNCONNECTED:
err = -EISCONN;
if (sk->sk_state != TCP_CLOSE)
goto out; err = sk->sk_prot->connect(sk, uaddr, addr_len);
if (err < 0)
goto out; sock->state = SS_CONNECTING; /* Just entered SS_CONNECTING state; the only
* difference is that return value in non-blocking
* case is EINPROGRESS, rather than EALREADY.
*/
err = -EINPROGRESS;
break;
} timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
/* Error code is set above */
if (!timeo || !inet_wait_for_connect(sk, timeo))
goto out; err = sock_intr_errno(timeo);
if (signal_pending(current))
goto out;
} /* Connection was closed by RST, timeout, ICMP error
* or another process disconnected us.
*/
if (sk->sk_state == TCP_CLOSE)
goto sock_error; /* sk->sk_err may be not zero now, if RECVERR was ordered by user
* and error was received after socket entered established state.
* Hence, it is handled normally after connect() return successfully.
*/ sock->state = SS_CONNECTED;
err = 0;
out:
release_sock(sk);
return err; sock_error:
err = sock_error(sk) ? : -ECONNABORTED;
sock->state = SS_UNCONNECTED;
if (sk->sk_prot->disconnect(sk, flags))
sock->state = SS_DISCONNECTING;
goto out;
}

调用传输层接口,连接须要三层握手,connect接口仅仅是完毕发送SYN段过程,兴许两次握手由协议栈完毕。

SYN段发送成功后,兴许仅仅需等待第三次握手结束。

主动打开

第一次握手:发送SYN段

初始化client传输控制块并发送SYN段,通过tcp_v4_connect()完毕

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
struct rtable *rt;
__be32 daddr, nexthop;
int tmp;
int err; if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL; if (usin->sin_family != AF_INET)
return -EAFNOSUPPORT; nexthop = daddr = usin->sin_addr.s_addr;
if (inet->opt && inet->opt->srr) {
if (!daddr)
return -EINVAL;
nexthop = inet->opt->faddr;
} tmp = ip_route_connect(&rt, nexthop, inet->saddr,
RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
IPPROTO_TCP,
inet->sport, usin->sin_port, sk, 1);
if (tmp < 0) {
if (tmp == -ENETUNREACH)
IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
return tmp;
} if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
ip_rt_put(rt);
return -ENETUNREACH;
} if (!inet->opt || !inet->opt->srr)
daddr = rt->rt_dst; if (!inet->saddr)
inet->saddr = rt->rt_src;
inet->rcv_saddr = inet->saddr; if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
/* Reset inherited state */
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
tp->write_seq = 0;
} if (tcp_death_row.sysctl_tw_recycle &&
!tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
struct inet_peer *peer = rt_get_peer(rt);
/*
* VJ's idea. We save last timestamp seen from
* the destination in peer table, when entering state
* TIME-WAIT * and initialize rx_opt.ts_recent from it,
* when trying new connection.
*/
if (peer != NULL &&
peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
tp->rx_opt.ts_recent = peer->tcp_ts;
}
} inet->dport = usin->sin_port;
inet->daddr = daddr; inet_csk(sk)->icsk_ext_hdr_len = 0;
if (inet->opt)
inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; tp->rx_opt.mss_clamp = 536; /* Socket identity is still unknown (sport may be zero).
* However we set state to SYN-SENT and not releasing socket
* lock select source port, enter ourselves into the hash tables and
* complete initialization after this.
*/
tcp_set_state(sk, TCP_SYN_SENT);
err = inet_hash_connect(&tcp_death_row, sk);
if (err)
goto failure; err = ip_route_newports(&rt, IPPROTO_TCP,
inet->sport, inet->dport, sk);
if (err)
goto failure; /* OK, now commit destination to socket. */
sk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &rt->u.dst); if (!tp->write_seq)
tp->write_seq = secure_tcp_sequence_number(inet->saddr,
inet->daddr,
inet->sport,
usin->sin_port); inet->id = tp->write_seq ^ jiffies; err = tcp_connect(sk);
rt = NULL;
if (err)
goto failure; return 0; failure:
/*
* This unhashes the socket and releases the local port,
* if necessary.
*/
tcp_set_state(sk, TCP_CLOSE);
ip_rt_put(rt);
sk->sk_route_caps = 0;
inet->dport = 0;
return err;
}

第二次握手:接收SYN+ACK段

处于SYN_SENT状态的传输控制块,通过tcp_rcv_state_process()来处理。

/*
* This function implements the receiving procedure of RFC 793 for
* all states except ESTABLISHED and TIME_WAIT.
* It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
* address independent.
*/ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
int queued = 0;
int res; tp->rx_opt.saw_tstamp = 0; switch (sk->sk_state) {
case TCP_CLOSE:
goto discard; case TCP_LISTEN:
if (th->ack)
return 1; if (th->rst)
goto discard; if (th->syn) {
if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
return 1; /* Now we have several options: In theory there is
* nothing else in the frame. KA9Q has an option to
* send data with the syn, BSD accepts data with the
* syn up to the [to be] advertised window and
* Solaris 2.1 gives you a protocol error. For now
* we just ignore it, that fits the spec precisely
* and avoids incompatibilities. It would be nice in
* future to drop through and process the data.
*
* Now that TTCP is starting to be used we ought to
* queue this data.
* But, this leaves one open to an easy denial of
* service attack, and SYN cookies can't defend
* against this problem. So, we drop the data
* in the interest of security over speed unless
* it's still in use.
*/
kfree_skb(skb);
return 0;
}
goto discard; case TCP_SYN_SENT:
queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
if (queued >= 0)
return queued; /* Do step6 onward by hand. */
tcp_urg(sk, skb, th);
__kfree_skb(skb);
tcp_data_snd_check(sk);
return 0;
} res = tcp_validate_incoming(sk, skb, th, 0);
if (res <= 0)
return -res; /* step 5: check the ACK field */
if (th->ack) {
int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0; switch (sk->sk_state) {
case TCP_SYN_RECV:
if (acceptable) {
tp->copied_seq = tp->rcv_nxt;
smp_mb();
tcp_set_state(sk, TCP_ESTABLISHED);
sk->sk_state_change(sk); /* Note, that this wakeup is only for marginal
* crossed SYN case. Passively open sockets
* are not waked up, because sk->sk_sleep ==
* NULL and sk->sk_socket == NULL.
*/
if (sk->sk_socket)
sk_wake_async(sk,
SOCK_WAKE_IO, POLL_OUT); tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = ntohs(th->window) <<
tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); /* tcp_ack considers this ACK as duplicate
* and does not calculate rtt.
* Force it here.
*/
tcp_ack_update_rtt(sk, 0, 0); if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; /* Make sure socket is routed, for
* correct metrics.
*/
icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); tcp_init_congestion_control(sk); /* Prevent spurious tcp_cwnd_restart() on
* first data packet.
*/
tp->lsndtime = tcp_time_stamp; tcp_mtup_init(sk);
tcp_initialize_rcv_mss(sk);
tcp_init_buffer_space(sk);
tcp_fast_path_on(tp);
} else {
return 1;
}
break; case TCP_FIN_WAIT1:
if (tp->snd_una == tp->write_seq) {
tcp_set_state(sk, TCP_FIN_WAIT2);
sk->sk_shutdown |= SEND_SHUTDOWN;
dst_confirm(sk->sk_dst_cache); if (!sock_flag(sk, SOCK_DEAD))
/* Wake up lingering close() */
sk->sk_state_change(sk);
else {
int tmo; if (tp->linger2 < 0 ||
(TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
tcp_done(sk);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
return 1;
} tmo = tcp_fin_time(sk);
if (tmo > TCP_TIMEWAIT_LEN) {
inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
} else if (th->fin || sock_owned_by_user(sk)) {
/* Bad case. We could lose such FIN otherwise.
* It is not a big problem, but it looks confusing
* and not so rare event. We still can lose it now,
* if it spins in bh_lock_sock(), but it is really
* marginal case.
*/
inet_csk_reset_keepalive_timer(sk, tmo);
} else {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
goto discard;
}
}
}
break; case TCP_CLOSING:
if (tp->snd_una == tp->write_seq) {
tcp_time_wait(sk, TCP_TIME_WAIT, 0);
goto discard;
}
break; case TCP_LAST_ACK:
if (tp->snd_una == tp->write_seq) {
tcp_update_metrics(sk);
tcp_done(sk);
goto discard;
}
break;
}
} else
goto discard; /* step 6: check the URG bit */
tcp_urg(sk, skb, th); /* step 7: process the segment text */
switch (sk->sk_state) {
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
case TCP_LAST_ACK:
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
break;
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
/* RFC 793 says to queue data in these states,
* RFC 1122 says we MUST send a reset.
* BSD 4.4 also does reset.
*/
if (sk->sk_shutdown & RCV_SHUTDOWN) {
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
tcp_reset(sk);
return 1;
}
}
/* Fall through */
case TCP_ESTABLISHED:
tcp_data_queue(sk, skb);
queued = 1;
break;
} /* tcp_data could move socket to TIME-WAIT */
if (sk->sk_state != TCP_CLOSE) {
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
} if (!queued) {
discard:
__kfree_skb(skb);
}
return 0;
}

第三次握手:发送ACK段

tcp_send_ack()用来发送一个ACK段,同一时候更新窗体

/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
struct sk_buff *buff; /* If we have been reset, we may not send again. */
if (sk->sk_state == TCP_CLOSE)
return; /* We are not putting this on the write queue, so
* tcp_transmit_skb() will set the ownership to this
* sock.
*/
buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
if (buff == NULL) {
inet_csk_schedule_ack(sk);
inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
TCP_DELACK_MAX, TCP_RTO_MAX);
return;
} /* Reserve space for headers and prepare control bits. */
skb_reserve(buff, MAX_TCP_HEADER);
tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPCB_FLAG_ACK); /* Send it off, this clears delayed acks for us. */
TCP_SKB_CB(buff)->when = tcp_time_stamp;
tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
}

发送ACK段时,TCP必须不在CLOSE状态。

为ACK段分配一个SKB,假设分配失败则在启动延时定时器后返回。

相关文章