TCP连接的建立（二）

被动打开

SYN cookies

TCP协议开辟了一个比较大的内存空间作为请求连接队列来存储连接请求块，当SYN请求不断增加，请求连接数目到达上限时，会致使系统丢弃SYN连接请求。SYN cookies技术能够使服务器在半连接队列已满的情况下仍能处理新的SYN请求。

当半连接队列满时，SYN cookies并不丢弃SYN请求，而是通过加密技术来标识半连接状态。在TCP实现中，当收到客户端的SYN请求时，服务器需要回复SYN+ACK包给客户端，然后客户端再发送确认包给服务器。通常，服务器的初始序列号是由服务器依照一定的规律计算得到的随机数；而在SYN cookies中，服务器的初始序列号是由客户端IP地址、客户端端口号、服务器IP地址、服务器端口号、接收到的客户端初始序列号以及其它一些安全数值进行hash运算，并加密后得到的，称之为cookies。

当服务器遭受SYN攻击使得请求连接队列满时，服务器并不拒绝新的SYN请求，而是回复一个初始序列号为cookies的SYN+ACK包给客户端。如果收到客户端的ACK段，服务器将该ACK段的确认号减1得到的值，与用上述那些要素进行hash运算得到的值比较，如果相等，则直接完成三次握手。注意：此时并不必查看此连接是否属于请求连接队列。

启用SYN cookies可以通过在启动环境中执行以下命令来完成：

echo 1 > /proc/sys/net/ipv4/tcp_syncookies

第一次握手：接收SYN段

传输控制块接收到的段都由tcp_v4_do_rcv()处理，在该函数中再依据不同的状态交给不同的函数处理。

/* The socket must have it's spinlock held when we get

 * here.

 *

 * We have a potential double-lock case here, so even when

 * doing backlog processing we use the BH locking scheme.

 * This is because we cannot sleep with the original spinlock

 * held.

 */

int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)

{

	struct sock *rsk;

#ifdef CONFIG_TCP_MD5SIG

	/*

	 * We really want to reject the packet as early as possible

	 * if:

	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option

	 *  o There is an MD5 option and we're not expecting one

	 */

	if (tcp_v4_inbound_md5_hash(sk, skb))

		goto discard;

#endif

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */

		TCP_CHECK_TIMER(sk);

		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {

			rsk = sk;

			goto reset;

		}

		TCP_CHECK_TIMER(sk);

		return 0;

	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))

		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {

		struct sock *nsk = tcp_v4_hnd_req(sk, skb);

		if (!nsk)

			goto discard;

		if (nsk != sk) {

			if (tcp_child_process(sk, nsk, skb)) {

				rsk = nsk;

				goto reset;

			}

			return 0;

		}

	}

	TCP_CHECK_TIMER(sk);

	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {

		rsk = sk;

		goto reset;

	}

	TCP_CHECK_TIMER(sk);

	return 0;

reset:

	tcp_v4_send_reset(rsk, skb);

discard:

	kfree_skb(skb);

	/* Be careful here. If this function gets more complicated and

	 * gcc suffers from register pressure on the x86, sk (in %ebx)

	 * might be destroyed here. This current version compiles correctly,

	 * but you have been warned.

	 */

	return 0;

csum_err:

	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);

	goto discard;

}

第二次握手：发送SYN+ACK段

tcp_v4_send_synack()用来为服务端构造回应客户端连接请求SYN段的SYN+ACK段，并将其封装在IP数据报中发送给客户端。

/*
 *	Answer a received SYN with a SYN-ACK.
 *	Works purely from the request_sock; no full socket exists yet.
 *
 *	@sk:  listening socket
 *	@req: pending connection request being answered
 *	@dst: route to use, or NULL to have one looked up here
 *
 *	Returns -1 when no route or no segment could be obtained,
 *	otherwise the transmit result filtered through net_xmit_eval().
 *	The route reference is dropped before returning whenever a
 *	route was available.
 */
static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
				struct dst_entry *dst)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct sk_buff *synack;
	struct tcphdr *th;
	int rc = -1;

	/* Resolve a route ourselves if the caller did not bring one. */
	if (dst == NULL) {
		dst = inet_csk_route_req(sk, req);
		if (dst == NULL)
			return -1;
	}

	synack = tcp_make_synack(sk, dst, req);
	if (synack == NULL)
		goto release;

	/* Fill in the TCP checksum over the finished segment. */
	th = tcp_hdr(synack);
	th->check = tcp_v4_check(synack->len, ireq->loc_addr, ireq->rmt_addr,
				 csum_partial(th, synack->len, synack->csum));

	/*
	 * Keep the send and the evaluation as two statements:
	 * net_xmit_eval() may be a macro, so it must only ever be handed
	 * a plain variable, never the call itself.
	 */
	rc = ip_build_and_send_pkt(synack, sk, ireq->loc_addr,
				   ireq->rmt_addr, ireq->opt);
	rc = net_xmit_eval(rc);

release:
	dst_release(dst);
	return rc;
}

/*
 * Send a SYN-ACK for @req without a pre-resolved route: a NULL dst is
 * passed on, so __tcp_v4_send_synack() performs the route lookup.
 * Returns that helper's result unchanged.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
{
	struct dst_entry *no_route = NULL;

	return __tcp_v4_send_synack(sk, req, no_route);
}

第三次握手：接收ACK段

服务端接收到SYN段后，会为将要建立的连接创建一个连接请求块，同时发送SYN+ACK段给客户端作为回应，然后启动建立连接定时器，等待客户端最后一次握手的ACK段。

connect系统调用的实现

inet_stream_connect()是connect系统调用的套接口层实现，首先校验设置的地址族，然后校验套接口状态，套接口状态为SS_UNCONNECTED时调用传输层接口（TCP中为tcp_v4_connect()）。最后，等待连接的完成或失败。

/*
 *	Connect to a remote host. There is regrettably still a little
 *	TCP 'magic' in here.
 *
 *	Socket-layer half of connect() for stream sockets: it checks the
 *	socket-level state (sock->state), asks the transport protocol to
 *	start the connection through sk->sk_prot->connect(), and then,
 *	if a send timeout is available, waits for the handshake to end.
 *
 *	@sock:     socket being connected
 *	@uaddr:    destination address; sa_family == AF_UNSPEC requests a
 *	           disconnect instead of a connect
 *	@addr_len: length of @uaddr, handed through to the protocol
 *	@flags:    file flags; only O_NONBLOCK is examined here, the full
 *	           value is forwarded to sk_prot->disconnect()
 *
 *	Returns 0 once the socket has been marked SS_CONNECTED, otherwise
 *	a negative errno: -EINVAL, -EISCONN, -EALREADY, -EINPROGRESS,
 *	-ECONNABORTED, or whatever the protocol, sock_intr_errno() or
 *	sock_error() report.
 *
 *	The socket is locked with lock_sock() for the whole call and
 *	released at the "out" label, which every exit path goes through.
 */

int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,

			int addr_len, int flags)

{

	struct sock *sk = sock->sk;

	int err;

	long timeo;

	lock_sock(sk);

	/*
	 * AF_UNSPEC: disconnect instead of connecting. The socket-level
	 * state records whether the protocol's disconnect succeeded.
	 */
	if (uaddr->sa_family == AF_UNSPEC) {

		err = sk->sk_prot->disconnect(sk, flags);

		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;

		goto out;

	}

	/* Decide what to do from the socket-layer state. */
	switch (sock->state) {

	default:

		err = -EINVAL;

		goto out;

	case SS_CONNECTED:

		err = -EISCONN;

		goto out;

	case SS_CONNECTING:

		/* A connect is already in progress; go on to the wait logic. */
		err = -EALREADY;

		/* Fall out of switch with err, set for this state */

		break;

	case SS_UNCONNECTED:

		/* The transport socket must be closed before a new connect. */
		err = -EISCONN;

		if (sk->sk_state != TCP_CLOSE)

			goto out;

		/* Transport-layer connect; for TCP this is tcp_v4_connect(). */
		err = sk->sk_prot->connect(sk, uaddr, addr_len);

		if (err < 0)

			goto out;

		sock->state = SS_CONNECTING;

		/* Just entered SS_CONNECTING state; the only

		 * difference is that return value in non-blocking

		 * case is EINPROGRESS, rather than EALREADY.

		 */

		err = -EINPROGRESS;

		break;

	}

	/*
	 * NOTE(review): sock_sndtimeo() presumably yields 0 when the
	 * O_NONBLOCK argument is non-zero -- confirm against its definition.
	 */
	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* Handshake still in flight: transport state is SYN_SENT or SYN_RECV. */
	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {

		/* Error code is set above */

		/*
		 * With no timeout, or when inet_wait_for_connect() returns 0,
		 * leave with the -EINPROGRESS / -EALREADY chosen in the switch.
		 * NOTE(review): a 0 return presumably means the timeout ran
		 * out -- confirm.
		 */
		if (!timeo || !inet_wait_for_connect(sk, timeo))

			goto out;

		err = sock_intr_errno(timeo);

		if (signal_pending(current))

			goto out;

	}

	/* Connection was closed by RST, timeout, ICMP error

	 * or another process disconnected us.

	 */

	if (sk->sk_state == TCP_CLOSE)

		goto sock_error;

	/* sk->sk_err may be not zero now, if RECVERR was ordered by user

	 * and error was received after socket entered established state.

	 * Hence, it is handled normally after connect() return successfully.

	 */

	sock->state = SS_CONNECTED;

	err = 0;

out:

	release_sock(sk);

	return err;

sock_error:

	/*
	 * Report the pending socket error, or -ECONNABORTED if there is
	 * none, then return the socket to the unconnected state. If the
	 * protocol's disconnect fails the state becomes SS_DISCONNECTING.
	 */
	err = sock_error(sk) ? : -ECONNABORTED;

	sock->state = SS_UNCONNECTED;

	if (sk->sk_prot->disconnect(sk, flags))

		sock->state = SS_DISCONNECTING;

	goto out;

}

调用传输层接口时，建立连接需要三次握手，connect接口只是完成发送SYN段的过程，后续两次握手由协议栈完成。

SYN段发送成功后，后续只需等待第三次握手结束。

主动打开

第一次握手：发送SYN段

初始化客户端传输控制块并发送SYN段，通过tcp_v4_connect()完成。

/*
 * This will initiate an outgoing connection.
 *
 * Transport-layer half of connect() for IPv4 TCP: validates the
 * destination, resolves a route, fills in the socket's addresses and
 * ports, moves the socket to TCP_SYN_SENT, enters it into the hash
 * tables and finally calls tcp_connect().
 *
 * @sk:       socket to connect
 * @uaddr:    destination, interpreted as a struct sockaddr_in
 * @addr_len: length of @uaddr; must cover a struct sockaddr_in
 *
 * Returns 0 on success or a negative errno (-EINVAL, -EAFNOSUPPORT,
 * -ENETUNREACH, or the error of the failing helper). On failures after
 * the state change the socket is put back to TCP_CLOSE.
 */

int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)

{

	struct inet_sock *inet = inet_sk(sk);

	struct tcp_sock *tp = tcp_sk(sk);

	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;

	struct rtable *rt;

	__be32 daddr, nexthop;

	int tmp;

	int err;

	/* The caller must supply a complete AF_INET address. */
	if (addr_len < sizeof(struct sockaddr_in))

		return -EINVAL;

	if (usin->sin_family != AF_INET)

		return -EAFNOSUPPORT;

	/* By default, route straight towards the requested address. */
	nexthop = daddr = usin->sin_addr.s_addr;

	/*
	 * With a source-route IP option set on the socket, route towards
	 * the option's faddr instead; a zero destination is rejected.
	 */
	if (inet->opt && inet->opt->srr) {

		if (!daddr)

			return -EINVAL;

		nexthop = inet->opt->faddr;

	}

	/* Look up the route; nothing has been changed on the socket yet. */
	tmp = ip_route_connect(&rt, nexthop, inet->saddr,

			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,

			       IPPROTO_TCP,

			       inet->sport, usin->sin_port, sk, 1);

	if (tmp < 0) {

		if (tmp == -ENETUNREACH)

			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);

		return tmp;

	}

	/* Refuse multicast/broadcast routes, dropping the route first. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {

		ip_rt_put(rt);

		return -ENETUNREACH;

	}

	/* Without source routing, use the destination the route resolved to. */
	if (!inet->opt || !inet->opt->srr)

		daddr = rt->rt_dst;

	/* No source address set yet: take the one chosen by the route. */
	if (!inet->saddr)

		inet->saddr = rt->rt_src;

	inet->rcv_saddr = inet->saddr;

	/*
	 * Timestamp state left over for a different destination is
	 * cleared; write_seq is cleared as well so that a fresh initial
	 * sequence number is generated further down.
	 */
	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {

		/* Reset inherited state */

		tp->rx_opt.ts_recent	   = 0;

		tp->rx_opt.ts_recent_stamp = 0;

		tp->write_seq		   = 0;

	}

	if (tcp_death_row.sysctl_tw_recycle &&

	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {

		struct inet_peer *peer = rt_get_peer(rt);

		/*

		 * VJ's idea. We save last timestamp seen from

		 * the destination in peer table, when entering state

		 * TIME-WAIT * and initialize rx_opt.ts_recent from it,

		 * when trying new connection.

		 */

		/* Only use the saved timestamp while it is within TCP_PAWS_MSL. */
		if (peer != NULL &&

		    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {

			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;

			tp->rx_opt.ts_recent = peer->tcp_ts;

		}

	}

	/* Record the peer's port and address in the socket. */
	inet->dport = usin->sin_port;

	inet->daddr = daddr;

	/* Extra header length is the size of the IP options, if any. */
	inet_csk(sk)->icsk_ext_hdr_len = 0;

	if (inet->opt)

		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

	/*
	 * NOTE(review): 536 looks like the RFC 1122 default MSS (576-byte
	 * datagram minus 40 bytes of headers) -- confirm.
	 */
	tp->rx_opt.mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).

	 * However we set state to SYN-SENT and not releasing socket

	 * lock select source port, enter ourselves into the hash tables and

	 * complete initialization after this.

	 */

	tcp_set_state(sk, TCP_SYN_SENT);

	err = inet_hash_connect(&tcp_death_row, sk);

	if (err)

		goto failure;

	/*
	 * NOTE(review): presumably refreshes the route now that both ports
	 * are final -- confirm against ip_route_newports().
	 */
	err = ip_route_newports(&rt, IPPROTO_TCP,

				inet->sport, inet->dport, sk);

	if (err)

		goto failure;

	/* OK, now commit destination to socket.  */

	sk->sk_gso_type = SKB_GSO_TCPV4;

	sk_setup_caps(sk, &rt->u.dst);

	/* Generate an initial sequence number unless one is already set. */
	if (!tp->write_seq)

		tp->write_seq = secure_tcp_sequence_number(inet->saddr,

							   inet->daddr,

							   inet->sport,

							   usin->sin_port);

	/* Seed the IP id from the sequence number and the current jiffies. */
	inet->id = tp->write_seq ^ jiffies;

	/* NOTE(review): tcp_connect() presumably builds and sends the SYN. */
	err = tcp_connect(sk);

	/*
	 * rt is cleared before the error check, so the failure path below
	 * calls ip_rt_put() with NULL from here on.
	 * NOTE(review): presumably the route reference now belongs to the
	 * socket through sk_setup_caps() -- confirm.
	 */
	rt = NULL;

	if (err)

		goto failure;

	return 0;

failure:

	/*

	 * This unhashes the socket and releases the local port,

	 * if necessary.

	 */

	tcp_set_state(sk, TCP_CLOSE);

	ip_rt_put(rt);

	sk->sk_route_caps = 0;

	inet->dport = 0;

	return err;

}

第二次握手：接收SYN+ACK段

处于SYN_SENT状态的传输控制块，通过tcp_rcv_state_process()来处理。

/*
 *	This function implements the receiving procedure of RFC 793 for
 *	all states except ESTABLISHED and TIME_WAIT.
 *	It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
 *	address independent.
 *
 *	@sk:  socket the segment was matched to
 *	@skb: the received segment
 *	@th:  TCP header of @skb
 *	@len: segment length; only forwarded to the SYN_SENT handler here
 *
 *	Handshake-related paths:
 *	  - TCP_LISTEN + SYN: the connection request is passed to
 *	    icsk_af_ops->conn_request() (first step, passive side);
 *	  - TCP_SYN_SENT: tcp_rcv_synsent_state_process() handles the
 *	    reply (second step, active side);
 *	  - TCP_SYN_RECV + acceptable ACK: the socket is moved to
 *	    TCP_ESTABLISHED (third step, passive side).
 *
 *	Returns 0 when the segment was consumed here (queued or freed).
 *	On the "return 1" paths the skb is not freed by this function;
 *	the IPv4 caller tcp_v4_do_rcv() reacts to a non-zero return by
 *	sending a reset and freeing the skb.
 */

int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,

			  struct tcphdr *th, unsigned len)

{

	struct tcp_sock *tp = tcp_sk(sk);

	struct inet_connection_sock *icsk = inet_csk(sk);

	/* Non-zero once the skb is queued, so the final free is skipped. */
	int queued = 0;

	int res;

	tp->rx_opt.saw_tstamp = 0;

	/* States that are fully handled here and never reach step 5. */
	switch (sk->sk_state) {

	case TCP_CLOSE:

		goto discard;

	case TCP_LISTEN:

		/* An ACK sent to a listening socket is rejected. */
		if (th->ack)

			return 1;

		/* A RST sent to a listening socket is silently dropped. */
		if (th->rst)

			goto discard;

		if (th->syn) {

			/* Connection request; a negative result rejects the segment. */
			if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)

				return 1;

			/* Now we have several options: In theory there is

			 * nothing else in the frame. KA9Q has an option to

			 * send data with the syn, BSD accepts data with the

			 * syn up to the [to be] advertised window and

			 * Solaris 2.1 gives you a protocol error. For now

			 * we just ignore it, that fits the spec precisely

			 * and avoids incompatibilities. It would be nice in

			 * future to drop through and process the data.

			 *

			 * Now that TTCP is starting to be used we ought to

			 * queue this data.

			 * But, this leaves one open to an easy denial of

			 * service attack, and SYN cookies can't defend

			 * against this problem. So, we drop the data

			 * in the interest of security over speed unless

			 * it's still in use.

			 */

			kfree_skb(skb);

			return 0;

		}

		goto discard;

	case TCP_SYN_SENT:

		/*
		 * A non-negative result from the SYN_SENT handler is returned
		 * as is; a negative one means the rest of the processing is
		 * done by hand below.
		 */
		queued = tcp_rcv_synsent_state_process(sk, skb, th, len);

		if (queued >= 0)

			return queued;

		/* Do step6 onward by hand. */

		tcp_urg(sk, skb, th);

		__kfree_skb(skb);

		tcp_data_snd_check(sk);

		return 0;

	}

	/*
	 * All remaining states: generic validation first. A result <= 0
	 * ends processing here, returning its negation.
	 */
	res = tcp_validate_incoming(sk, skb, th, 0);

	if (res <= 0)

		return -res;

	/* step 5: check the ACK field */

	if (th->ack) {

		int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;

		switch (sk->sk_state) {

		case TCP_SYN_RECV:

			/* Third handshake step, passive side; a bad ACK is rejected. */
			if (acceptable) {

				tp->copied_seq = tp->rcv_nxt;

				smp_mb();

				tcp_set_state(sk, TCP_ESTABLISHED);

				sk->sk_state_change(sk);

				/* Note, that this wakeup is only for marginal

				 * crossed SYN case. Passively open sockets

				 * are not waked up, because sk->sk_sleep ==

				 * NULL and sk->sk_socket == NULL.

				 */

				if (sk->sk_socket)

					sk_wake_async(sk,

						      SOCK_WAKE_IO, POLL_OUT);

				/* Take over ack number and scaled send window from this segment. */
				tp->snd_una = TCP_SKB_CB(skb)->ack_seq;

				tp->snd_wnd = ntohs(th->window) <<

					      tp->rx_opt.snd_wscale;

				tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);

				/* tcp_ack considers this ACK as duplicate

				 * and does not calculate rtt.

				 * Force it here.

				 */

				tcp_ack_update_rtt(sk, 0, 0);

				/* With timestamps in use, the option's size comes off advmss. */
				if (tp->rx_opt.tstamp_ok)

					tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;

				/* Make sure socket is routed, for

				 * correct metrics.

				 */

				icsk->icsk_af_ops->rebuild_header(sk);

				tcp_init_metrics(sk);

				tcp_init_congestion_control(sk);

				/* Prevent spurious tcp_cwnd_restart() on

				 * first data packet.

				 */

				tp->lsndtime = tcp_time_stamp;

				tcp_mtup_init(sk);

				tcp_initialize_rcv_mss(sk);

				tcp_init_buffer_space(sk);

				tcp_fast_path_on(tp);

			} else {

				return 1;

			}

			break;

		case TCP_FIN_WAIT1:

			/* Everything written so far, up to write_seq, has been acked. */
			if (tp->snd_una == tp->write_seq) {

				tcp_set_state(sk, TCP_FIN_WAIT2);

				sk->sk_shutdown |= SEND_SHUTDOWN;

				dst_confirm(sk->sk_dst_cache);

				if (!sock_flag(sk, SOCK_DEAD))

					/* Wake up lingering close() */

					sk->sk_state_change(sk);

				else {

					int tmo;

					/*
					 * SOCK_DEAD socket: abort if linger2 is negative
					 * or the segment carries data beyond rcv_nxt.
					 */
					if (tp->linger2 < 0 ||

					    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&

					     after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {

						tcp_done(sk);

						NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);

						return 1;

					}

					/*
					 * A timeout above TCP_TIMEWAIT_LEN arms the keepalive
					 * timer for the excess; otherwise the socket goes to
					 * time-wait unless a FIN is present or the user owns
					 * the socket.
					 */
					tmo = tcp_fin_time(sk);

					if (tmo > TCP_TIMEWAIT_LEN) {

						inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);

					} else if (th->fin || sock_owned_by_user(sk)) {

						/* Bad case. We could lose such FIN otherwise.

						 * It is not a big problem, but it looks confusing

						 * and not so rare event. We still can lose it now,

						 * if it spins in bh_lock_sock(), but it is really

						 * marginal case.

						 */

						inet_csk_reset_keepalive_timer(sk, tmo);

					} else {

						tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);

						goto discard;

					}

				}

			}

			break;

		case TCP_CLOSING:

			/* Everything sent is acked: enter time-wait, drop the segment. */
			if (tp->snd_una == tp->write_seq) {

				tcp_time_wait(sk, TCP_TIME_WAIT, 0);

				goto discard;

			}

			break;

		case TCP_LAST_ACK:

			/* Everything sent is acked: the connection is finished. */
			if (tp->snd_una == tp->write_seq) {

				tcp_update_metrics(sk);

				tcp_done(sk);

				goto discard;

			}

			break;

		}

	} else

		/* No ACK bit set: the segment is dropped. */
		goto discard;

	/* step 6: check the URG bit */

	tcp_urg(sk, skb, th);

	/* step 7: process the segment text */

	switch (sk->sk_state) {

	case TCP_CLOSE_WAIT:

	case TCP_CLOSING:

	case TCP_LAST_ACK:

		if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))

			break;

		/* seq is before rcv_nxt: deliberately falls into the cases below. */
	case TCP_FIN_WAIT1:

	case TCP_FIN_WAIT2:

		/* RFC 793 says to queue data in these states,

		 * RFC 1122 says we MUST send a reset.

		 * BSD 4.4 also does reset.

		 */

		/* Receive side shut down, yet the segment brings data past rcv_nxt. */
		if (sk->sk_shutdown & RCV_SHUTDOWN) {

			if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&

			    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {

				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);

				tcp_reset(sk);

				return 1;

			}

		}

		/* Fall through */

	case TCP_ESTABLISHED:

		tcp_data_queue(sk, skb);

		queued = 1;

		break;

	}

	/* tcp_data could move socket to TIME-WAIT */

	if (sk->sk_state != TCP_CLOSE) {

		tcp_data_snd_check(sk);

		tcp_ack_snd_check(sk);

	}

	/*
	 * The "discard" label sits inside this block: every
	 * "goto discard" above lands directly on the free, whatever the
	 * value of queued.
	 */
	if (!queued) {

discard:

		__kfree_skb(skb);

	}

	return 0;

}

第三次握手：发送ACK段

tcp_send_ack()用来发送一个ACK段，同时更新窗口。

/*
 * Send a pure ACK segment for @sk; per the original note this also
 * updates the window.
 *
 * Nothing is sent for a socket in TCP_CLOSE. If no buffer can be
 * allocated, an ACK is scheduled and the delayed-ACK timer armed
 * instead, so the acknowledgement is deferred rather than sent now.
 */
void tcp_send_ack(struct sock *sk)
{
	struct sk_buff *ack_skb;

	/* After a reset nothing may be sent any more. */
	if (sk->sk_state == TCP_CLOSE)
		return;

	/*
	 * The buffer never goes onto the write queue, so ownership is
	 * assigned to this sock by tcp_transmit_skb().
	 */
	ack_skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
	if (ack_skb != NULL) {
		/* Leave room for the headers, then set up the control bits. */
		skb_reserve(ack_skb, MAX_TCP_HEADER);
		tcp_init_nondata_skb(ack_skb, tcp_acceptable_seq(sk),
				     TCPCB_FLAG_ACK);

		/* Transmitting also clears any pending delayed ack. */
		TCP_SKB_CB(ack_skb)->when = tcp_time_stamp;
		tcp_transmit_skb(sk, ack_skb, 0, GFP_ATOMIC);
		return;
	}

	/* Allocation failed: fall back to the delayed-ACK timer. */
	inet_csk_schedule_ack(sk);
	inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
				  TCP_DELACK_MAX, TCP_RTO_MAX);
}

发送ACK段时，TCP必须不在CLOSE状态。

为ACK段分配一个SKB，如果分配失败则在启动延时确认定时器后返回。

秒客网

TCP连接的建立（二）

相关文章