【Linux4.1.12源码分析】协议栈报文接收之IP层处理分析(ip_local_deliver)

时间:2022-08-16 11:05:19

报文提交给内核协议栈后,会进入__netif_receive_skb_core函数;如果报文没有被rx_handler消费掉,就会交给ptype_base中注册的协议处理函数,其中既包括内核自身注册的协议,也包括raw socket等创建的协议处理。本文分析普通IPv4报文的处理过程,处理入口函数为ip_rcv。

1、ip_rcv函数

/*
 * ip_rcv - entry point for received IPv4 packets.
 *
 * Sanity-checks the IP header (minimum length, version, header checksum,
 * total length), trims any padding beyond tot_len, sets the transport header
 * offset, clears the IP control block, orphans the skb and then hands it to
 * the NF_INET_PRE_ROUTING netfilter hook with ip_rcv_finish as the
 * continuation.
 *
 * @skb:      the received packet
 * @dev:      device used here only to find the namespace for statistics
 *            and as the input device passed to NF_HOOK
 * @pt:       not referenced in this function
 * @orig_dev: not referenced in this function
 *
 * Returns the NF_HOOK() result when the packet passes validation, otherwise
 * NET_RX_DROP.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
const struct iphdr *iph;
u32 len;

/* When the interface is in promisc. mode, drop all the crap
* that it receives, do not try to analyse it.
*/
if (skb->pkt_type == PACKET_OTHERHOST) // drop packets not addressed to this host; a NIC in promiscuous mode receives such packets
goto drop;


IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);

skb = skb_share_check(skb, GFP_ATOMIC); // if the skb is shared, work on a clone instead
if (!skb) {
/* No skb left to free here, hence the jump past kfree_skb() to "out". */
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
goto out;
}

if (!pskb_may_pull(skb, sizeof(struct iphdr))) // make sure the skb holds at least a basic IP header (sizeof(struct iphdr))
goto inhdr_error;

iph = ip_hdr(skb); // get the IP header

/*
* RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
*
* Is the datagram acceptable?
*
* 1. Length at least the size of an ip header
* 2. Version of 4
* 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
* 4. Doesn't have a bogus length
*/

if (iph->ihl < 5 || iph->version != 4) // header must be at least 20 bytes (ihl >= 5); only IPv4 is accepted
goto inhdr_error;

/*
* Compile-time checks that the per-ECN-codepoint counters are laid out so
* that IPSTATS_MIB_NOECTPKTS + (tos & INET_ECN_MASK) selects the matching
* counter. The count added is gso_segs, but never less than 1.
*/
BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
IP_ADD_STATS_BH(dev_net(dev),
IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));

if (!pskb_may_pull(skb, iph->ihl*4)) // make sure the skb holds the actual header length (ihl*4 bytes)
goto inhdr_error;

/* Re-fetch the header pointer after the second pskb_may_pull(). */
iph = ip_hdr(skb);

if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) // verify the IP header checksum
goto csum_error;

/*
* tot_len larger than what was received: truncated packet, drop.
* tot_len smaller than the header length: malformed header.
*/
len = ntohs(iph->tot_len);
if (skb->len < len) {
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4))
goto inhdr_error;

/* Our transport medium may have padded the buffer out. Now we know it
* is IP we can trim to the true length of the frame.
* Note this now means skb->len holds ntohs(iph->tot_len).
*/
if (pskb_trim_rcsum(skb, len)) { // strip the surplus trailing bytes
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
goto drop;
}

skb->transport_header = skb->network_header + iph->ihl*4; // set the transport-layer header offset

/* Remove any debris in the socket control block */
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); // zero the cb area, i.e. the inet_skb_parm contents

/* Must drop socket now because of tproxy. */
skb_orphan(skb);

return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb, // run the netfilter hook (iptables); on acceptance ip_rcv_finish is called
dev, NULL,
ip_rcv_finish);

/*
* Error labels fall through on purpose: a checksum error is also counted as
* a header error, both free the skb, and every failure returns NET_RX_DROP.
*/
csum_error:
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_CSUMERRORS);
inhdr_error:
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
drop:
kfree_skb(skb);
out:
return NET_RX_DROP;
}

2、ip_rcv_finish函数

/*
 * ip_rcv_finish - continuation of ip_rcv once NF_INET_PRE_ROUTING accepts
 * the packet.
 *
 * Optionally runs the transport protocol's early_demux hook, performs the
 * input route lookup when the skb has no dst yet, processes IP options if
 * present, updates multicast/broadcast counters and finally calls
 * dst_input().
 *
 * @sk:  not referenced in this function
 * @skb: the packet being processed
 *
 * Returns the dst_input() result, or NET_RX_DROP after freeing the skb when
 * the route lookup or option processing fails.
 */
static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;

/* Early demux is attempted only if enabled and the skb has neither a dst nor a socket attached. */
if (sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk) {
const struct net_protocol *ipprot;
int protocol = iph->protocol; // transport-layer protocol number

ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot && ipprot->early_demux) {
ipprot->early_demux(skb); // for packets belonging to a socket, the route can be obtained quickly through that socket
/* must reload iph, skb->head might have changed */
iph = ip_hdr(skb);
}
}

/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
if (!skb_dst(skb)) {
int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, // route lookup; decides the next step: deliver locally, forward, or drop
iph->tos, skb->dev);
if (unlikely(err)) {
/* -EXDEV is additionally counted under LINUX_MIB_IPRPFILTER. */
if (err == -EXDEV)
NET_INC_STATS_BH(dev_net(skb->dev),
LINUX_MIB_IPRPFILTER);
goto drop;
}
}

#ifdef CONFIG_IP_ROUTE_CLASSID
/*
* Per-CPU accounting keyed by the route's tclassid: the low byte indexes
* the "o_" (output) counters, bits 16-23 index the "i_" (input) counters.
*/
if (unlikely(skb_dst(skb)->tclassid)) {
struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
u32 idx = skb_dst(skb)->tclassid;
st[idx&0xFF].o_packets++;
st[idx&0xFF].o_bytes += skb->len;
st[(idx>>16)&0xFF].i_packets++;
st[(idx>>16)&0xFF].i_bytes += skb->len;
}
#endif

/* ihl > 5 means the header carries options; drop if processing them fails. */
if (iph->ihl > 5 && ip_rcv_options(skb))
goto drop;

rt = skb_rtable(skb); // fetch the route entry and account multicast / broadcast packets
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
skb->len);

return dst_input(skb); // continue processing: ip_local_deliver for local delivery, ip_forward for forwarding

drop:
kfree_skb(skb);
return NET_RX_DROP;
}

3、ip_local_deliver函数

/*
 * ip_local_deliver - deliver an IPv4 packet to the local host.
 *
 * A fragment is first handed to ip_defrag(); when that call returns non-zero
 * this function returns 0 and does nothing further with the skb. Otherwise
 * the packet goes through the NF_INET_LOCAL_IN netfilter hook, which calls
 * ip_local_deliver_finish on acceptance.
 *
 * Returns 0 in the ip_defrag() non-zero case, else the NF_HOOK() result.
 */
int ip_local_deliver(struct sk_buff *skb)
{
	/*
	 * Fragments have to be reassembled into a complete datagram before
	 * they can be passed up; ip_defrag() is only invoked for fragments.
	 */
	if (ip_is_fragment(ip_hdr(skb)) &&
	    ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
		return 0;

	/* Run the netfilter LOCAL_IN hook (iptables), then finish delivery. */
	return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, NULL, skb,
		       skb->dev, NULL, ip_local_deliver_finish);
}
4、ip_local_deliver_finish函数

/*
 * ip_local_deliver_finish - continuation of ip_local_deliver once
 * NF_INET_LOCAL_IN accepts the packet.
 *
 * Pulls the network header off the skb, offers the packet to raw sockets,
 * then dispatches it to the handler registered in inet_protos[] for the
 * IP protocol number. If no handler is registered and no raw socket took
 * the packet, an ICMP protocol-unreachable is sent (subject to the xfrm
 * policy check) and the skb is freed.
 *
 * @sk:  not referenced in this function
 * @skb: the packet being delivered
 *
 * Always returns 0.
 */
static int ip_local_deliver_finish(struct sock *sk, struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);

__skb_pull(skb, skb_network_header_len(skb)); // advance the skb data past the network header to the transport header

rcu_read_lock();
{
int protocol = ip_hdr(skb)->protocol; // protocol field of the IP header, i.e. the layer-4 protocol
const struct net_protocol *ipprot;
int raw;

resubmit:
raw = raw_local_deliver(skb, protocol); // entry point for AF_INET raw socket processing

ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot) {
int ret;

/*
* NOTE(review): the original annotation claims that in 4.1.12 every
* protocol sets no_policy, so this branch would never run. That is not
* evident from this code -- verify against the protocol registrations.
*/
if (!ipprot->no_policy) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { // IPsec policy check
kfree_skb(skb);
goto out;
}
nf_reset(skb);
}
ret = ipprot->handler(skb); // hand the packet to the upper-layer handler (UDP/TCP/ICMP, etc.)
/* A negative return requests another delivery pass using protocol number -ret. */
if (ret < 0) {
protocol = -ret;
goto resubmit;
}
IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
} else { // no handler registered for this protocol
if (!raw) { // raw_local_deliver() returned 0: run the IPsec policy check and, if it passes, send an ICMP message
if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
kfree_skb(skb);
} else {
/* raw_local_deliver() returned non-zero: count as delivered and release the skb. */
IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
consume_skb(skb);
}
}
}
out:
rcu_read_unlock();

return 0;
}