【Linux4.1.12源码分析】VXLAN报文内核协议栈处理

时间:2022-07-20 11:04:54

4.1.12内核已经支持VXLAN报文的GRO功能,这意味着VXLAN报文在交给协议栈(即到达UDP层的encap_rcv回调)之前,就已经被聚合过了;而在早期的内核中,聚合逻辑是在encap_rcv函数之后实现的。

之前分析的UDP报文处理中,可以知道如果udp_sock定义了encap_rcv函数,将会把报文交给该函数处理,而不是传统的保存到sock队列,唤醒进程收包。

udp_sock定义的encap_rcv函数是在vxlan_socket_create函数中设置的,实际是vxlan_udp_encap_recv函数。

vxlan_udp_encap_recv函数

/*
 * Callback from net/ipv4/udp.c to receive packets.
 *
 * Validates the VXLAN header that follows the UDP header, strips it, and
 * hands the inner frame to the receive callback stored in the vxlan_sock
 * attached to @sk.
 *
 * Returns 0 when the skb has been consumed here (either passed to vs->rcv
 * or freed on the drop path), and 1 when the packet is not accepted as
 * VXLAN (too short, or unexpected flag/reserved bits); in that case the skb
 * is not freed by this function.
 *
 * NOTE(review): the reserved-bits check jumps to bad_flags after
 * iptunnel_pull_header() has already run, so on that path 1 is returned
 * with the header already pulled - confirm the caller copes with this.
 */
static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
struct vxlan_sock *vs;
struct vxlanhdr *vxh;
u32 flags, vni;
struct vxlan_metadata md = {0};

/* Need Vxlan and inner Ethernet header to be present */
if (!pskb_may_pull(skb, VXLAN_HLEN)) // packet length check
goto error;

// The VXLAN header begins right after the UDP header: "+ 1" is pointer
// arithmetic that steps past one whole UDP header.
vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
flags = ntohl(vxh->vx_flags);
vni = ntohl(vxh->vx_vni);

if (flags & VXLAN_HF_VNI) { // a valid VXLAN packet must carry this flag
flags &= ~VXLAN_HF_VNI;
} else {
/* VNI flag always required to be set */
goto bad_flags;
}

if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB))) // pull VXLAN_HLEN bytes so the skb points at the inner frame
goto drop;
// Re-read the header pointer after the pull (presumably because the pull
// may change the skb's header buffer - TODO confirm).
vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);

vs = rcu_dereference_sk_user_data(sk);
if (!vs)
goto drop;

// Remote checksum offload: handled only when the packet carries
// VXLAN_HF_RCO and this socket has VXLAN_F_REMCSUM_RX enabled.
// NOTE(review): per the article author, VXLAN_HF_RCO is set by a sender
// configured with VXLAN_F_REMCSUM_TX for packets whose ip_summed is
// CHECKSUM_PARTIAL; that is not visible in this function.
if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
vxh = vxlan_remcsum(skb, vxh, sizeof(struct vxlanhdr), vni,
!!(vs->flags & VXLAN_F_REMCSUM_NOPARTIAL)); // a NULL result means the remcsum step failed; drop the packet
if (!vxh)
goto drop;

flags &= ~VXLAN_HF_RCO; // mark the RCO flag as handled
vni &= VXLAN_VNI_MASK; // keep only the VNI bits, clearing everything outside the mask
}

/* For backwards compatibility, only allow reserved fields to be
* used by VXLAN extensions if explicitly requested.
*/
if ((flags & VXLAN_HF_GBP) && (vs->flags & VXLAN_F_GBP)) {
struct vxlanhdr_gbp *gbp;

// Reinterpret the same header bytes using the GBP layout.
gbp = (struct vxlanhdr_gbp *)vxh;
md.gbp = ntohs(gbp->policy_id);

if (gbp->dont_learn)
md.gbp |= VXLAN_GBP_DONT_LEARN;

if (gbp->policy_applied)
md.gbp |= VXLAN_GBP_POLICY_APPLIED;

flags &= ~VXLAN_GBP_USED_BITS;
}

if (flags || vni & ~VXLAN_VNI_MASK) { // reject if any flag is still unhandled or any bit outside VXLAN_VNI_MASK is set
/* If there are any unprocessed flags remaining treat
* this as a malformed packet. This behavior diverges from
* VXLAN RFC (RFC7348) which stipulates that bits in
* reserved fields are to be ignored. The approach here
* maintains compatibility with previous stack code, and also
* is more robust and provides a little more security in
* adding extensions to VXLAN.
*/

goto bad_flags;
}

// The VNI is stored exactly as read from the header (no byte-order
// conversion here); the receive callback converts it.
md.vni = vxh->vx_vni;
// vs->rcv is the per-socket receive callback. Per the article, the vxlan
// driver supplies its own vxlan_rcv, while a VXLAN port created by the
// in-tree OVS uses the vxlan_rcv defined by OVS.
vs->rcv(vs, skb, &md);
return 0;

drop:
/* Consume bad packet */
kfree_skb(skb);
return 0;

bad_flags:
netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));

error:
/* Return non vxlan pkt */
return 1;
}
vxlan_rcv函数(内核自带OVS创建vxlan端口时指定)

static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
struct vxlan_metadata *md)
{
struct ovs_tunnel_info tun_info;
struct vxlan_port *vxlan_port;
struct vport *vport = vs->data;
struct iphdr *iph;
struct ovs_vxlan_opts opts = {
.gbp = md->gbp,
};
__be64 key;
__be16 flags;

flags = TUNNEL_KEY | (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0);
vxlan_port = vxlan_vport(vport);
if (vxlan_port->exts & VXLAN_F_GBP && md->gbp)
flags |= TUNNEL_VXLAN_OPT;

/* Save outer tunnel values */
iph = ip_hdr(skb);
key = cpu_to_be64(ntohl(md->vni) >> 8);
ovs_flow_tun_info_init(&tun_info, iph,
udp_hdr(skb)->source, udp_hdr(skb)->dest,
key, flags, &opts, sizeof(opts));

ovs_vport_receive(vport, skb, &tun_info); //调用OVS收包函数
}
vxlan_rcv函数(内核vxlan驱动自带,非OVS场景下使用)

/*
 * Receive callback of the vxlan driver itself (non-OVS case).
 *
 * Looks up the vxlan device for the VNI in @md, re-targets @skb at that
 * device as an Ethernet frame, optionally learns the source MAC, performs
 * the ECN decapsulation check, updates per-CPU rx statistics and queues the
 * packet with netif_rx(). On any failure the skb is freed.
 *
 * @vs:  VXLAN socket the packet arrived on.
 * @skb: packet, already advanced past the VXLAN header by the caller.
 * @md:  metadata from the encap callback (vni unconverted, gbp).
 */
static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
struct vxlan_metadata *md)
{
struct iphdr *oip = NULL;
struct ipv6hdr *oip6 = NULL;
struct vxlan_dev *vxlan;
struct pcpu_sw_netstats *stats;
union vxlan_addr saddr;
__u32 vni;
int err = 0;
union vxlan_addr *remote_ip;

// Convert to host order and shift right by 8 to drop the low byte.
vni = ntohl(md->vni) >> 8;
/* Is this VNI defined? */
vxlan = vxlan_vs_find_vni(vs, vni);
if (!vxlan)
goto drop;

remote_ip = &vxlan->default_dst.remote_ip;
skb_reset_mac_header(skb);
// The second argument is true when the device lives in a different netns
// than the vxlan instance.
skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev)));
skb->protocol = eth_type_trans(skb, vxlan->dev); // determine the inner protocol; per the article this also sets skb->dev to the vxlan device
// NOTE(review): this is a checksum adjustment for the ETH_HLEN bytes of
// inner Ethernet header, not a data-pointer move - presumably it
// compensates for the header consumed by eth_type_trans(); confirm
// against skbuff.h.
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

/* Ignore packet loops (and multicast echo) */
if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr)) // inner source MAC equals the vxlan device's own MAC: drop
goto drop;

/* Re-examine inner Ethernet packet */
// NOTE(review): the network header is only reset further down, so
// ip_hdr()/ipv6_hdr() here presumably still refer to the outer header
// (hence the names oip/oip6) - confirm.
// NOTE(review): if the family is not AF_INET and CONFIG_IPV6 is disabled,
// neither branch runs: oip and oip6 stay NULL and saddr is left unset.
if (remote_ip->sa.sa_family == AF_INET) {
oip = ip_hdr(skb);
saddr.sin.sin_addr.s_addr = oip->saddr;
saddr.sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
} else {
oip6 = ipv6_hdr(skb);
saddr.sin6.sin6_addr = oip6->saddr;
saddr.sa.sa_family = AF_INET6;
#endif
}

// When learning is enabled, vxlan_snoop() is given the source IP and the
// inner source MAC (per the article: fdb learning of the MAC-to-IP
// mapping); a non-zero result drops the packet.
if ((vxlan->flags & VXLAN_F_LEARN) &&
vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
goto drop;

skb_reset_network_header(skb);
skb->mark = md->gbp;

if (oip6)
err = IP6_ECN_decapsulate(oip6, skb);
if (oip)
err = IP_ECN_decapsulate(oip, skb); // ECN/TOS check between the saved header and the packet

if (unlikely(err)) {
if (log_ecn_error) {
if (oip6)
net_info_ratelimited("non-ECT from %pI6\n",
&oip6->saddr);
if (oip)
net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
&oip->saddr, oip->tos);
}
// Only err > 1 is fatal: count it and drop. Smaller non-zero values are
// logged (if enabled) and the packet continues.
if (err > 1) {
++vxlan->dev->stats.rx_frame_errors;
++vxlan->dev->stats.rx_errors;
goto drop;
}
}

// Per-CPU rx counters, updated inside the u64_stats begin/end pair.
stats = this_cpu_ptr(vxlan->dev->tstats);
u64_stats_update_begin(&stats->syncp);
stats->rx_packets++;
stats->rx_bytes += skb->len;
u64_stats_update_end(&stats->syncp);

netif_rx(skb); // hand the packet to the stack; skb->dev is the vxlan device at this point

return;
drop:
/* Consume bad packet */
kfree_skb(skb);
}