From 20fd4d1f04da07d09192ad8ad366a70d5125bfaf Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 17 Jun 2013 17:49:32 -0700 Subject: gre: Simplify gre protocol registration locking. Use cmpxchg() for atomic protocol registration which saves code and data space. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c index b2e805a..1e294d5 100644 --- a/net/ipv4/gre.c +++ b/net/ipv4/gre.c @@ -26,46 +26,32 @@ static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly; -static DEFINE_SPINLOCK(gre_proto_lock); int gre_add_protocol(const struct gre_protocol *proto, u8 version) { if (version >= GREPROTO_MAX) - goto err_out; - - spin_lock(&gre_proto_lock); - if (gre_proto[version]) - goto err_out_unlock; - - RCU_INIT_POINTER(gre_proto[version], proto); - spin_unlock(&gre_proto_lock); - return 0; + return -EINVAL; -err_out_unlock: - spin_unlock(&gre_proto_lock); -err_out: - return -1; + return (cmpxchg((const struct gre_protocol **)&gre_proto[version], NULL, proto) == NULL) ? + 0 : -EBUSY; } EXPORT_SYMBOL_GPL(gre_add_protocol); int gre_del_protocol(const struct gre_protocol *proto, u8 version) { + int ret; + if (version >= GREPROTO_MAX) - goto err_out; - - spin_lock(&gre_proto_lock); - if (rcu_dereference_protected(gre_proto[version], - lockdep_is_held(&gre_proto_lock)) != proto) - goto err_out_unlock; - RCU_INIT_POINTER(gre_proto[version], NULL); - spin_unlock(&gre_proto_lock); + return -EINVAL; + + ret = (cmpxchg((const struct gre_protocol **)&gre_proto[version], proto, NULL) == proto) ? + 0 : -EBUSY; + + if (ret) + return ret; + synchronize_rcu(); return 0; - -err_out_unlock: - spin_unlock(&gre_proto_lock); -err_out: - return -1; } EXPORT_SYMBOL_GPL(gre_del_protocol); -- cgit v0.10.2 From bda7bb46343647f68591366731295a0f3eea59ed Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 17 Jun 2013 17:49:38 -0700 Subject: gre: Allow multiple protocol listener for gre protocol. Currently there is only one user is allowed to register for gre protocol. Following patch adds de-multiplexer. So that multiple modules can listen on gre protocol e.g. kernel gre devices and ovs. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller diff --git a/include/net/gre.h b/include/net/gre.h index 9f03a39..c6ea0c7 100644 --- a/include/net/gre.h +++ b/include/net/gre.h @@ -7,6 +7,7 @@ #define GREPROTO_CISCO 0 #define GREPROTO_PPTP 1 #define GREPROTO_MAX 2 +#define GRE_IP_PROTO_MAX 2 struct gre_protocol { int (*handler)(struct sk_buff *skb); @@ -22,6 +23,29 @@ struct gre_base_hdr { int gre_add_protocol(const struct gre_protocol *proto, u8 version); int gre_del_protocol(const struct gre_protocol *proto, u8 version); +struct gre_cisco_protocol { + int (*handler)(struct sk_buff *skb, const struct tnl_ptk_info *tpi); + int (*err_handler)(struct sk_buff *skb, u32 info, + const struct tnl_ptk_info *tpi); + u8 priority; +}; + +int gre_cisco_register(struct gre_cisco_protocol *proto); +int gre_cisco_unregister(struct gre_cisco_protocol *proto); + +static inline int ip_gre_calc_hlen(__be16 o_flags) +{ + int addend = 4; + + if (o_flags&TUNNEL_CSUM) + addend += 4; + if (o_flags&TUNNEL_KEY) + addend += 4; + if (o_flags&TUNNEL_SEQ) + addend += 4; + return addend; +} + static inline __be16 gre_flags_to_tnl_flags(__be16 flags) { __be16 tflags = 0; diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c index 1e294d5..8b9a373 100644 --- a/net/ipv4/gre.c +++ b/net/ipv4/gre.c @@ -13,6 +13,8 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include +#include #include #include #include @@ -24,8 +26,12 @@ #include #include +#include +#include +#include static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly; +static struct gre_cisco_protocol __rcu *gre_cisco_proto_list[GRE_IP_PROTO_MAX]; int gre_add_protocol(const struct gre_protocol *proto, u8 version) { @@ -55,6 +61,173 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version) } EXPORT_SYMBOL_GPL(gre_del_protocol); +static __sum16 check_checksum(struct sk_buff *skb) +{ + __sum16 csum = 0; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + csum = csum_fold(skb->csum); + + if (!csum) + break; + /* Fall through. */ + + case CHECKSUM_NONE: + skb->csum = 0; + csum = __skb_checksum_complete(skb); + skb->ip_summed = CHECKSUM_COMPLETE; + break; + } + + return csum; +} + +static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, + bool *csum_err) +{ + unsigned int ip_hlen = ip_hdrlen(skb); + const struct gre_base_hdr *greh; + __be32 *options; + int hdr_len; + + if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr)))) + return -EINVAL; + + greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen); + if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) + return -EINVAL; + + tpi->flags = gre_flags_to_tnl_flags(greh->flags); + hdr_len = ip_gre_calc_hlen(tpi->flags); + + if (!pskb_may_pull(skb, hdr_len)) + return -EINVAL; + + greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen); + tpi->proto = greh->protocol; + + options = (__be32 *)(greh + 1); + if (greh->flags & GRE_CSUM) { + if (check_checksum(skb)) { + *csum_err = true; + return -EINVAL; + } + options++; + } + + if (greh->flags & GRE_KEY) { + tpi->key = *options; + options++; + } else + tpi->key = 0; + + if (unlikely(greh->flags & GRE_SEQ)) { + tpi->seq = *options; + options++; + } else + tpi->seq = 0; + + /* WCCP version 1 and 2 protocol decoding. + * - Change protocol to IP + * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header + */ + if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) { + tpi->proto = htons(ETH_P_IP); + if ((*(u8 *)options & 0xF0) != 0x40) { + hdr_len += 4; + if (!pskb_may_pull(skb, hdr_len)) + return -EINVAL; + } + } + return 0; +} + +static int gre_cisco_rcv(struct sk_buff *skb) +{ + struct tnl_ptk_info tpi; + int i; + bool csum_err = false; + + if (parse_gre_header(skb, &tpi, &csum_err) < 0) + goto drop; + + rcu_read_lock(); + for (i = 0; i < GRE_IP_PROTO_MAX; i++) { + struct gre_cisco_protocol *proto; + int ret; + + proto = rcu_dereference(gre_cisco_proto_list[i]); + if (!proto) + continue; + ret = proto->handler(skb, &tpi); + if (ret == PACKET_RCVD) { + rcu_read_unlock(); + return 0; + } + } + rcu_read_unlock(); + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); +drop: + kfree_skb(skb); + return 0; +} + +static void gre_cisco_err(struct sk_buff *skb, u32 info) +{ + /* All the routers (except for Linux) return only + * 8 bytes of packet payload. It means, that precise relaying of + * ICMP in the real Internet is absolutely infeasible. + * + * Moreover, Cisco "wise men" put GRE key to the third word + * in GRE header. It makes impossible maintaining even soft + * state for keyed + * GRE tunnels with enabled checksum. Tell them "thank you". + * + * Well, I wonder, rfc1812 was written by Cisco employee, + * what the hell these idiots break standards established + * by themselves??? + */ + + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; + struct tnl_ptk_info tpi; + bool csum_err = false; + int i; + + if (parse_gre_header(skb, &tpi, &csum_err)) { + if (!csum_err) /* ignore csum errors. */ + return; + } + + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + ipv4_update_pmtu(skb, dev_net(skb->dev), info, + skb->dev->ifindex, 0, IPPROTO_GRE, 0); + return; + } + if (type == ICMP_REDIRECT) { + ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0, + IPPROTO_GRE, 0); + return; + } + + rcu_read_lock(); + for (i = 0; i < GRE_IP_PROTO_MAX; i++) { + struct gre_cisco_protocol *proto; + + proto = rcu_dereference(gre_cisco_proto_list[i]); + if (!proto) + continue; + + if (proto->err_handler(skb, info, &tpi) == PACKET_RCVD) + goto out; + + } +out: + rcu_read_unlock(); +} + static int gre_rcv(struct sk_buff *skb) { const struct gre_protocol *proto; @@ -206,27 +379,68 @@ static const struct net_offload gre_offload = { }, }; +static const struct gre_protocol ipgre_protocol = { + .handler = gre_cisco_rcv, + .err_handler = gre_cisco_err, +}; + +int gre_cisco_register(struct gre_cisco_protocol *newp) +{ + struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **) + &gre_cisco_proto_list[newp->priority]; + + return (cmpxchg(proto, NULL, newp) == NULL) ? 0 : -EBUSY; +} +EXPORT_SYMBOL_GPL(gre_cisco_register); + +int gre_cisco_unregister(struct gre_cisco_protocol *del_proto) +{ + struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **) + &gre_cisco_proto_list[del_proto->priority]; + int ret; + + ret = (cmpxchg(proto, del_proto, NULL) == del_proto) ? 0 : -EINVAL; + + if (ret) + return ret; + + synchronize_net(); + return 0; +} +EXPORT_SYMBOL_GPL(gre_cisco_unregister); + static int __init gre_init(void) { pr_info("GRE over IPv4 demultiplexor driver\n"); if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { pr_err("can't add protocol\n"); - return -EAGAIN; + goto err; + } + + if (gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) { + pr_info("%s: can't add ipgre handler\n", __func__); + goto err_gre; } if (inet_add_offload(&gre_offload, IPPROTO_GRE)) { pr_err("can't add protocol offload\n"); - inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); - return -EAGAIN; + goto err_gso; } return 0; +err_gso: + gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); +err_gre: + inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); +err: + return -EAGAIN; } static void __exit gre_exit(void) { inet_del_offload(&gre_offload, IPPROTO_GRE); + gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); } @@ -236,4 +450,3 @@ module_exit(gre_exit); MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver"); MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); MODULE_LICENSE("GPL"); - diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index a982657..19863a8 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -121,103 +121,8 @@ static int ipgre_tunnel_init(struct net_device *dev); static int ipgre_net_id __read_mostly; static int gre_tap_net_id __read_mostly; -static __sum16 check_checksum(struct sk_buff *skb) -{ - __sum16 csum = 0; - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - csum = csum_fold(skb->csum); - - if (!csum) - break; - /* Fall through. */ - - case CHECKSUM_NONE: - skb->csum = 0; - csum = __skb_checksum_complete(skb); - skb->ip_summed = CHECKSUM_COMPLETE; - break; - } - - return csum; -} - -static int ip_gre_calc_hlen(__be16 o_flags) -{ - int addend = 4; - - if (o_flags&TUNNEL_CSUM) - addend += 4; - if (o_flags&TUNNEL_KEY) - addend += 4; - if (o_flags&TUNNEL_SEQ) - addend += 4; - return addend; -} - -static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, - bool *csum_err, int *hdr_len) -{ - unsigned int ip_hlen = ip_hdrlen(skb); - const struct gre_base_hdr *greh; - __be32 *options; - - if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr)))) - return -EINVAL; - - greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen); - if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) - return -EINVAL; - - tpi->flags = gre_flags_to_tnl_flags(greh->flags); - *hdr_len = ip_gre_calc_hlen(tpi->flags); - - if (!pskb_may_pull(skb, *hdr_len)) - return -EINVAL; - - greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen); - - tpi->proto = greh->protocol; - - options = (__be32 *)(greh + 1); - if (greh->flags & GRE_CSUM) { - if (check_checksum(skb)) { - *csum_err = true; - return -EINVAL; - } - options++; - } - - if (greh->flags & GRE_KEY) { - tpi->key = *options; - options++; - } else - tpi->key = 0; - - if (unlikely(greh->flags & GRE_SEQ)) { - tpi->seq = *options; - options++; - } else - tpi->seq = 0; - - /* WCCP version 1 and 2 protocol decoding. - * - Change protocol to IP - * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header - */ - if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) { - tpi->proto = htons(ETH_P_IP); - if ((*(u8 *)options & 0xF0) != 0x40) { - *hdr_len += 4; - if (!pskb_may_pull(skb, *hdr_len)) - return -EINVAL; - } - } - - return 0; -} - -static void ipgre_err(struct sk_buff *skb, u32 info) +static int ipgre_err(struct sk_buff *skb, u32 info, + const struct tnl_ptk_info *tpi) { /* All the routers (except for Linux) return only @@ -239,26 +144,18 @@ static void ipgre_err(struct sk_buff *skb, u32 info) const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; - struct tnl_ptk_info tpi; - int hdr_len; - bool csum_err = false; - - if (parse_gre_header(skb, &tpi, &csum_err, &hdr_len)) { - if (!csum_err) /* ignore csum errors. */ - return; - } switch (type) { default: case ICMP_PARAMETERPROB: - return; + return PACKET_RCVD; case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: case ICMP_PORT_UNREACH: /* Impossible event. */ - return; + return PACKET_RCVD; default: /* All others are translated to HOST_UNREACH. rfc2003 contains "deep thoughts" about NET_UNREACH, @@ -269,79 +166,61 @@ static void ipgre_err(struct sk_buff *skb, u32 info) break; case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) - return; + return PACKET_RCVD; break; case ICMP_REDIRECT: break; } - if (tpi.proto == htons(ETH_P_TEB)) + if (tpi->proto == htons(ETH_P_TEB)) itn = net_generic(net, gre_tap_net_id); else itn = net_generic(net, ipgre_net_id); iph = (const struct iphdr *)skb->data; - t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags, - iph->daddr, iph->saddr, tpi.key); + t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, + iph->daddr, iph->saddr, tpi->key); if (t == NULL) - return; + return PACKET_REJECT; - if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { - ipv4_update_pmtu(skb, dev_net(skb->dev), info, - t->parms.link, 0, IPPROTO_GRE, 0); - return; - } - if (type == ICMP_REDIRECT) { - ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, - IPPROTO_GRE, 0); - return; - } if (t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) - return; + return PACKET_RCVD; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) - return; + return PACKET_RCVD; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; + return PACKET_RCVD; } -static int ipgre_rcv(struct sk_buff *skb) +static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) { struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn; const struct iphdr *iph; struct ip_tunnel *tunnel; - struct tnl_ptk_info tpi; - int hdr_len; - bool csum_err = false; - - if (parse_gre_header(skb, &tpi, &csum_err, &hdr_len) < 0) - goto drop; - if (tpi.proto == htons(ETH_P_TEB)) + if (tpi->proto == htons(ETH_P_TEB)) itn = net_generic(net, gre_tap_net_id); else itn = net_generic(net, ipgre_net_id); iph = ip_hdr(skb); - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags, - iph->saddr, iph->daddr, tpi.key); + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, + iph->saddr, iph->daddr, tpi->key); if (tunnel) { - ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); - return 0; + ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error); + return PACKET_RCVD; } - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); -drop: - kfree_skb(skb); - return 0; + return PACKET_REJECT; } static struct sk_buff *handle_offloads(struct ip_tunnel *tunnel, struct sk_buff *skb) @@ -708,9 +587,10 @@ static int ipgre_tunnel_init(struct net_device *dev) return ip_tunnel_init(dev); } -static const struct gre_protocol ipgre_protocol = { - .handler = ipgre_rcv, - .err_handler = ipgre_err, +static struct gre_cisco_protocol ipgre_protocol = { + .handler = ipgre_rcv, + .err_handler = ipgre_err, + .priority = 0, }; static int __net_init ipgre_init_net(struct net *net) @@ -978,7 +858,7 @@ static int __init ipgre_init(void) if (err < 0) goto pnet_tap_faied; - err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); + err = gre_cisco_register(&ipgre_protocol); if (err < 0) { pr_info("%s: can't add protocol\n", __func__); goto add_proto_failed; @@ -997,7 +877,7 @@ static int __init ipgre_init(void) tap_ops_failed: rtnl_link_unregister(&ipgre_link_ops); rtnl_link_failed: - gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); + gre_cisco_unregister(&ipgre_protocol); add_proto_failed: unregister_pernet_device(&ipgre_tap_net_ops); pnet_tap_faied: @@ -1009,8 +889,7 @@ static void __exit ipgre_fini(void) { rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_link_ops); - if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) - pr_info("%s: can't remove protocol\n", __func__); + gre_cisco_unregister(&ipgre_protocol); unregister_pernet_device(&ipgre_tap_net_ops); unregister_pernet_device(&ipgre_net_ops); } -- cgit v0.10.2 From 752f36da68e9136df8918461d651723a43627e04 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 17 Jun 2013 17:49:45 -0700 Subject: gre: export gre_build_header() function. This is required for ovs gre module. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller diff --git a/include/net/gre.h b/include/net/gre.h index c6ea0c7..cbb9d51 100644 --- a/include/net/gre.h +++ b/include/net/gre.h @@ -32,6 +32,8 @@ struct gre_cisco_protocol { int gre_cisco_register(struct gre_cisco_protocol *proto); int gre_cisco_unregister(struct gre_cisco_protocol *proto); +void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, + int hdr_len); static inline int ip_gre_calc_hlen(__be16 o_flags) { diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c index 8b9a373..1cbc465 100644 --- a/net/ipv4/gre.c +++ b/net/ipv4/gre.c @@ -61,6 +61,38 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version) } EXPORT_SYMBOL_GPL(gre_del_protocol); +void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, + int hdr_len) +{ + struct gre_base_hdr *greh; + + skb_push(skb, hdr_len); + + greh = (struct gre_base_hdr *)skb->data; + greh->flags = tnl_flags_to_gre_flags(tpi->flags); + greh->protocol = tpi->proto; + + if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) { + __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); + + if (tpi->flags&TUNNEL_SEQ) { + *ptr = tpi->seq; + ptr--; + } + if (tpi->flags&TUNNEL_KEY) { + *ptr = tpi->key; + ptr--; + } + if (tpi->flags&TUNNEL_CSUM && + !(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) { + *ptr = 0; + *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, + skb->len, 0)); + } + } +} +EXPORT_SYMBOL_GPL(gre_build_header); + static __sum16 check_checksum(struct sk_buff *skb) { __sum16 csum = 0; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 19863a8..362c7c4 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -248,40 +248,6 @@ error: return ERR_PTR(err); } -static struct sk_buff *gre_build_header(struct sk_buff *skb, - const struct tnl_ptk_info *tpi, - int hdr_len) -{ - struct gre_base_hdr *greh; - - skb_push(skb, hdr_len); - - greh = (struct gre_base_hdr *)skb->data; - greh->flags = tnl_flags_to_gre_flags(tpi->flags); - greh->protocol = tpi->proto; - - if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) { - __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); - - if (tpi->flags&TUNNEL_SEQ) { - *ptr = tpi->seq; - ptr--; - } - if (tpi->flags&TUNNEL_KEY) { - *ptr = tpi->key; - ptr--; - } - if (tpi->flags&TUNNEL_CSUM && - !(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) { - *(__sum16 *)ptr = 0; - *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, - skb->len, 0)); - } - } - - return skb; -} - static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, __be16 proto) @@ -302,11 +268,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, tpi.seq = htonl(tunnel->o_seqno); /* Push GRE header. */ - skb = gre_build_header(skb, &tpi, tunnel->hlen); - if (unlikely(!skb)) { - dev->stats.tx_dropped++; - return; - } + gre_build_header(skb, &tpi, tunnel->hlen); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } -- cgit v0.10.2 From 45f2e9976cb6fc3f1cc533fd53fe74da5a9dbce4 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 17 Jun 2013 17:49:51 -0700 Subject: gre: export gre_handle_offloads() function. This is required for OVS GRE offloading. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller diff --git a/include/net/gre.h b/include/net/gre.h index cbb9d51..a5a4ddf 100644 --- a/include/net/gre.h +++ b/include/net/gre.h @@ -34,6 +34,7 @@ int gre_cisco_register(struct gre_cisco_protocol *proto); int gre_cisco_unregister(struct gre_cisco_protocol *proto); void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, int hdr_len); +struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum); static inline int ip_gre_calc_hlen(__be16 o_flags) { diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c index 1cbc465..5ecc9c4 100644 --- a/net/ipv4/gre.c +++ b/net/ipv4/gre.c @@ -93,6 +93,35 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, } EXPORT_SYMBOL_GPL(gre_build_header); +struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum) +{ + int err; + + if (likely(!skb->encapsulation)) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } + + if (skb_is_gso(skb)) { + err = skb_unclone(skb, GFP_ATOMIC); + if (unlikely(err)) + goto error; + skb_shinfo(skb)->gso_type |= SKB_GSO_GRE; + return skb; + } else if (skb->ip_summed == CHECKSUM_PARTIAL && gre_csum) { + err = skb_checksum_help(skb); + if (unlikely(err)) + goto error; + } else if (skb->ip_summed != CHECKSUM_PARTIAL) + skb->ip_summed = CHECKSUM_NONE; + + return skb; +error: + kfree_skb(skb); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(gre_handle_offloads); + static __sum16 check_checksum(struct sk_buff *skb) { __sum16 csum = 0; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 362c7c4..c326e86 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -223,31 +223,6 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) return PACKET_REJECT; } -static struct sk_buff *handle_offloads(struct ip_tunnel *tunnel, struct sk_buff *skb) -{ - int err; - - if (skb_is_gso(skb)) { - err = skb_unclone(skb, GFP_ATOMIC); - if (unlikely(err)) - goto error; - skb_shinfo(skb)->gso_type |= SKB_GSO_GRE; - return skb; - } else if (skb->ip_summed == CHECKSUM_PARTIAL && - tunnel->parms.o_flags&TUNNEL_CSUM) { - err = skb_checksum_help(skb); - if (unlikely(err)) - goto error; - } else if (skb->ip_summed != CHECKSUM_PARTIAL) - skb->ip_summed = CHECKSUM_NONE; - - return skb; - -error: - kfree_skb(skb); - return ERR_PTR(err); -} - static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, __be16 proto) @@ -255,11 +230,6 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, struct ip_tunnel *tunnel = netdev_priv(dev); struct tnl_ptk_info tpi; - if (likely(!skb->encapsulation)) { - skb_reset_inner_headers(skb); - skb->encapsulation = 1; - } - tpi.flags = tunnel->parms.o_flags; tpi.proto = proto; tpi.key = tunnel->parms.o_key; @@ -279,7 +249,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tnl_params; - skb = handle_offloads(tunnel, skb); + skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM)); if (IS_ERR(skb)) goto out; @@ -318,7 +288,7 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, { struct ip_tunnel *tunnel = netdev_priv(dev); - skb = handle_offloads(tunnel, skb); + skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM)); if (IS_ERR(skb)) goto out; -- cgit v0.10.2 From 0e6fbc5b6c6218987c93b8c7ca60cf786062899d Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 17 Jun 2013 17:49:56 -0700 Subject: ip_tunnels: extend iptunnel_xmit() Refactor various ip tunnels xmit functions and extend iptunnel_xmit() so that there is more code sharing. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index f6dce13..284c6c0 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1021,7 +1021,6 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct vxlan_dev *vxlan = netdev_priv(dev); struct rtable *rt; const struct iphdr *old_iph; - struct iphdr *iph; struct vxlanhdr *vxh; struct udphdr *uh; struct flowi4 fl4; @@ -1030,6 +1029,7 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, u32 vni; __be16 df = 0; __u8 tos, ttl; + int err; dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port; vni = rdst->remote_vni; @@ -1097,13 +1097,6 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, vxlan_encap_bypass(skb, vxlan, dst_vxlan); return NETDEV_TX_OK; } - - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | - IPSKB_REROUTED); - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = htonl(VXLAN_FLAGS); vxh->vx_vni = htonl(vni << 8); @@ -1118,27 +1111,18 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, uh->len = htons(skb->len); uh->check = 0; - __skb_push(skb, sizeof(*iph)); - skb_reset_network_header(skb); - iph = ip_hdr(skb); - iph->version = 4; - iph->ihl = sizeof(struct iphdr) >> 2; - iph->frag_off = df; - iph->protocol = IPPROTO_UDP; - iph->tos = ip_tunnel_ecn_encap(tos, old_iph, skb); - iph->daddr = dst; - iph->saddr = fl4.saddr; - iph->ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - tunnel_ip_select_ident(skb, old_iph, &rt->dst); - - nf_reset(skb); - vxlan_set_owner(dev, skb); if (handle_offloads(skb)) goto drop; - iptunnel_xmit(skb, dev); + tos = ip_tunnel_ecn_encap(tos, old_iph, skb); + ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); + + err = iptunnel_xmit(dev_net(dev), rt, skb, fl4.saddr, dst, + IPPROTO_UDP, tos, ttl, df); + iptunnel_xmit_stats(err, &dev->stats, dev->tstats); + return NETDEV_TX_OK; drop: diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 1be442f..b84f1ab 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -155,23 +155,27 @@ static inline void tunnel_ip_select_ident(struct sk_buff *skb, (skb_shinfo(skb)->gso_segs ?: 1) - 1); } -static inline void iptunnel_xmit(struct sk_buff *skb, struct net_device *dev) +int iptunnel_xmit(struct net *net, struct rtable *rt, + struct sk_buff *skb, + __be32 src, __be32 dst, __u8 proto, + __u8 tos, __u8 ttl, __be16 df); + +static inline void iptunnel_xmit_stats(int err, + struct net_device_stats *err_stats, + struct pcpu_tstats __percpu *stats) { - int err; - int pkt_len = skb->len - skb_transport_offset(skb); - struct pcpu_tstats *tstats = this_cpu_ptr(dev->tstats); + if (err > 0) { + struct pcpu_tstats *tstats = this_cpu_ptr(stats); - nf_reset(skb); - - err = ip_local_out(skb); - if (likely(net_xmit_eval(err) == 0)) { u64_stats_update_begin(&tstats->syncp); - tstats->tx_bytes += pkt_len; + tstats->tx_bytes += err; tstats->tx_packets++; u64_stats_update_end(&tstats->syncp); + } else if (err < 0) { + err_stats->tx_errors++; + err_stats->tx_aborted_errors++; } else { - dev->stats.tx_errors++; - dev->stats.tx_aborted_errors++; + err_stats->tx_dropped++; } } #endif /* __NET_IP_TUNNELS_H */ diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 7fcf810..86ded0b 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -11,7 +11,7 @@ obj-y := route.o inetpeer.o protocol.o \ tcp_offload.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o \ - inet_fragment.o ping.o + inet_fragment.o ping.o ip_tunnel_core.o obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index e189db4..a06a2ed 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -491,19 +491,17 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *inner_iph; - struct iphdr *iph; struct flowi4 fl4; u8 tos, ttl; __be16 df; struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ unsigned int max_headroom; /* The extra header space needed */ __be32 dst; int mtu; + int err; inner_iph = (const struct iphdr *)skb_inner_network_header(skb); - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); dst = tnl_params->daddr; if (dst == 0) { /* NBMA tunnel */ @@ -571,14 +569,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, dev->stats.tx_carrier_errors++; goto tx_error; } - tdev = rt->dst.dev; - - if (tdev == dev) { + if (rt->dst.dev == dev) { ip_rt_put(rt); dev->stats.collisions++; goto tx_error; } - df = tnl_params->frag_off; if (df) @@ -596,6 +591,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, if (!skb_is_gso(skb) && (inner_iph->frag_off&htons(IP_DF)) && mtu < ntohs(inner_iph->tot_len)) { + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); ip_rt_put(rt); goto tx_error; @@ -646,8 +642,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ttl = ip4_dst_hoplimit(&rt->dst); } - max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr) - + rt->dst.header_len; + max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) + + rt->dst.header_len; if (max_headroom > dev->needed_headroom) { dev->needed_headroom = max_headroom; if (skb_cow_head(skb, dev->needed_headroom)) { @@ -657,27 +653,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } } - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - - /* Push down and install the IP header. */ - skb_push(skb, sizeof(struct iphdr)); - skb_reset_network_header(skb); - - iph = ip_hdr(skb); - inner_iph = (const struct iphdr *)skb_inner_network_header(skb); + err = iptunnel_xmit(dev_net(dev), rt, skb, + fl4.saddr, fl4.daddr, protocol, + ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df); + iptunnel_xmit_stats(err, &dev->stats, dev->tstats); - iph->version = 4; - iph->ihl = sizeof(struct iphdr) >> 2; - iph->frag_off = df; - iph->protocol = protocol; - iph->tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); - iph->daddr = fl4.daddr; - iph->saddr = fl4.saddr; - iph->ttl = ttl; - tunnel_ip_select_ident(skb, inner_iph, &rt->dst); - - iptunnel_xmit(skb, dev); return; #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c new file mode 100644 index 0000000..927687e --- /dev/null +++ b/net/ipv4/ip_tunnel_core.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int iptunnel_xmit(struct net *net, struct rtable *rt, + struct sk_buff *skb, + __be32 src, __be32 dst, __u8 proto, + __u8 tos, __u8 ttl, __be16 df) +{ + int pkt_len = skb->len; + struct iphdr *iph; + int err; + + nf_reset(skb); + secpath_reset(skb); + skb->rxhash = 0; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + + /* Push down and install the IP header. */ + __skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + + iph = ip_hdr(skb); + + iph->version = 4; + iph->ihl = sizeof(struct iphdr) >> 2; + iph->frag_off = df; + iph->protocol = proto; + iph->tos = tos; + iph->daddr = dst; + iph->saddr = src; + iph->ttl = ttl; + tunnel_ip_select_ident(skb, + (const struct iphdr *)skb_inner_network_header(skb), + &rt->dst); + + err = ip_local_out(skb); + if (unlikely(net_xmit_eval(err))) + pkt_len = 0; + return pkt_len; +} +EXPORT_SYMBOL_GPL(iptunnel_xmit); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 6b9c1f12..76bb8de 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -723,13 +723,14 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, __be16 df = tiph->frag_off; struct rtable *rt; /* Route to the other host */ struct net_device *tdev; /* Device to other host */ - struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ __be32 dst = tiph->daddr; struct flowi4 fl4; int mtu; const struct in6_addr *addr6; int addr_type; + u8 ttl; + int err; if (skb->protocol != htons(ETH_P_IPV6)) goto tx_error; @@ -872,34 +873,14 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, skb = new_skb; iph6 = ipv6_hdr(skb); } - - skb->transport_header = skb->network_header; - skb_push(skb, sizeof(struct iphdr)); - skb_reset_network_header(skb); - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags = 0; - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - - /* - * Push down and install the IPIP header. - */ - - iph = ip_hdr(skb); - iph->version = 4; - iph->ihl = sizeof(struct iphdr)>>2; - iph->frag_off = df; - iph->protocol = IPPROTO_IPV6; - iph->tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); - iph->daddr = fl4.daddr; - iph->saddr = fl4.saddr; - - if ((iph->ttl = tiph->ttl) == 0) - iph->ttl = iph6->hop_limit; - - skb->ip_summed = CHECKSUM_NONE; - ip_select_ident(iph, skb_dst(skb), NULL); - iptunnel_xmit(skb, dev); + ttl = tiph->ttl; + if (ttl == 0) + ttl = iph6->hop_limit; + tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); + + err = iptunnel_xmit(dev_net(dev), rt, skb, fl4.saddr, fl4.daddr, + IPPROTO_IPV6, tos, ttl, df); + iptunnel_xmit_stats(err, &dev->stats, dev->tstats); return NETDEV_TX_OK; tx_error_icmp: -- cgit v0.10.2 From 3d7b46cd20e300bd6989fb1f43d46f1b9645816e Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 17 Jun 2013 17:50:02 -0700 Subject: ip_tunnel: push generic protocol handling to ip_tunnel module. Process skb tunnel header before sending packet to protocol handler. this allows code sharing between gre and ovs gre modules. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index b84f1ab..32e130b 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -155,6 +155,7 @@ static inline void tunnel_ip_select_ident(struct sk_buff *skb, (skb_shinfo(skb)->gso_segs ?: 1) - 1); } +int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto); int iptunnel_xmit(struct net *net, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c index 5ecc9c4..ba4803e 100644 --- a/net/ipv4/gre.c +++ b/net/ipv4/gre.c @@ -201,7 +201,8 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, return -EINVAL; } } - return 0; + + return iptunnel_pull_header(skb, hdr_len, tpi->proto); } static int gre_cisco_rcv(struct sk_buff *skb) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index a06a2ed..bd227e5 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -408,13 +408,6 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct iphdr *iph = ip_hdr(skb); int err; - secpath_reset(skb); - - skb->protocol = tpi->proto; - - skb->mac_header = skb->network_header; - __pskb_pull(skb, tunnel->hlen); - skb_postpull_rcsum(skb, skb_transport_header(skb), tunnel->hlen); #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { /* Looped back packet, drop it! */ @@ -442,23 +435,6 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, tunnel->i_seqno = ntohl(tpi->seq) + 1; } - /* Warning: All skb pointers will be invalidated! */ - if (tunnel->dev->type == ARPHRD_ETHER) { - if (!pskb_may_pull(skb, ETH_HLEN)) { - tunnel->dev->stats.rx_length_errors++; - tunnel->dev->stats.rx_errors++; - goto drop; - } - - iph = ip_hdr(skb); - skb->protocol = eth_type_trans(skb, tunnel->dev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - } - - skb->pkt_type = PACKET_HOST; - __skb_tunnel_rx(skb, tunnel->dev); - - skb_reset_network_header(skb); err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { if (log_ecn_error) @@ -477,6 +453,12 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, tstats->rx_bytes += skb->len; u64_stats_update_end(&tstats->syncp); + if (tunnel->dev->type == ARPHRD_ETHER) { + skb->protocol = eth_type_trans(skb, tunnel->dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + } else { + skb->dev = tunnel->dev; + } gro_cells_receive(&tunnel->gro_cells, skb); return 0; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 927687e..7167b08 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -86,3 +86,37 @@ int iptunnel_xmit(struct net *net, struct rtable *rt, return pkt_len; } EXPORT_SYMBOL_GPL(iptunnel_xmit); + +int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) +{ + if (unlikely(!pskb_may_pull(skb, hdr_len))) + return -ENOMEM; + + skb_pull_rcsum(skb, hdr_len); + + if (inner_proto == htons(ETH_P_TEB)) { + struct ethhdr *eh = (struct ethhdr *)skb->data; + + if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) + return -ENOMEM; + + if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN)) + skb->protocol = eh->h_proto; + else + skb->protocol = htons(ETH_P_802_2); + + } else { + skb->protocol = inner_proto; + } + + nf_reset(skb); + secpath_reset(skb); + if (!skb->l4_rxhash) + skb->rxhash = 0; + skb_dst_drop(skb); + skb->vlan_tci = 0; + skb_set_queue_mapping(skb, 0); + skb->pkt_type = PACKET_HOST; + return 0; +} +EXPORT_SYMBOL_GPL(iptunnel_pull_header); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 9df7ecd..e6905fb 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -188,8 +188,12 @@ static int ipip_rcv(struct sk_buff *skb) struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); struct ip_tunnel *tunnel; - const struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph; + if (iptunnel_pull_header(skb, 0, tpi.proto)) + goto drop; + + iph = ip_hdr(skb); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, iph->saddr, iph->daddr, 0); if (tunnel) { diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 76bb8de..6cee844 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -640,9 +640,14 @@ static const struct tnl_ptk_info tpi = { static int ipip_rcv(struct sk_buff *skb) { - const struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph; struct ip_tunnel *tunnel; + if (iptunnel_pull_header(skb, 0, tpi.proto)) + goto drop; + + iph = ip_hdr(skb); + tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->saddr, iph->daddr); if (tunnel != NULL) { -- cgit v0.10.2 From 9a628224a61bbcd2b50b3ec96e661fbbb49b619a Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 17 Jun 2013 17:50:07 -0700 Subject: ip_tunnel: Add dont fragment flag. This flag will be used by ovs tunneling. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 32e130b..10bbb42 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -73,6 +73,7 @@ struct ip_tunnel { #define TUNNEL_REC __cpu_to_be16(0x20) #define TUNNEL_VERSION __cpu_to_be16(0x40) #define TUNNEL_NO_KEY __cpu_to_be16(0x80) +#define TUNNEL_DONT_FRAGMENT __cpu_to_be16(0x0100) struct tnl_ptk_info { __be16 flags; -- cgit v0.10.2 From 74f84a5726c7d08c27745305e67474b8645c541d Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 17 Jun 2013 17:50:12 -0700 Subject: openvswitch: Copy individual actions. Rather than validating actions and then copying all actiaons in one block, following patch does same operation in single pass. This validate and copy action one by one. This is required for ovs tunneling patch. This patch does not change any functionality. Signed-off-by: Pravin B Shelar Acked-by: Jesse Gross Signed-off-by: David S. Miller diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 0f783d9..f14816b 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -464,16 +464,89 @@ static int flush_flows(struct datapath *dp) return 0; } -static int validate_actions(const struct nlattr *attr, - const struct sw_flow_key *key, int depth); +static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, int attr_len) +{ + + struct sw_flow_actions *acts; + int new_acts_size; + int req_size = NLA_ALIGN(attr_len); + int next_offset = offsetof(struct sw_flow_actions, actions) + + (*sfa)->actions_len; + + if (req_size <= (ksize(*sfa) - next_offset)) + goto out; + + new_acts_size = ksize(*sfa) * 2; + + if (new_acts_size > MAX_ACTIONS_BUFSIZE) { + if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) + return ERR_PTR(-EMSGSIZE); + new_acts_size = MAX_ACTIONS_BUFSIZE; + } + + acts = ovs_flow_actions_alloc(new_acts_size); + if (IS_ERR(acts)) + return (void *)acts; + + memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len); + acts->actions_len = (*sfa)->actions_len; + kfree(*sfa); + *sfa = acts; + +out: + (*sfa)->actions_len += req_size; + return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset); +} + +static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len) +{ + struct nlattr *a; + + a = reserve_sfa_size(sfa, nla_attr_size(len)); + if (IS_ERR(a)) + return PTR_ERR(a); + + a->nla_type = attrtype; + a->nla_len = nla_attr_size(len); + + if (data) + memcpy(nla_data(a), data, len); + memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len)); + + return 0; +} + +static inline int add_nested_action_start(struct sw_flow_actions **sfa, int attrtype) +{ + int used = (*sfa)->actions_len; + int err; + + err = add_action(sfa, attrtype, NULL, 0); + if (err) + return err; + + return used; +} -static int validate_sample(const struct nlattr *attr, - const struct sw_flow_key *key, int depth) +static inline void add_nested_action_end(struct sw_flow_actions *sfa, int st_offset) +{ + struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions + st_offset); + + a->nla_len = sfa->actions_len - st_offset; +} + +static int validate_and_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, int depth, + struct sw_flow_actions **sfa); + +static int validate_and_copy_sample(const struct nlattr *attr, + const struct sw_flow_key *key, int depth, + struct sw_flow_actions **sfa) { const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; const struct nlattr *probability, *actions; const struct nlattr *a; - int rem; + int rem, start, err, st_acts; memset(attrs, 0, sizeof(attrs)); nla_for_each_nested(a, attr, rem) { @@ -492,7 +565,26 @@ static int validate_sample(const struct nlattr *attr, actions = attrs[OVS_SAMPLE_ATTR_ACTIONS]; if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) return -EINVAL; - return validate_actions(actions, key, depth + 1); + + /* validation done, copy sample action. */ + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE); + if (start < 0) + return start; + err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, nla_data(probability), sizeof(u32)); + if (err) + return err; + st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS); + if (st_acts < 0) + return st_acts; + + err = validate_and_copy_actions(actions, key, depth + 1, sfa); + if (err) + return err; + + add_nested_action_end(*sfa, st_acts); + add_nested_action_end(*sfa, start); + + return 0; } static int validate_tp_port(const struct sw_flow_key *flow_key) @@ -606,8 +698,24 @@ static int validate_userspace(const struct nlattr *attr) return 0; } -static int validate_actions(const struct nlattr *attr, - const struct sw_flow_key *key, int depth) +static int copy_action(const struct nlattr *from, + struct sw_flow_actions **sfa) +{ + int totlen = NLA_ALIGN(from->nla_len); + struct nlattr *to; + + to = reserve_sfa_size(sfa, from->nla_len); + if (IS_ERR(to)) + return PTR_ERR(to); + + memcpy(to, from, totlen); + return 0; +} + +static int validate_and_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, + int depth, + struct sw_flow_actions **sfa) { const struct nlattr *a; int rem, err; @@ -627,12 +735,14 @@ static int validate_actions(const struct nlattr *attr, }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); + bool skip_copy; if (type > OVS_ACTION_ATTR_MAX || (action_lens[type] != nla_len(a) && action_lens[type] != (u32)-1)) return -EINVAL; + skip_copy = false; switch (type) { case OVS_ACTION_ATTR_UNSPEC: return -EINVAL; @@ -667,14 +777,20 @@ static int validate_actions(const struct nlattr *attr, break; case OVS_ACTION_ATTR_SAMPLE: - err = validate_sample(a, key, depth); + err = validate_and_copy_sample(a, key, depth, sfa); if (err) return err; + skip_copy = true; break; default: return -EINVAL; } + if (!skip_copy) { + err = copy_action(a, sfa); + if (err) + return err; + } } if (rem > 0) @@ -742,18 +858,16 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) err = ovs_flow_metadata_from_nlattrs(flow, a[OVS_PACKET_ATTR_KEY]); if (err) goto err_flow_free; - - err = validate_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0); - if (err) - goto err_flow_free; - flow->hash = ovs_flow_hash(&flow->key, key_len); - - acts = ovs_flow_actions_alloc(a[OVS_PACKET_ATTR_ACTIONS]); + acts = ovs_flow_actions_alloc(nla_len(a[OVS_PACKET_ATTR_ACTIONS])); err = PTR_ERR(acts); if (IS_ERR(acts)) goto err_flow_free; + + err = validate_and_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0, &acts); rcu_assign_pointer(flow->sf_acts, acts); + if (err) + goto err_flow_free; OVS_CB(packet)->flow = flow; packet->priority = flow->key.phy.priority; @@ -843,6 +957,66 @@ static struct genl_multicast_group ovs_dp_flow_multicast_group = { .name = OVS_FLOW_MCGROUP }; +static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *skb); +static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) +{ + const struct nlattr *a; + struct nlattr *start; + int err = 0, rem; + + start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE); + if (!start) + return -EMSGSIZE; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + struct nlattr *st_sample; + + switch (type) { + case OVS_SAMPLE_ATTR_PROBABILITY: + if (nla_put(skb, OVS_SAMPLE_ATTR_PROBABILITY, sizeof(u32), nla_data(a))) + return -EMSGSIZE; + break; + case OVS_SAMPLE_ATTR_ACTIONS: + st_sample = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS); + if (!st_sample) + return -EMSGSIZE; + err = actions_to_attr(nla_data(a), nla_len(a), skb); + if (err) + return err; + nla_nest_end(skb, st_sample); + break; + } + } + + nla_nest_end(skb, start); + return err; +} + +static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *skb) +{ + const struct nlattr *a; + int rem, err; + + nla_for_each_attr(a, attr, len, rem) { + int type = nla_type(a); + + switch (type) { + case OVS_ACTION_ATTR_SAMPLE: + err = sample_action_to_attr(a, skb); + if (err) + return err; + break; + default: + if (nla_put(skb, type, nla_len(a), nla_data(a))) + return -EMSGSIZE; + break; + } + } + + return 0; +} + static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts) { return NLMSG_ALIGN(sizeof(struct ovs_header)) @@ -860,6 +1034,7 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, { const int skb_orig_len = skb->len; const struct sw_flow_actions *sf_acts; + struct nlattr *start; struct ovs_flow_stats stats; struct ovs_header *ovs_header; struct nlattr *nla; @@ -913,10 +1088,19 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, * This can only fail for dump operations because the skb is always * properly sized for single flows. */ - err = nla_put(skb, OVS_FLOW_ATTR_ACTIONS, sf_acts->actions_len, - sf_acts->actions); - if (err < 0 && skb_orig_len) - goto error; + start = nla_nest_start(skb, OVS_FLOW_ATTR_ACTIONS); + if (start) { + err = actions_to_attr(sf_acts->actions, sf_acts->actions_len, skb); + if (!err) + nla_nest_end(skb, start); + else { + if (skb_orig_len) + goto error; + + nla_nest_cancel(skb, start); + } + } else if (skb_orig_len) + goto nla_put_failure; return genlmsg_end(skb, ovs_header); @@ -961,6 +1145,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) struct sk_buff *reply; struct datapath *dp; struct flow_table *table; + struct sw_flow_actions *acts = NULL; int error; int key_len; @@ -974,9 +1159,14 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) /* Validate actions. */ if (a[OVS_FLOW_ATTR_ACTIONS]) { - error = validate_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, 0); - if (error) + acts = ovs_flow_actions_alloc(nla_len(a[OVS_FLOW_ATTR_ACTIONS])); + error = PTR_ERR(acts); + if (IS_ERR(acts)) goto error; + + error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, 0, &acts); + if (error) + goto err_kfree; } else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) { error = -EINVAL; goto error; @@ -991,8 +1181,6 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) table = ovsl_dereference(dp->table); flow = ovs_flow_tbl_lookup(table, &key, key_len); if (!flow) { - struct sw_flow_actions *acts; - /* Bail out if we're not allowed to create a new flow. */ error = -ENOENT; if (info->genlhdr->cmd == OVS_FLOW_CMD_SET) @@ -1019,11 +1207,6 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) flow->key = key; clear_stats(flow); - /* Obtain actions. */ - acts = ovs_flow_actions_alloc(a[OVS_FLOW_ATTR_ACTIONS]); - error = PTR_ERR(acts); - if (IS_ERR(acts)) - goto error_free_flow; rcu_assign_pointer(flow->sf_acts, acts); /* Put flow in bucket. */ @@ -1036,7 +1219,6 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) } else { /* We found a matching flow. */ struct sw_flow_actions *old_acts; - struct nlattr *acts_attrs; /* Bail out if we're not allowed to modify an existing flow. * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL @@ -1051,21 +1233,8 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) /* Update actions. */ old_acts = ovsl_dereference(flow->sf_acts); - acts_attrs = a[OVS_FLOW_ATTR_ACTIONS]; - if (acts_attrs && - (old_acts->actions_len != nla_len(acts_attrs) || - memcmp(old_acts->actions, nla_data(acts_attrs), - old_acts->actions_len))) { - struct sw_flow_actions *new_acts; - - new_acts = ovs_flow_actions_alloc(acts_attrs); - error = PTR_ERR(new_acts); - if (IS_ERR(new_acts)) - goto err_unlock_ovs; - - rcu_assign_pointer(flow->sf_acts, new_acts); - ovs_flow_deferred_free_acts(old_acts); - } + rcu_assign_pointer(flow->sf_acts, acts); + ovs_flow_deferred_free_acts(old_acts); reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, info->snd_seq, OVS_FLOW_CMD_NEW); @@ -1086,10 +1255,10 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) ovs_dp_flow_multicast_group.id, PTR_ERR(reply)); return 0; -error_free_flow: - ovs_flow_free(flow); err_unlock_ovs: ovs_unlock(); +err_kfree: + kfree(acts); error: return error; } @@ -1866,8 +2035,8 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) goto exit_unlock; } - reply = ovs_vport_cmd_build_info(vport, info->snd_portid, info->snd_seq, - OVS_VPORT_CMD_DEL); + reply = ovs_vport_cmd_build_info(vport, info->snd_portid, + info->snd_seq, OVS_VPORT_CMD_DEL); err = PTR_ERR(reply); if (IS_ERR(reply)) goto exit_unlock; @@ -1896,8 +2065,8 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(vport)) goto exit_unlock; - reply = ovs_vport_cmd_build_info(vport, info->snd_portid, info->snd_seq, - OVS_VPORT_CMD_NEW); + reply = ovs_vport_cmd_build_info(vport, info->snd_portid, + info->snd_seq, OVS_VPORT_CMD_NEW); err = PTR_ERR(reply); if (IS_ERR(reply)) goto exit_unlock; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 093c191..940d4b8 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -198,20 +198,18 @@ void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb) spin_unlock(&flow->lock); } -struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *actions) +struct sw_flow_actions *ovs_flow_actions_alloc(int size) { - int actions_len = nla_len(actions); struct sw_flow_actions *sfa; - if (actions_len > MAX_ACTIONS_BUFSIZE) + if (size > MAX_ACTIONS_BUFSIZE) return ERR_PTR(-EINVAL); - sfa = kmalloc(sizeof(*sfa) + actions_len, GFP_KERNEL); + sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL); if (!sfa) return ERR_PTR(-ENOMEM); - sfa->actions_len = actions_len; - nla_memcpy(sfa->actions, actions, actions_len); + sfa->actions_len = 0; return sfa; } diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 2a83e21..e370f62 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -130,7 +130,7 @@ struct sw_flow *ovs_flow_alloc(void); void ovs_flow_deferred_free(struct sw_flow *); void ovs_flow_free(struct sw_flow *flow); -struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *); +struct sw_flow_actions *ovs_flow_actions_alloc(int actions_len); void ovs_flow_deferred_free_acts(struct sw_flow_actions *); int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *, -- cgit v0.10.2 From 7d5437c709ded4f152cb8b305d17972d6707f20c Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 17 Jun 2013 17:50:18 -0700 Subject: openvswitch: Add tunneling interface. Add ovs tunnel interface for set tunnel action for userspace. Signed-off-by: Pravin B Shelar Acked-by: Jesse Gross Signed-off-by: David S. Miller diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 424672d..b15a445 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -246,11 +246,29 @@ enum ovs_key_attr { OVS_KEY_ATTR_ARP, /* struct ovs_key_arp */ OVS_KEY_ATTR_ND, /* struct ovs_key_nd */ OVS_KEY_ATTR_SKB_MARK, /* u32 skb mark */ + OVS_KEY_ATTR_TUNNEL, /* Nested set of ovs_tunnel attributes */ + +#ifdef __KERNEL__ + OVS_KEY_ATTR_IPV4_TUNNEL, /* struct ovs_key_ipv4_tunnel */ +#endif __OVS_KEY_ATTR_MAX }; #define OVS_KEY_ATTR_MAX (__OVS_KEY_ATTR_MAX - 1) +enum ovs_tunnel_key_attr { + OVS_TUNNEL_KEY_ATTR_ID, /* be64 Tunnel ID */ + OVS_TUNNEL_KEY_ATTR_IPV4_SRC, /* be32 src IP address. */ + OVS_TUNNEL_KEY_ATTR_IPV4_DST, /* be32 dst IP address. */ + OVS_TUNNEL_KEY_ATTR_TOS, /* u8 Tunnel IP ToS. */ + OVS_TUNNEL_KEY_ATTR_TTL, /* u8 Tunnel IP TTL. */ + OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT, /* No argument, set DF. */ + OVS_TUNNEL_KEY_ATTR_CSUM, /* No argument. CSUM packet. */ + __OVS_TUNNEL_KEY_ATTR_MAX +}; + +#define OVS_TUNNEL_KEY_ATTR_MAX (__OVS_TUNNEL_KEY_ATTR_MAX - 1) + /** * enum ovs_frag_type - IPv4 and IPv6 fragment type * @OVS_FRAG_TYPE_NONE: Packet is not a fragment. diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 596d637..22c5f39 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -436,6 +436,10 @@ static int execute_set_action(struct sk_buff *skb, skb->mark = nla_get_u32(nested_attr); break; + case OVS_KEY_ATTR_IPV4_TUNNEL: + OVS_CB(skb)->tun_key = nla_data(nested_attr); + break; + case OVS_KEY_ATTR_ETHERNET: err = set_eth_addr(skb, nla_data(nested_attr)); break; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index f14816b..bbd3106 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -362,6 +362,14 @@ static int queue_gso_packets(struct net *net, int dp_ifindex, static size_t key_attr_size(void) { return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ + + nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */ + + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */ + + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */ + + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */ + + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ @@ -600,8 +608,30 @@ static int validate_tp_port(const struct sw_flow_key *flow_key) return -EINVAL; } +static int validate_and_copy_set_tun(const struct nlattr *attr, + struct sw_flow_actions **sfa) +{ + struct ovs_key_ipv4_tunnel tun_key; + int err, start; + + err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &tun_key); + if (err) + return err; + + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET); + if (start < 0) + return start; + + err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &tun_key, sizeof(tun_key)); + add_nested_action_end(*sfa, start); + + return err; +} + static int validate_set(const struct nlattr *a, - const struct sw_flow_key *flow_key) + const struct sw_flow_key *flow_key, + struct sw_flow_actions **sfa, + bool *set_tun) { const struct nlattr *ovs_key = nla_data(a); int key_type = nla_type(ovs_key); @@ -611,18 +641,27 @@ static int validate_set(const struct nlattr *a, return -EINVAL; if (key_type > OVS_KEY_ATTR_MAX || - nla_len(ovs_key) != ovs_key_lens[key_type]) + (ovs_key_lens[key_type] != nla_len(ovs_key) && + ovs_key_lens[key_type] != -1)) return -EINVAL; switch (key_type) { const struct ovs_key_ipv4 *ipv4_key; const struct ovs_key_ipv6 *ipv6_key; + int err; case OVS_KEY_ATTR_PRIORITY: case OVS_KEY_ATTR_SKB_MARK: case OVS_KEY_ATTR_ETHERNET: break; + case OVS_KEY_ATTR_TUNNEL: + *set_tun = true; + err = validate_and_copy_set_tun(a, sfa); + if (err) + return err; + break; + case OVS_KEY_ATTR_IPV4: if (flow_key->eth.type != htons(ETH_P_IP)) return -EINVAL; @@ -771,7 +810,7 @@ static int validate_and_copy_actions(const struct nlattr *attr, break; case OVS_ACTION_ATTR_SET: - err = validate_set(a, key); + err = validate_set(a, key, sfa, &skip_copy); if (err) return err; break; @@ -993,6 +1032,33 @@ static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) return err; } +static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) +{ + const struct nlattr *ovs_key = nla_data(a); + int key_type = nla_type(ovs_key); + struct nlattr *start; + int err; + + switch (key_type) { + case OVS_KEY_ATTR_IPV4_TUNNEL: + start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); + if (!start) + return -EMSGSIZE; + + err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key)); + if (err) + return err; + nla_nest_end(skb, start); + break; + default: + if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key)) + return -EMSGSIZE; + break; + } + + return 0; +} + static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *skb) { const struct nlattr *a; @@ -1002,6 +1068,12 @@ static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *s int type = nla_type(a); switch (type) { + case OVS_ACTION_ATTR_SET: + err = set_action_to_attr(a, skb); + if (err) + return err; + break; + case OVS_ACTION_ATTR_SAMPLE: err = sample_action_to_attr(a, skb); if (err) diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 16b8406..e88ebc2 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -88,9 +88,12 @@ struct datapath { /** * struct ovs_skb_cb - OVS data in skb CB * @flow: The flow associated with this packet. May be %NULL if no flow. + * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the + * packet is not being tunneled. */ struct ovs_skb_cb { struct sw_flow *flow; + struct ovs_key_ipv4_tunnel *tun_key; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 940d4b8..976a8b7 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -603,6 +604,8 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, memset(key, 0, sizeof(*key)); key->phy.priority = skb->priority; + if (OVS_CB(skb)->tun_key) + memcpy(&key->tun_key, OVS_CB(skb)->tun_key, sizeof(key->tun_key)); key->phy.in_port = in_port; key->phy.skb_mark = skb->mark; @@ -818,6 +821,7 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6), [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp), [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd), + [OVS_KEY_ATTR_TUNNEL] = -1, }; static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len, @@ -955,6 +959,105 @@ static int parse_flow_nlattrs(const struct nlattr *attr, return 0; } +int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr, + struct ovs_key_ipv4_tunnel *tun_key) +{ + struct nlattr *a; + int rem; + bool ttl = false; + + memset(tun_key, 0, sizeof(*tun_key)); + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + static const u32 ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { + [OVS_TUNNEL_KEY_ATTR_ID] = sizeof(u64), + [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = sizeof(u32), + [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = sizeof(u32), + [OVS_TUNNEL_KEY_ATTR_TOS] = 1, + [OVS_TUNNEL_KEY_ATTR_TTL] = 1, + [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0, + [OVS_TUNNEL_KEY_ATTR_CSUM] = 0, + }; + + if (type > OVS_TUNNEL_KEY_ATTR_MAX || + ovs_tunnel_key_lens[type] != nla_len(a)) + return -EINVAL; + + switch (type) { + case OVS_TUNNEL_KEY_ATTR_ID: + tun_key->tun_id = nla_get_be64(a); + tun_key->tun_flags |= TUNNEL_KEY; + break; + case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: + tun_key->ipv4_src = nla_get_be32(a); + break; + case OVS_TUNNEL_KEY_ATTR_IPV4_DST: + tun_key->ipv4_dst = nla_get_be32(a); + break; + case OVS_TUNNEL_KEY_ATTR_TOS: + tun_key->ipv4_tos = nla_get_u8(a); + break; + case OVS_TUNNEL_KEY_ATTR_TTL: + tun_key->ipv4_ttl = nla_get_u8(a); + ttl = true; + break; + case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT: + tun_key->tun_flags |= TUNNEL_DONT_FRAGMENT; + break; + case OVS_TUNNEL_KEY_ATTR_CSUM: + tun_key->tun_flags |= TUNNEL_CSUM; + break; + default: + return -EINVAL; + + } + } + if (rem > 0) + return -EINVAL; + + if (!tun_key->ipv4_dst) + return -EINVAL; + + if (!ttl) + return -EINVAL; + + return 0; +} + +int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb, + const struct ovs_key_ipv4_tunnel *tun_key) +{ + struct nlattr *nla; + + nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL); + if (!nla) + return -EMSGSIZE; + + if (tun_key->tun_flags & TUNNEL_KEY && + nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, tun_key->tun_id)) + return -EMSGSIZE; + if (tun_key->ipv4_src && + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, tun_key->ipv4_src)) + return -EMSGSIZE; + if (nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, tun_key->ipv4_dst)) + return -EMSGSIZE; + if (tun_key->ipv4_tos && + nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, tun_key->ipv4_tos)) + return -EMSGSIZE; + if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, tun_key->ipv4_ttl)) + return -EMSGSIZE; + if ((tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) && + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) + return -EMSGSIZE; + if ((tun_key->tun_flags & TUNNEL_CSUM) && + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) + return -EMSGSIZE; + + nla_nest_end(skb, nla); + return 0; +} + /** * ovs_flow_from_nlattrs - parses Netlink attributes into a flow key. * @swkey: receives the extracted flow key. @@ -997,6 +1100,14 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); } + if (attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { + err = ovs_ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], &swkey->tun_key); + if (err) + return err; + + attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); + } + /* Data attributes. */ if (!(attrs & (1 << OVS_KEY_ATTR_ETHERNET))) return -EINVAL; @@ -1135,17 +1246,21 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, const struct nlattr *attr) { + struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key; const struct nlattr *nla; int rem; flow->key.phy.in_port = DP_MAX_PORTS; flow->key.phy.priority = 0; flow->key.phy.skb_mark = 0; + memset(tun_key, 0, sizeof(flow->key.tun_key)); nla_for_each_nested(nla, attr, rem) { int type = nla_type(nla); if (type <= OVS_KEY_ATTR_MAX && ovs_key_lens[type] > 0) { + int err; + if (nla_len(nla) != ovs_key_lens[type]) return -EINVAL; @@ -1154,6 +1269,12 @@ int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, flow->key.phy.priority = nla_get_u32(nla); break; + case OVS_KEY_ATTR_TUNNEL: + err = ovs_ipv4_tun_from_nlattr(nla, tun_key); + if (err) + return err; + break; + case OVS_KEY_ATTR_IN_PORT: if (nla_get_u32(nla) >= DP_MAX_PORTS) return -EINVAL; @@ -1180,6 +1301,10 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority)) goto nla_put_failure; + if (swkey->tun_key.ipv4_dst && + ovs_ipv4_tun_to_nlattr(skb, &swkey->tun_key)) + goto nla_put_failure; + if (swkey->phy.in_port != DP_MAX_PORTS && nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port)) goto nla_put_failure; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index e370f62..aec5e43 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -40,7 +40,22 @@ struct sw_flow_actions { struct nlattr actions[]; }; +/* Used to memset ovs_key_ipv4_tunnel padding. */ +#define OVS_TUNNEL_KEY_SIZE \ + (offsetof(struct ovs_key_ipv4_tunnel, ipv4_ttl) + \ + FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, ipv4_ttl)) + +struct ovs_key_ipv4_tunnel { + __be64 tun_id; + __be32 ipv4_src; + __be32 ipv4_dst; + u16 tun_flags; + u8 ipv4_tos; + u8 ipv4_ttl; +}; + struct sw_flow_key { + struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */ struct { u32 priority; /* Packet QoS priority. */ u32 skb_mark; /* SKB mark. */ @@ -179,5 +194,9 @@ u32 ovs_flow_hash(const struct sw_flow_key *key, int key_len); struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *idx); extern const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1]; +int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr, + struct ovs_key_ipv4_tunnel *tun_key); +int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb, + const struct ovs_key_ipv4_tunnel *tun_key); #endif /* flow.h */ diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index e284c7e..98d3edb 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -67,7 +67,7 @@ static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netde static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) { rcu_read_lock(); - ovs_vport_receive(internal_dev_priv(netdev)->vport, skb); + ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); rcu_read_unlock(); return 0; } diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 40de815..5982f3f 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -51,7 +51,7 @@ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) skb_push(skb, ETH_HLEN); ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); - ovs_vport_receive(vport, skb); + ovs_vport_receive(vport, skb, NULL); return; error: diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 176d449..413287a 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -325,7 +325,8 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) * Must be called with rcu_read_lock. The packet cannot be shared and * skb->data should point to the Ethernet header. */ -void ovs_vport_receive(struct vport *vport, struct sk_buff *skb) +void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, + struct ovs_key_ipv4_tunnel *tun_key) { struct pcpu_tstats *stats; @@ -335,6 +336,7 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb) stats->rx_bytes += skb->len; u64_stats_update_end(&stats->syncp); + OVS_CB(skb)->tun_key = tun_key; ovs_dp_process_received_packet(vport, skb); } diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 293278c..2d961ae 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -184,7 +184,8 @@ static inline struct vport *vport_from_priv(const void *priv) return (struct vport *)(priv - ALIGN(sizeof(struct vport), VPORT_ALIGN)); } -void ovs_vport_receive(struct vport *, struct sk_buff *); +void ovs_vport_receive(struct vport *, struct sk_buff *, + struct ovs_key_ipv4_tunnel *); void ovs_vport_record_error(struct vport *, enum vport_err_type err_type); /* List of statically compiled vport implementations. Don't forget to also -- cgit v0.10.2 From ffe3f4321745e743dd179ec2b12180c01ba0d3aa Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 17 Jun 2013 17:50:23 -0700 Subject: openvswitch: Expand action buffer size. MAX_ACTIONS_BUFSIZE limits action list size, set tunnel action needs extra space on action list, for now increase max actions list limit. Signed-off-by: Pravin B Shelar Acked-by: Jesse Gross Signed-off-by: David S. Miller diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index aec5e43..bfe80b9 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -159,7 +159,7 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, const struct nlattr *attr); -#define MAX_ACTIONS_BUFSIZE (16 * 1024) +#define MAX_ACTIONS_BUFSIZE (32 * 1024) #define TBL_MIN_BUCKETS 1024 struct flow_table { -- cgit v0.10.2 From a3e82996a8874c4cfe8c7f1be4d552018d8cba7e Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 17 Jun 2013 17:50:28 -0700 Subject: openvswitch: Optimize flow key match for non tunnel flows. Following patch adds start offset for sw_flow-key, so that we can skip tunneling information in key for non-tunnel flows. Signed-off-by: Pravin B Shelar Acked-by: Jesse Gross Signed-off-by: David S. Miller diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index bbd3106..f7e3a0d 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -894,10 +894,9 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (err) goto err_flow_free; - err = ovs_flow_metadata_from_nlattrs(flow, a[OVS_PACKET_ATTR_KEY]); + err = ovs_flow_metadata_from_nlattrs(flow, key_len, a[OVS_PACKET_ATTR_KEY]); if (err) goto err_flow_free; - flow->hash = ovs_flow_hash(&flow->key, key_len); acts = ovs_flow_actions_alloc(nla_len(a[OVS_PACKET_ATTR_ACTIONS])); err = PTR_ERR(acts); if (IS_ERR(acts)) @@ -1276,14 +1275,12 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) error = PTR_ERR(flow); goto err_unlock_ovs; } - flow->key = key; clear_stats(flow); rcu_assign_pointer(flow->sf_acts, acts); /* Put flow in bucket. */ - flow->hash = ovs_flow_hash(&key, key_len); - ovs_flow_tbl_insert(table, flow); + ovs_flow_tbl_insert(table, flow, &key, key_len); reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, info->snd_seq, diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 976a8b7..5c519b1 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -353,6 +353,14 @@ struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *la return NULL; } +static void __flow_tbl_insert(struct flow_table *table, struct sw_flow *flow) +{ + struct hlist_head *head; + head = find_bucket(table, flow->hash); + hlist_add_head_rcu(&flow->hash_node[table->node_ver], head); + table->count++; +} + static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new) { int old_ver; @@ -369,7 +377,7 @@ static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new head = flex_array_get(old->buckets, i); hlist_for_each_entry(flow, head, hash_node[old_ver]) - ovs_flow_tbl_insert(new, flow); + __flow_tbl_insert(new, flow); } old->keep_flows = true; } @@ -763,9 +771,18 @@ out: return error; } -u32 ovs_flow_hash(const struct sw_flow_key *key, int key_len) +static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start, int key_len) +{ + return jhash2((u32 *)((u8 *)key + key_start), + DIV_ROUND_UP(key_len - key_start, sizeof(u32)), 0); +} + +static int flow_key_start(struct sw_flow_key *key) { - return jhash2((u32 *)key, DIV_ROUND_UP(key_len, sizeof(u32)), 0); + if (key->tun_key.ipv4_dst) + return 0; + else + return offsetof(struct sw_flow_key, phy); } struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table, @@ -773,28 +790,31 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table, { struct sw_flow *flow; struct hlist_head *head; + u8 *_key; + int key_start; u32 hash; - hash = ovs_flow_hash(key, key_len); + key_start = flow_key_start(key); + hash = ovs_flow_hash(key, key_start, key_len); + _key = (u8 *) key + key_start; head = find_bucket(table, hash); hlist_for_each_entry_rcu(flow, head, hash_node[table->node_ver]) { if (flow->hash == hash && - !memcmp(&flow->key, key, key_len)) { + !memcmp((u8 *)&flow->key + key_start, _key, key_len - key_start)) { return flow; } } return NULL; } -void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow) +void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, + struct sw_flow_key *key, int key_len) { - struct hlist_head *head; - - head = find_bucket(table, flow->hash); - hlist_add_head_rcu(&flow->hash_node[table->node_ver], head); - table->count++; + flow->hash = ovs_flow_hash(key, flow_key_start(key), key_len); + memcpy(&flow->key, key, sizeof(flow->key)); + __flow_tbl_insert(table, flow); } void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) @@ -1235,6 +1255,7 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, /** * ovs_flow_metadata_from_nlattrs - parses Netlink attributes into a flow key. * @flow: Receives extracted in_port, priority, tun_key and skb_mark. + * @key_len: Length of key in @flow. Used for calculating flow hash. * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute * sequence. * @@ -1243,7 +1264,7 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, * get the metadata, that is, the parts of the flow key that cannot be * extracted from the packet itself. */ -int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, +int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, int key_len, const struct nlattr *attr) { struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key; @@ -1289,6 +1310,10 @@ int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, } if (rem) return -EINVAL; + + flow->hash = ovs_flow_hash(&flow->key, + flow_key_start(&flow->key), key_len); + return 0; } diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index bfe80b9..999842f 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -156,7 +156,7 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies); int ovs_flow_to_nlattrs(const struct sw_flow_key *, struct sk_buff *); int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, const struct nlattr *); -int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, +int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, int key_len, const struct nlattr *attr); #define MAX_ACTIONS_BUFSIZE (32 * 1024) @@ -188,9 +188,9 @@ void ovs_flow_tbl_deferred_destroy(struct flow_table *table); struct flow_table *ovs_flow_tbl_alloc(int new_size); struct flow_table *ovs_flow_tbl_expand(struct flow_table *table); struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table); -void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow); +void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, + struct sw_flow_key *key, int key_len); void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); -u32 ovs_flow_hash(const struct sw_flow_key *key, int key_len); struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *idx); extern const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1]; -- cgit v0.10.2 From aa310701e787087dbfbccf1409982a96e16c57a6 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 17 Jun 2013 17:50:33 -0700 Subject: openvswitch: Add gre tunnel support. Add gre vport implementation. Most of gre protocol processing is pushed to gre module. It make use of gre demultiplexer therefore it can co-exist with linux device based gre tunnels. Signed-off-by: Pravin B Shelar Acked-by: Jesse Gross Signed-off-by: David S. Miller diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index b15a445..c55efaa 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -164,6 +164,7 @@ enum ovs_vport_type { OVS_VPORT_TYPE_UNSPEC, OVS_VPORT_TYPE_NETDEV, /* network device */ OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */ + OVS_VPORT_TYPE_GRE, /* GRE tunnel. */ __OVS_VPORT_TYPE_MAX }; diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index d9ea33c..9fbc04a 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -19,6 +19,8 @@ config OPENVSWITCH which is able to accept configuration from a variety of sources and translate it into packet processing rules. + Open vSwitch GRE support depends on CONFIG_NET_IPGRE_DEMUX. + See http://openvswitch.org for more information and userspace utilities. diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 15e7384..01bddb2 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -10,5 +10,6 @@ openvswitch-y := \ dp_notify.o \ flow.o \ vport.o \ + vport-gre.o \ vport-internal_dev.o \ - vport-netdev.o \ + vport-netdev.o diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index e88ebc2..a914864 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -122,6 +122,7 @@ struct dp_upcall_info { struct ovs_net { struct list_head dps; struct work_struct dp_notify_work; + struct vport_net vport_net; }; extern int ovs_net_id; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 999842f..66ef722 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -49,11 +49,27 @@ struct ovs_key_ipv4_tunnel { __be64 tun_id; __be32 ipv4_src; __be32 ipv4_dst; - u16 tun_flags; + __be16 tun_flags; u8 ipv4_tos; u8 ipv4_ttl; }; +static inline void ovs_flow_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key, + const struct iphdr *iph, __be64 tun_id, + __be16 tun_flags) +{ + tun_key->tun_id = tun_id; + tun_key->ipv4_src = iph->saddr; + tun_key->ipv4_dst = iph->daddr; + tun_key->ipv4_tos = iph->tos; + tun_key->ipv4_ttl = iph->ttl; + tun_key->tun_flags = tun_flags; + + /* clear struct padding. */ + memset((unsigned char *) tun_key + OVS_TUNNEL_KEY_SIZE, 0, + sizeof(*tun_key) - OVS_TUNNEL_KEY_SIZE); +} + struct sw_flow_key { struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */ struct { diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c new file mode 100644 index 0000000..3a8d190 --- /dev/null +++ b/net/openvswitch/vport-gre.c @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifdef CONFIG_NET_IPGRE_DEMUX +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "vport.h" + +/* Returns the least-significant 32 bits of a __be64. */ +static __be32 be64_get_low32(__be64 x) +{ +#ifdef __BIG_ENDIAN + return (__force __be32)x; +#else + return (__force __be32)((__force u64)x >> 32); +#endif +} + +static __be16 filter_tnl_flags(__be16 flags) +{ + return flags & (TUNNEL_CSUM | TUNNEL_KEY); +} + +static struct sk_buff *__build_header(struct sk_buff *skb, + int tunnel_hlen) +{ + const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->tun_key; + struct tnl_ptk_info tpi; + + skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM)); + if (IS_ERR(skb)) + return NULL; + + tpi.flags = filter_tnl_flags(tun_key->tun_flags); + tpi.proto = htons(ETH_P_TEB); + tpi.key = be64_get_low32(tun_key->tun_id); + tpi.seq = 0; + gre_build_header(skb, &tpi, tunnel_hlen); + + return skb; +} + +static __be64 key_to_tunnel_id(__be32 key, __be32 seq) +{ +#ifdef __BIG_ENDIAN + return (__force __be64)((__force u64)seq << 32 | (__force u32)key); +#else + return (__force __be64)((__force u64)key << 32 | (__force u32)seq); +#endif +} + +/* Called with rcu_read_lock and BH disabled. */ +static int gre_rcv(struct sk_buff *skb, + const struct tnl_ptk_info *tpi) +{ + struct ovs_key_ipv4_tunnel tun_key; + struct ovs_net *ovs_net; + struct vport *vport; + __be64 key; + + ovs_net = net_generic(dev_net(skb->dev), ovs_net_id); + vport = rcu_dereference(ovs_net->vport_net.gre_vport); + if (unlikely(!vport)) + return PACKET_REJECT; + + key = key_to_tunnel_id(tpi->key, tpi->seq); + ovs_flow_tun_key_init(&tun_key, ip_hdr(skb), key, + filter_tnl_flags(tpi->flags)); + + ovs_vport_receive(vport, skb, &tun_key); + return PACKET_RCVD; +} + +static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) +{ + struct net *net = ovs_dp_get_net(vport->dp); + struct flowi4 fl; + struct rtable *rt; + int min_headroom; + int tunnel_hlen; + __be16 df; + int err; + + if (unlikely(!OVS_CB(skb)->tun_key)) { + err = -EINVAL; + goto error; + } + + /* Route lookup */ + memset(&fl, 0, sizeof(fl)); + fl.daddr = OVS_CB(skb)->tun_key->ipv4_dst; + fl.saddr = OVS_CB(skb)->tun_key->ipv4_src; + fl.flowi4_tos = RT_TOS(OVS_CB(skb)->tun_key->ipv4_tos); + fl.flowi4_mark = skb->mark; + fl.flowi4_proto = IPPROTO_GRE; + + rt = ip_route_output_key(net, &fl); + if (IS_ERR(rt)) + return PTR_ERR(rt); + + tunnel_hlen = ip_gre_calc_hlen(OVS_CB(skb)->tun_key->tun_flags); + + min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + + tunnel_hlen + sizeof(struct iphdr) + + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { + int head_delta = SKB_DATA_ALIGN(min_headroom - + skb_headroom(skb) + + 16); + err = pskb_expand_head(skb, max_t(int, head_delta, 0), + 0, GFP_ATOMIC); + if (unlikely(err)) + goto err_free_rt; + } + + if (vlan_tx_tag_present(skb)) { + if (unlikely(!__vlan_put_tag(skb, + skb->vlan_proto, + vlan_tx_tag_get(skb)))) { + err = -ENOMEM; + goto err_free_rt; + } + skb->vlan_tci = 0; + } + + /* Push Tunnel header. */ + skb = __build_header(skb, tunnel_hlen); + if (unlikely(!skb)) { + err = 0; + goto err_free_rt; + } + + df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? + htons(IP_DF) : 0; + + skb->local_df = 1; + + return iptunnel_xmit(net, rt, skb, fl.saddr, + OVS_CB(skb)->tun_key->ipv4_dst, IPPROTO_GRE, + OVS_CB(skb)->tun_key->ipv4_tos, + OVS_CB(skb)->tun_key->ipv4_ttl, df); +err_free_rt: + ip_rt_put(rt); +error: + return err; +} + +static struct gre_cisco_protocol gre_protocol = { + .handler = gre_rcv, + .priority = 1, +}; + +static int gre_ports; +static int gre_init(void) +{ + int err; + + gre_ports++; + if (gre_ports > 1) + return 0; + + err = gre_cisco_register(&gre_protocol); + if (err) + pr_warn("cannot register gre protocol handler\n"); + + return err; +} + +static void gre_exit(void) +{ + gre_ports--; + if (gre_ports > 0) + return; + + gre_cisco_unregister(&gre_protocol); +} + +static const char *gre_get_name(const struct vport *vport) +{ + return vport_priv(vport); +} + +static struct vport *gre_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct ovs_net *ovs_net; + struct vport *vport; + int err; + + err = gre_init(); + if (err) + return ERR_PTR(err); + + ovs_net = net_generic(net, ovs_net_id); + if (ovsl_dereference(ovs_net->vport_net.gre_vport)) { + vport = ERR_PTR(-EEXIST); + goto error; + } + + vport = ovs_vport_alloc(IFNAMSIZ, &ovs_gre_vport_ops, parms); + if (IS_ERR(vport)) + goto error; + + strncpy(vport_priv(vport), parms->name, IFNAMSIZ); + rcu_assign_pointer(ovs_net->vport_net.gre_vport, vport); + return vport; + +error: + gre_exit(); + return vport; +} + +static void gre_tnl_destroy(struct vport *vport) +{ + struct net *net = ovs_dp_get_net(vport->dp); + struct ovs_net *ovs_net; + + ovs_net = net_generic(net, ovs_net_id); + + rcu_assign_pointer(ovs_net->vport_net.gre_vport, NULL); + ovs_vport_deferred_free(vport); + gre_exit(); +} + +const struct vport_ops ovs_gre_vport_ops = { + .type = OVS_VPORT_TYPE_GRE, + .create = gre_create, + .destroy = gre_tnl_destroy, + .get_name = gre_get_name, + .send = gre_tnl_send, +}; +#endif diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 413287a..f52dfb9 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -38,6 +38,10 @@ static const struct vport_ops *vport_ops_list[] = { &ovs_netdev_vport_ops, &ovs_internal_vport_ops, + +#ifdef CONFIG_NET_IPGRE_DEMUX + &ovs_gre_vport_ops, +#endif }; /* Protected by RCU read lock for reading, ovs_mutex for writing. */ @@ -404,3 +408,18 @@ void ovs_vport_record_error(struct vport *vport, enum vport_err_type err_type) spin_unlock(&vport->stats_lock); } + +static void free_vport_rcu(struct rcu_head *rcu) +{ + struct vport *vport = container_of(rcu, struct vport, rcu); + + ovs_vport_free(vport); +} + +void ovs_vport_deferred_free(struct vport *vport) +{ + if (!vport) + return; + + call_rcu(&vport->rcu, free_vport_rcu); +} diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 2d961ae..376045c 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -34,6 +34,11 @@ struct vport_parms; /* The following definitions are for users of the vport subsytem: */ +/* The following definitions are for users of the vport subsytem: */ +struct vport_net { + struct vport __rcu *gre_vport; +}; + int ovs_vport_init(void); void ovs_vport_exit(void); @@ -152,6 +157,7 @@ enum vport_err_type { struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *, const struct vport_parms *); void ovs_vport_free(struct vport *); +void ovs_vport_deferred_free(struct vport *vport); #define VPORT_ALIGN 8 @@ -192,6 +198,7 @@ void ovs_vport_record_error(struct vport *, enum vport_err_type err_type); * add yours to the list at the top of vport.c. */ extern const struct vport_ops ovs_netdev_vport_ops; extern const struct vport_ops ovs_internal_vport_ops; +extern const struct vport_ops ovs_gre_vport_ops; static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) -- cgit v0.10.2