summaryrefslogtreecommitdiff
path: root/net/netfilter/ipvs/ip_vs_xmit.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-10-02 20:38:27 (GMT)
committerLinus Torvalds <torvalds@linux-foundation.org>2012-10-02 20:38:27 (GMT)
commitaecdc33e111b2c447b622e287c6003726daa1426 (patch)
tree3e7657eae4b785e1a1fb5dfb225dbae0b2f0cfc6 /net/netfilter/ipvs/ip_vs_xmit.c
parenta20acf99f75e49271381d65db097c9763060a1e8 (diff)
parenta3a6cab5ea10cca64d036851fe0d932448f2fe4f (diff)
downloadlinux-fsl-qoriq-aecdc33e111b2c447b622e287c6003726daa1426.tar.xz
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking changes from David Miller: 1) GRE now works over ipv6, from Dmitry Kozlov. 2) Make SCTP more network namespace aware, from Eric Biederman. 3) TEAM driver now works with non-ethernet devices, from Jiri Pirko. 4) Make openvswitch network namespace aware, from Pravin B Shelar. 5) IPV6 NAT implementation, from Patrick McHardy. 6) Server side support for TCP Fast Open, from Jerry Chu and others. 7) Packet BPF filter supports MOD and XOR, from Eric Dumazet and Daniel Borkmann. 8) Increate the loopback default MTU to 64K, from Eric Dumazet. 9) Use a per-task rather than per-socket page fragment allocator for outgoing networking traffic. This benefits processes that have very many mostly idle sockets, which is quite common. From Eric Dumazet. 10) Use up to 32K for page fragment allocations, with fallbacks to smaller sizes when higher order page allocations fail. Benefits are a) less segments for driver to process b) less calls to page allocator c) less waste of space. From Eric Dumazet. 11) Allow GRO to be used on GRE tunnels, from Eric Dumazet. 12) VXLAN device driver, one way to handle VLAN issues such as the limitation of 4096 VLAN IDs yet still have some level of isolation. From Stephen Hemminger. 13) As usual there is a large boatload of driver changes, with the scale perhaps tilted towards the wireless side this time around. Fix up various fairly trivial conflicts, mostly caused by the user namespace changes. * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1012 commits) hyperv: Add buffer for extended info after the RNDIS response message. hyperv: Report actual status in receive completion packet hyperv: Remove extra allocated space for recv_pkt_list elements hyperv: Fix page buffer handling in rndis_filter_send_request() hyperv: Fix the missing return value in rndis_filter_set_packet_filter() hyperv: Fix the max_xfer_size in RNDIS initialization vxlan: put UDP socket in correct namespace vxlan: Depend on CONFIG_INET sfc: Fix the reported priorities of different filter types sfc: Remove EFX_FILTER_FLAG_RX_OVERRIDE_IP sfc: Fix loopback self-test with separate_tx_channels=1 sfc: Fix MCDI structure field lookup sfc: Add parentheses around use of bitfield macro arguments sfc: Fix null function pointer in efx_sriov_channel_type vxlan: virtual extensible lan igmp: export symbol ip_mc_leave_group netlink: add attributes to fdb interface tg3: unconditionally select HWMON support when tg3 is enabled. Revert "net: ti cpsw ethernet: allow reading phy interface mode from DT" gre: fix sparse warning ...
Diffstat (limited to 'net/netfilter/ipvs/ip_vs_xmit.c')
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c111
1 files changed, 79 insertions, 32 deletions
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 65b616a..56f6d5d 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -49,6 +49,7 @@ enum {
IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to
* local
*/
+ IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to saddr */
};
/*
@@ -84,6 +85,58 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
return dst;
}
+static inline bool
+__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
+{
+ if (IP6CB(skb)->frag_max_size) {
+ /* frag_max_size tell us that, this packet have been
+ * defragmented by netfilter IPv6 conntrack module.
+ */
+ if (IP6CB(skb)->frag_max_size > mtu)
+ return true; /* largest fragment violate MTU */
+ }
+ else if (skb->len > mtu && !skb_is_gso(skb)) {
+ return true; /* Packet size violate MTU size */
+ }
+ return false;
+}
+
+/* Get route to daddr, update *saddr, optionally bind route to saddr */
+static struct rtable *do_output_route4(struct net *net, __be32 daddr,
+ u32 rtos, int rt_mode, __be32 *saddr)
+{
+ struct flowi4 fl4;
+ struct rtable *rt;
+ int loop = 0;
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = daddr;
+ fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0;
+ fl4.flowi4_tos = rtos;
+
+retry:
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt)) {
+ /* Invalid saddr ? */
+ if (PTR_ERR(rt) == -EINVAL && *saddr &&
+ rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
+ *saddr = 0;
+ flowi4_update_output(&fl4, 0, rtos, daddr, 0);
+ goto retry;
+ }
+ IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
+ return NULL;
+ } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
+ ip_rt_put(rt);
+ *saddr = fl4.saddr;
+ flowi4_update_output(&fl4, 0, rtos, daddr, fl4.saddr);
+ loop++;
+ goto retry;
+ }
+ *saddr = fl4.saddr;
+ return rt;
+}
+
/* Get route to destination or remote server */
static struct rtable *
__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
@@ -98,20 +151,13 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
spin_lock(&dest->dst_lock);
if (!(rt = (struct rtable *)
__ip_vs_dst_check(dest, rtos))) {
- struct flowi4 fl4;
-
- memset(&fl4, 0, sizeof(fl4));
- fl4.daddr = dest->addr.ip;
- fl4.flowi4_tos = rtos;
- rt = ip_route_output_key(net, &fl4);
- if (IS_ERR(rt)) {
+ rt = do_output_route4(net, dest->addr.ip, rtos,
+ rt_mode, &dest->dst_saddr.ip);
+ if (!rt) {
spin_unlock(&dest->dst_lock);
- IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
- &dest->addr.ip);
return NULL;
}
__ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
- dest->dst_saddr.ip = fl4.saddr;
IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d, "
"rtos=%X\n",
&dest->addr.ip, &dest->dst_saddr.ip,
@@ -122,19 +168,17 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
*ret_saddr = dest->dst_saddr.ip;
spin_unlock(&dest->dst_lock);
} else {
- struct flowi4 fl4;
+ __be32 saddr = htonl(INADDR_ANY);
- memset(&fl4, 0, sizeof(fl4));
- fl4.daddr = daddr;
- fl4.flowi4_tos = rtos;
- rt = ip_route_output_key(net, &fl4);
- if (IS_ERR(rt)) {
- IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
- &daddr);
+ /* For such unconfigured boxes avoid many route lookups
+ * for performance reasons because we do not remember saddr
+ */
+ rt_mode &= ~IP_VS_RT_MODE_CONNECT;
+ rt = do_output_route4(net, daddr, rtos, rt_mode, &saddr);
+ if (!rt)
return NULL;
- }
if (ret_saddr)
- *ret_saddr = fl4.saddr;
+ *ret_saddr = saddr;
}
local = rt->rt_flags & RTCF_LOCAL;
@@ -331,6 +375,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
old_dst = dest->dst_cache;
dest->dst_cache = NULL;
dst_release(old_dst);
+ dest->dst_saddr.ip = 0;
}
#define IP_VS_XMIT_TUNNEL(skb, cp) \
@@ -462,7 +507,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if (skb->len > mtu && !skb_is_gso(skb)) {
+ if (__mtu_check_toobig_v6(skb, mtu)) {
if (!skb->dev) {
struct net *net = dev_net(skb_dst(skb)->dev);
@@ -683,7 +728,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if (skb->len > mtu && !skb_is_gso(skb)) {
+ if (__mtu_check_toobig_v6(skb, mtu)) {
if (!skb->dev) {
struct net *net = dev_net(skb_dst(skb)->dev);
@@ -766,12 +811,13 @@ int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp)
{
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
struct rtable *rt; /* Route to the other host */
__be32 saddr; /* Source for tunnel */
struct net_device *tdev; /* Device to other host */
struct iphdr *old_iph = ip_hdr(skb);
u8 tos = old_iph->tos;
- __be16 df = old_iph->frag_off;
+ __be16 df;
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int mtu;
@@ -781,7 +827,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
RT_TOS(tos), IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL,
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_CONNECT,
&saddr)))
goto tx_error_icmp;
if (rt->rt_flags & RTCF_LOCAL) {
@@ -796,13 +843,13 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
goto tx_error_put;
}
- if (skb_dst(skb))
+ if (rt_is_output_route(skb_rtable(skb)))
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
- df |= (old_iph->frag_off & htons(IP_DF));
+ /* Copy DF, reset fragment offset and MF */
+ df = sysctl_pmtu_disc(ipvs) ? old_iph->frag_off & htons(IP_DF) : 0;
- if ((old_iph->frag_off & htons(IP_DF) &&
- mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb))) {
+ if (df && mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb)) {
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
goto tx_error_put;
@@ -915,8 +962,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
if (skb_dst(skb))
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
- if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr) &&
- !skb_is_gso(skb)) {
+ /* MTU checking: Notice that 'mtu' have been adjusted before hand */
+ if (__mtu_check_toobig_v6(skb, mtu)) {
if (!skb->dev) {
struct net *net = dev_net(skb_dst(skb)->dev);
@@ -1082,7 +1129,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if (skb->len > mtu) {
+ if (__mtu_check_toobig_v6(skb, mtu)) {
if (!skb->dev) {
struct net *net = dev_net(skb_dst(skb)->dev);
@@ -1318,7 +1365,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if (skb->len > mtu && !skb_is_gso(skb)) {
+ if (__mtu_check_toobig_v6(skb, mtu)) {
if (!skb->dev) {
struct net *net = dev_net(skb_dst(skb)->dev);