From e6f225ebb7c35fe30fdf8608927c5cf8fce6de7d Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Fri, 19 Sep 2008 20:41:56 +0200 Subject: ipvs: Restrict sync message to 255 connections The nr_conns variable in the sync message header is only eight bits wide and will overflow on interfaces with a large MTU. As a result the backup won't parse all connections contained in the sync buffer. On regular ethernet with an MTU of 1500 this isn't a problem, because we can't overflow the value, but consider jumbo frames being used on a cross-over connection between both directors. We now restrict the size of the sync buffer, so that we never put more than 255 connections into a single sync buffer. Signed-off-by: Sven Wegener Signed-off-by: Simon Horman diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 28237a5..de5e7e1 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -99,6 +100,7 @@ struct ip_vs_sync_thread_data { */ #define SYNC_MESG_HEADER_LEN 4 +#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ struct ip_vs_sync_mesg { __u8 nr_conns; @@ -516,8 +518,8 @@ static int set_sync_mesg_maxlen(int sync_state) num = (dev->mtu - sizeof(struct iphdr) - sizeof(struct udphdr) - SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; - sync_send_mesg_maxlen = - SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num; + sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN + + SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); IP_VS_DBG(7, "setting the maximum length of sync sending " "message %d.\n", sync_send_mesg_maxlen); } else if (sync_state == IP_VS_STATE_BACKUP) { -- cgit v0.10.2 From 8d5803bf6fbe5264000afc8c34bff08e8ecc023b Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Sat, 20 Sep 2008 11:48:33 +0200 Subject: ipvs: Fix unused label warning Signed-off-by: Sven Wegener Signed-off-by: Simon Horman diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 771551d..0302cf3 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -1330,7 +1330,9 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) out_unlock: write_unlock_bh(&__ip_vs_svc_lock); +#ifdef CONFIG_IP_VS_IPV6 out: +#endif if (old_sched) ip_vs_scheduler_put(old_sched); -- cgit v0.10.2 From cb7f6a7b716e801097b564dec3ccb58d330aef56 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Fri, 19 Sep 2008 12:32:57 +0200 Subject: IPVS: Move IPVS to net/netfilter/ipvs Since IPVS now has partial IPv6 support, this patch moves IPVS from net/ipv4/ipvs to net/netfilter/ipvs. It's a result of: $ git mv net/ipv4/ipvs net/netfilter and adapting the relevant Kconfigs/Makefiles to the new path. Signed-off-by: Julius Volz Signed-off-by: Simon Horman diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 591ea23..691268f 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -630,5 +630,3 @@ config TCP_MD5SIG If unsure, say N. -source "net/ipv4/ipvs/Kconfig" - diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ad40ef3..80ff87c 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -33,7 +33,6 @@ obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o obj-$(CONFIG_IP_PNP) += ipconfig.o obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/ -obj-$(CONFIG_IP_VS) += ipvs/ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig deleted file mode 100644 index de6004d..0000000 --- a/net/ipv4/ipvs/Kconfig +++ /dev/null @@ -1,239 +0,0 @@ -# -# IP Virtual Server configuration -# -menuconfig IP_VS - tristate "IP virtual server support (EXPERIMENTAL)" - depends on NETFILTER - ---help--- - IP Virtual Server support will let you build a high-performance - virtual server based on cluster of two or more real servers. This - option must be enabled for at least one of the clustered computers - that will take care of intercepting incoming connections to a - single IP address and scheduling them to real servers. - - Three request dispatching techniques are implemented, they are - virtual server via NAT, virtual server via tunneling and virtual - server via direct routing. The several scheduling algorithms can - be used to choose which server the connection is directed to, - thus load balancing can be achieved among the servers. For more - information and its administration program, please visit the - following URL: . - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -if IP_VS - -config IP_VS_IPV6 - bool "IPv6 support for IPVS (DANGEROUS)" - depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6) - ---help--- - Add IPv6 support to IPVS. This is incomplete and might be dangerous. - - Say N if unsure. - -config IP_VS_DEBUG - bool "IP virtual server debugging" - ---help--- - Say Y here if you want to get additional messages useful in - debugging the IP virtual server code. You can change the debug - level in /proc/sys/net/ipv4/vs/debug_level - -config IP_VS_TAB_BITS - int "IPVS connection table size (the Nth power of 2)" - range 8 20 - default 12 - ---help--- - The IPVS connection hash table uses the chaining scheme to handle - hash collisions. Using a big IPVS connection hash table will greatly - reduce conflicts when there are hundreds of thousands of connections - in the hash table. - - Note the table size must be power of 2. The table size will be the - value of 2 to the your input number power. The number to choose is - from 8 to 20, the default number is 12, which means the table size - is 4096. Don't input the number too small, otherwise you will lose - performance on it. You can adapt the table size yourself, according - to your virtual server application. It is good to set the table size - not far less than the number of connections per second multiplying - average lasting time of connection in the table. For example, your - virtual server gets 200 connections per second, the connection lasts - for 200 seconds in average in the connection table, the table size - should be not far less than 200x200, it is good to set the table - size 32768 (2**15). - - Another note that each connection occupies 128 bytes effectively and - each hash entry uses 8 bytes, so you can estimate how much memory is - needed for your box. - -comment "IPVS transport protocol load balancing support" - -config IP_VS_PROTO_TCP - bool "TCP load balancing support" - ---help--- - This option enables support for load balancing TCP transport - protocol. Say Y if unsure. - -config IP_VS_PROTO_UDP - bool "UDP load balancing support" - ---help--- - This option enables support for load balancing UDP transport - protocol. Say Y if unsure. - -config IP_VS_PROTO_AH_ESP - bool - depends on UNDEFINED - -config IP_VS_PROTO_ESP - bool "ESP load balancing support" - select IP_VS_PROTO_AH_ESP - ---help--- - This option enables support for load balancing ESP (Encapsulation - Security Payload) transport protocol. Say Y if unsure. - -config IP_VS_PROTO_AH - bool "AH load balancing support" - select IP_VS_PROTO_AH_ESP - ---help--- - This option enables support for load balancing AH (Authentication - Header) transport protocol. Say Y if unsure. - -comment "IPVS scheduler" - -config IP_VS_RR - tristate "round-robin scheduling" - ---help--- - The robin-robin scheduling algorithm simply directs network - connections to different real servers in a round-robin manner. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_WRR - tristate "weighted round-robin scheduling" - ---help--- - The weighted robin-robin scheduling algorithm directs network - connections to different real servers based on server weights - in a round-robin manner. Servers with higher weights receive - new connections first than those with less weights, and servers - with higher weights get more connections than those with less - weights and servers with equal weights get equal connections. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_LC - tristate "least-connection scheduling" - ---help--- - The least-connection scheduling algorithm directs network - connections to the server with the least number of active - connections. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_WLC - tristate "weighted least-connection scheduling" - ---help--- - The weighted least-connection scheduling algorithm directs network - connections to the server with the least active connections - normalized by the server weight. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_LBLC - tristate "locality-based least-connection scheduling" - ---help--- - The locality-based least-connection scheduling algorithm is for - destination IP load balancing. It is usually used in cache cluster. - This algorithm usually directs packet destined for an IP address to - its server if the server is alive and under load. If the server is - overloaded (its active connection numbers is larger than its weight) - and there is a server in its half load, then allocate the weighted - least-connection server to this IP address. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_LBLCR - tristate "locality-based least-connection with replication scheduling" - ---help--- - The locality-based least-connection with replication scheduling - algorithm is also for destination IP load balancing. It is - usually used in cache cluster. It differs from the LBLC scheduling - as follows: the load balancer maintains mappings from a target - to a set of server nodes that can serve the target. Requests for - a target are assigned to the least-connection node in the target's - server set. If all the node in the server set are over loaded, - it picks up a least-connection node in the cluster and adds it - in the sever set for the target. If the server set has not been - modified for the specified time, the most loaded node is removed - from the server set, in order to avoid high degree of replication. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_DH - tristate "destination hashing scheduling" - ---help--- - The destination hashing scheduling algorithm assigns network - connections to the servers through looking up a statically assigned - hash table by their destination IP addresses. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_SH - tristate "source hashing scheduling" - ---help--- - The source hashing scheduling algorithm assigns network - connections to the servers through looking up a statically assigned - hash table by their source IP addresses. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_SED - tristate "shortest expected delay scheduling" - ---help--- - The shortest expected delay scheduling algorithm assigns network - connections to the server with the shortest expected delay. The - expected delay that the job will experience is (Ci + 1) / Ui if - sent to the ith server, in which Ci is the number of connections - on the ith server and Ui is the fixed service rate (weight) - of the ith server. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_NQ - tristate "never queue scheduling" - ---help--- - The never queue scheduling algorithm adopts a two-speed model. - When there is an idle server available, the job will be sent to - the idle server, instead of waiting for a fast one. When there - is no idle server available, the job will be sent to the server - that minimize its expected delay (The Shortest Expected Delay - scheduling algorithm). - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -comment 'IPVS application helper' - -config IP_VS_FTP - tristate "FTP protocol helper" - depends on IP_VS_PROTO_TCP - ---help--- - FTP is a protocol that transfers IP address and/or port number in - the payload. In the virtual server via Network Address Translation, - the IP address and port number of real servers cannot be sent to - clients in ftp connections directly, so FTP protocol helper is - required for tracking the connection and mangling it back to that of - virtual service. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -endif # IP_VS diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile deleted file mode 100644 index 73a46fe..0000000 --- a/net/ipv4/ipvs/Makefile +++ /dev/null @@ -1,33 +0,0 @@ -# -# Makefile for the IPVS modules on top of IPv4. -# - -# IPVS transport protocol load balancing support -ip_vs_proto-objs-y := -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o - -ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ - ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ - ip_vs_est.o ip_vs_proto.o \ - $(ip_vs_proto-objs-y) - - -# IPVS core -obj-$(CONFIG_IP_VS) += ip_vs.o - -# IPVS schedulers -obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o -obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o -obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o -obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o -obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o -obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o -obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o -obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o -obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o -obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o - -# IPVS application helpers -obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c deleted file mode 100644 index 201b8ea..0000000 --- a/net/ipv4/ipvs/ip_vs_app.c +++ /dev/null @@ -1,622 +0,0 @@ -/* - * ip_vs_app.c: Application module support for IPVS - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference - * is that ip_vs_app module handles the reverse direction (incoming requests - * and outgoing responses). - * - * IP_MASQ_APP application masquerading module - * - * Author: Juan Jose Ciarlante, - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -EXPORT_SYMBOL(register_ip_vs_app); -EXPORT_SYMBOL(unregister_ip_vs_app); -EXPORT_SYMBOL(register_ip_vs_app_inc); - -/* ipvs application list head */ -static LIST_HEAD(ip_vs_app_list); -static DEFINE_MUTEX(__ip_vs_app_mutex); - - -/* - * Get an ip_vs_app object - */ -static inline int ip_vs_app_get(struct ip_vs_app *app) -{ - return try_module_get(app->module); -} - - -static inline void ip_vs_app_put(struct ip_vs_app *app) -{ - module_put(app->module); -} - - -/* - * Allocate/initialize app incarnation and register it in proto apps. - */ -static int -ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) -{ - struct ip_vs_protocol *pp; - struct ip_vs_app *inc; - int ret; - - if (!(pp = ip_vs_proto_get(proto))) - return -EPROTONOSUPPORT; - - if (!pp->unregister_app) - return -EOPNOTSUPP; - - inc = kmemdup(app, sizeof(*inc), GFP_KERNEL); - if (!inc) - return -ENOMEM; - INIT_LIST_HEAD(&inc->p_list); - INIT_LIST_HEAD(&inc->incs_list); - inc->app = app; - inc->port = htons(port); - atomic_set(&inc->usecnt, 0); - - if (app->timeouts) { - inc->timeout_table = - ip_vs_create_timeout_table(app->timeouts, - app->timeouts_size); - if (!inc->timeout_table) { - ret = -ENOMEM; - goto out; - } - } - - ret = pp->register_app(inc); - if (ret) - goto out; - - list_add(&inc->a_list, &app->incs_list); - IP_VS_DBG(9, "%s application %s:%u registered\n", - pp->name, inc->name, inc->port); - - return 0; - - out: - kfree(inc->timeout_table); - kfree(inc); - return ret; -} - - -/* - * Release app incarnation - */ -static void -ip_vs_app_inc_release(struct ip_vs_app *inc) -{ - struct ip_vs_protocol *pp; - - if (!(pp = ip_vs_proto_get(inc->protocol))) - return; - - if (pp->unregister_app) - pp->unregister_app(inc); - - IP_VS_DBG(9, "%s App %s:%u unregistered\n", - pp->name, inc->name, inc->port); - - list_del(&inc->a_list); - - kfree(inc->timeout_table); - kfree(inc); -} - - -/* - * Get reference to app inc (only called from softirq) - * - */ -int ip_vs_app_inc_get(struct ip_vs_app *inc) -{ - int result; - - atomic_inc(&inc->usecnt); - if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) - atomic_dec(&inc->usecnt); - return result; -} - - -/* - * Put the app inc (only called from timer or net softirq) - */ -void ip_vs_app_inc_put(struct ip_vs_app *inc) -{ - ip_vs_app_put(inc->app); - atomic_dec(&inc->usecnt); -} - - -/* - * Register an application incarnation in protocol applications - */ -int -register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) -{ - int result; - - mutex_lock(&__ip_vs_app_mutex); - - result = ip_vs_app_inc_new(app, proto, port); - - mutex_unlock(&__ip_vs_app_mutex); - - return result; -} - - -/* - * ip_vs_app registration routine - */ -int register_ip_vs_app(struct ip_vs_app *app) -{ - /* increase the module use count */ - ip_vs_use_count_inc(); - - mutex_lock(&__ip_vs_app_mutex); - - list_add(&app->a_list, &ip_vs_app_list); - - mutex_unlock(&__ip_vs_app_mutex); - - return 0; -} - - -/* - * ip_vs_app unregistration routine - * We are sure there are no app incarnations attached to services - */ -void unregister_ip_vs_app(struct ip_vs_app *app) -{ - struct ip_vs_app *inc, *nxt; - - mutex_lock(&__ip_vs_app_mutex); - - list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { - ip_vs_app_inc_release(inc); - } - - list_del(&app->a_list); - - mutex_unlock(&__ip_vs_app_mutex); - - /* decrease the module use count */ - ip_vs_use_count_dec(); -} - - -/* - * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) - */ -int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp) -{ - return pp->app_conn_bind(cp); -} - - -/* - * Unbind cp from application incarnation (called by cp destructor) - */ -void ip_vs_unbind_app(struct ip_vs_conn *cp) -{ - struct ip_vs_app *inc = cp->app; - - if (!inc) - return; - - if (inc->unbind_conn) - inc->unbind_conn(inc, cp); - if (inc->done_conn) - inc->done_conn(inc, cp); - ip_vs_app_inc_put(inc); - cp->app = NULL; -} - - -/* - * Fixes th->seq based on ip_vs_seq info. - */ -static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) -{ - __u32 seq = ntohl(th->seq); - - /* - * Adjust seq with delta-offset for all packets after - * the most recent resized pkt seq and with previous_delta offset - * for all packets before most recent resized pkt seq. - */ - if (vseq->delta || vseq->previous_delta) { - if(after(seq, vseq->init_seq)) { - th->seq = htonl(seq + vseq->delta); - IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n", - vseq->delta); - } else { - th->seq = htonl(seq + vseq->previous_delta); - IP_VS_DBG(9, "vs_fix_seq(): added previous_delta " - "(%d) to seq\n", vseq->previous_delta); - } - } -} - - -/* - * Fixes th->ack_seq based on ip_vs_seq info. - */ -static inline void -vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) -{ - __u32 ack_seq = ntohl(th->ack_seq); - - /* - * Adjust ack_seq with delta-offset for - * the packets AFTER most recent resized pkt has caused a shift - * for packets before most recent resized pkt, use previous_delta - */ - if (vseq->delta || vseq->previous_delta) { - /* since ack_seq is the number of octet that is expected - to receive next, so compare it with init_seq+delta */ - if(after(ack_seq, vseq->init_seq+vseq->delta)) { - th->ack_seq = htonl(ack_seq - vseq->delta); - IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta " - "(%d) from ack_seq\n", vseq->delta); - - } else { - th->ack_seq = htonl(ack_seq - vseq->previous_delta); - IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted " - "previous_delta (%d) from ack_seq\n", - vseq->previous_delta); - } - } -} - - -/* - * Updates ip_vs_seq if pkt has been resized - * Assumes already checked proto==IPPROTO_TCP and diff!=0. - */ -static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, - unsigned flag, __u32 seq, int diff) -{ - /* spinlock is to keep updating cp->flags atomic */ - spin_lock(&cp->lock); - if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { - vseq->previous_delta = vseq->delta; - vseq->delta += diff; - vseq->init_seq = seq; - cp->flags |= flag; - } - spin_unlock(&cp->lock); -} - -static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, - struct ip_vs_app *app) -{ - int diff; - const unsigned int tcp_offset = ip_hdrlen(skb); - struct tcphdr *th; - __u32 seq; - - if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) - return 0; - - th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); - - /* - * Remember seq number in case this pkt gets resized - */ - seq = ntohl(th->seq); - - /* - * Fix seq stuff if flagged as so. - */ - if (cp->flags & IP_VS_CONN_F_OUT_SEQ) - vs_fix_seq(&cp->out_seq, th); - if (cp->flags & IP_VS_CONN_F_IN_SEQ) - vs_fix_ack_seq(&cp->in_seq, th); - - /* - * Call private output hook function - */ - if (app->pkt_out == NULL) - return 1; - - if (!app->pkt_out(app, cp, skb, &diff)) - return 0; - - /* - * Update ip_vs seq stuff if len has changed. - */ - if (diff != 0) - vs_seq_update(cp, &cp->out_seq, - IP_VS_CONN_F_OUT_SEQ, seq, diff); - - return 1; -} - -/* - * Output pkt hook. Will call bound ip_vs_app specific function - * called by ipvs packet handler, assumes previously checked cp!=NULL - * returns false if it can't handle packet (oom) - */ -int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_app *app; - - /* - * check if application module is bound to - * this ip_vs_conn. - */ - if ((app = cp->app) == NULL) - return 1; - - /* TCP is complicated */ - if (cp->protocol == IPPROTO_TCP) - return app_tcp_pkt_out(cp, skb, app); - - /* - * Call private output hook function - */ - if (app->pkt_out == NULL) - return 1; - - return app->pkt_out(app, cp, skb, NULL); -} - - -static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, - struct ip_vs_app *app) -{ - int diff; - const unsigned int tcp_offset = ip_hdrlen(skb); - struct tcphdr *th; - __u32 seq; - - if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) - return 0; - - th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); - - /* - * Remember seq number in case this pkt gets resized - */ - seq = ntohl(th->seq); - - /* - * Fix seq stuff if flagged as so. - */ - if (cp->flags & IP_VS_CONN_F_IN_SEQ) - vs_fix_seq(&cp->in_seq, th); - if (cp->flags & IP_VS_CONN_F_OUT_SEQ) - vs_fix_ack_seq(&cp->out_seq, th); - - /* - * Call private input hook function - */ - if (app->pkt_in == NULL) - return 1; - - if (!app->pkt_in(app, cp, skb, &diff)) - return 0; - - /* - * Update ip_vs seq stuff if len has changed. - */ - if (diff != 0) - vs_seq_update(cp, &cp->in_seq, - IP_VS_CONN_F_IN_SEQ, seq, diff); - - return 1; -} - -/* - * Input pkt hook. Will call bound ip_vs_app specific function - * called by ipvs packet handler, assumes previously checked cp!=NULL. - * returns false if can't handle packet (oom). - */ -int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_app *app; - - /* - * check if application module is bound to - * this ip_vs_conn. - */ - if ((app = cp->app) == NULL) - return 1; - - /* TCP is complicated */ - if (cp->protocol == IPPROTO_TCP) - return app_tcp_pkt_in(cp, skb, app); - - /* - * Call private input hook function - */ - if (app->pkt_in == NULL) - return 1; - - return app->pkt_in(app, cp, skb, NULL); -} - - -#ifdef CONFIG_PROC_FS -/* - * /proc/net/ip_vs_app entry function - */ - -static struct ip_vs_app *ip_vs_app_idx(loff_t pos) -{ - struct ip_vs_app *app, *inc; - - list_for_each_entry(app, &ip_vs_app_list, a_list) { - list_for_each_entry(inc, &app->incs_list, a_list) { - if (pos-- == 0) - return inc; - } - } - return NULL; - -} - -static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) -{ - mutex_lock(&__ip_vs_app_mutex); - - return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN; -} - -static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct ip_vs_app *inc, *app; - struct list_head *e; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ip_vs_app_idx(0); - - inc = v; - app = inc->app; - - if ((e = inc->a_list.next) != &app->incs_list) - return list_entry(e, struct ip_vs_app, a_list); - - /* go on to next application */ - for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) { - app = list_entry(e, struct ip_vs_app, a_list); - list_for_each_entry(inc, &app->incs_list, a_list) { - return inc; - } - } - return NULL; -} - -static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) -{ - mutex_unlock(&__ip_vs_app_mutex); -} - -static int ip_vs_app_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) - seq_puts(seq, "prot port usecnt name\n"); - else { - const struct ip_vs_app *inc = v; - - seq_printf(seq, "%-3s %-7u %-6d %-17s\n", - ip_vs_proto_name(inc->protocol), - ntohs(inc->port), - atomic_read(&inc->usecnt), - inc->name); - } - return 0; -} - -static const struct seq_operations ip_vs_app_seq_ops = { - .start = ip_vs_app_seq_start, - .next = ip_vs_app_seq_next, - .stop = ip_vs_app_seq_stop, - .show = ip_vs_app_seq_show, -}; - -static int ip_vs_app_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &ip_vs_app_seq_ops); -} - -static const struct file_operations ip_vs_app_fops = { - .owner = THIS_MODULE, - .open = ip_vs_app_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif - - -/* - * Replace a segment of data with a new segment - */ -int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri, - char *o_buf, int o_len, char *n_buf, int n_len) -{ - int diff; - int o_offset; - int o_left; - - EnterFunction(9); - - diff = n_len - o_len; - o_offset = o_buf - (char *)skb->data; - /* The length of left data after o_buf+o_len in the skb data */ - o_left = skb->len - (o_offset + o_len); - - if (diff <= 0) { - memmove(o_buf + n_len, o_buf + o_len, o_left); - memcpy(o_buf, n_buf, n_len); - skb_trim(skb, skb->len + diff); - } else if (diff <= skb_tailroom(skb)) { - skb_put(skb, diff); - memmove(o_buf + n_len, o_buf + o_len, o_left); - memcpy(o_buf, n_buf, n_len); - } else { - if (pskb_expand_head(skb, skb_headroom(skb), diff, pri)) - return -ENOMEM; - skb_put(skb, diff); - memmove(skb->data + o_offset + n_len, - skb->data + o_offset + o_len, o_left); - skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len); - } - - /* must update the iph total length here */ - ip_hdr(skb)->tot_len = htons(skb->len); - - LeaveFunction(9); - return 0; -} - - -int __init ip_vs_app_init(void) -{ - /* we will replace it with proc_net_ipvs_create() soon */ - proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops); - return 0; -} - - -void ip_vs_app_cleanup(void) -{ - proc_net_remove(&init_net, "ip_vs_app"); -} diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c deleted file mode 100644 index 9a24332..0000000 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ /dev/null @@ -1,1110 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the Netfilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Authors: Wensong Zhang - * Peter Kese - * Julian Anastasov - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, - * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms - * and others. Many code here is taken from IP MASQ code of kernel 2.2. - * - * Changes: - * - */ - -#include -#include -#include -#include -#include -#include -#include /* for proc_net_* */ -#include -#include -#include - -#include -#include - - -/* - * Connection hash table: for input and output packets lookups of IPVS - */ -static struct list_head *ip_vs_conn_tab; - -/* SLAB cache for IPVS connections */ -static struct kmem_cache *ip_vs_conn_cachep __read_mostly; - -/* counter for current IPVS connections */ -static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); - -/* counter for no client port connections */ -static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); - -/* random value for IPVS connection hash */ -static unsigned int ip_vs_conn_rnd; - -/* - * Fine locking granularity for big connection hash table - */ -#define CT_LOCKARRAY_BITS 4 -#define CT_LOCKARRAY_SIZE (1<ip, (__force u32)port, proto, - ip_vs_conn_rnd) - & IP_VS_CONN_TAB_MASK; -} - - -/* - * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. - * returns bool success. - */ -static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) -{ - unsigned hash; - int ret; - - /* Hash by protocol, client address and port */ - hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); - - ct_write_lock(hash); - - if (!(cp->flags & IP_VS_CONN_F_HASHED)) { - list_add(&cp->c_list, &ip_vs_conn_tab[hash]); - cp->flags |= IP_VS_CONN_F_HASHED; - atomic_inc(&cp->refcnt); - ret = 1; - } else { - IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, " - "called from %p\n", __builtin_return_address(0)); - ret = 0; - } - - ct_write_unlock(hash); - - return ret; -} - - -/* - * UNhashes ip_vs_conn from ip_vs_conn_tab. - * returns bool success. - */ -static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) -{ - unsigned hash; - int ret; - - /* unhash it and decrease its reference counter */ - hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); - - ct_write_lock(hash); - - if (cp->flags & IP_VS_CONN_F_HASHED) { - list_del(&cp->c_list); - cp->flags &= ~IP_VS_CONN_F_HASHED; - atomic_dec(&cp->refcnt); - ret = 1; - } else - ret = 0; - - ct_write_unlock(hash); - - return ret; -} - - -/* - * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. - * Called for pkts coming from OUTside-to-INside. - * s_addr, s_port: pkt source address (foreign host) - * d_addr, d_port: pkt dest address (load balancer) - */ -static inline struct ip_vs_conn *__ip_vs_conn_in_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) -{ - unsigned hash; - struct ip_vs_conn *cp; - - hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); - - ct_read_lock(hash); - - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == af && - ip_vs_addr_equal(af, s_addr, &cp->caddr) && - ip_vs_addr_equal(af, d_addr, &cp->vaddr) && - s_port == cp->cport && d_port == cp->vport && - ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && - protocol == cp->protocol) { - /* HIT */ - atomic_inc(&cp->refcnt); - ct_read_unlock(hash); - return cp; - } - } - - ct_read_unlock(hash); - - return NULL; -} - -struct ip_vs_conn *ip_vs_conn_in_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) -{ - struct ip_vs_conn *cp; - - cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port); - if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) - cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr, - d_port); - - IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", - ip_vs_proto_name(protocol), - IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), - IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), - cp ? "hit" : "not hit"); - - return cp; -} - -/* Get reference to connection template */ -struct ip_vs_conn *ip_vs_ct_in_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) -{ - unsigned hash; - struct ip_vs_conn *cp; - - hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); - - ct_read_lock(hash); - - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == af && - ip_vs_addr_equal(af, s_addr, &cp->caddr) && - ip_vs_addr_equal(af, d_addr, &cp->vaddr) && - s_port == cp->cport && d_port == cp->vport && - cp->flags & IP_VS_CONN_F_TEMPLATE && - protocol == cp->protocol) { - /* HIT */ - atomic_inc(&cp->refcnt); - goto out; - } - } - cp = NULL; - - out: - ct_read_unlock(hash); - - IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", - ip_vs_proto_name(protocol), - IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), - IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), - cp ? "hit" : "not hit"); - - return cp; -} - -/* - * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. - * Called for pkts coming from inside-to-OUTside. - * s_addr, s_port: pkt source address (inside host) - * d_addr, d_port: pkt dest address (foreign host) - */ -struct ip_vs_conn *ip_vs_conn_out_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) -{ - unsigned hash; - struct ip_vs_conn *cp, *ret=NULL; - - /* - * Check for "full" addressed entries - */ - hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port); - - ct_read_lock(hash); - - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == af && - ip_vs_addr_equal(af, d_addr, &cp->caddr) && - ip_vs_addr_equal(af, s_addr, &cp->daddr) && - d_port == cp->cport && s_port == cp->dport && - protocol == cp->protocol) { - /* HIT */ - atomic_inc(&cp->refcnt); - ret = cp; - break; - } - } - - ct_read_unlock(hash); - - IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", - ip_vs_proto_name(protocol), - IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), - IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), - ret ? "hit" : "not hit"); - - return ret; -} - - -/* - * Put back the conn and restart its timer with its timeout - */ -void ip_vs_conn_put(struct ip_vs_conn *cp) -{ - /* reset it expire in its timeout */ - mod_timer(&cp->timer, jiffies+cp->timeout); - - __ip_vs_conn_put(cp); -} - - -/* - * Fill a no_client_port connection with a client port number - */ -void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) -{ - if (ip_vs_conn_unhash(cp)) { - spin_lock(&cp->lock); - if (cp->flags & IP_VS_CONN_F_NO_CPORT) { - atomic_dec(&ip_vs_conn_no_cport_cnt); - cp->flags &= ~IP_VS_CONN_F_NO_CPORT; - cp->cport = cport; - } - spin_unlock(&cp->lock); - - /* hash on new dport */ - ip_vs_conn_hash(cp); - } -} - - -/* - * Bind a connection entry with the corresponding packet_xmit. - * Called by ip_vs_conn_new. - */ -static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) -{ - switch (IP_VS_FWD_METHOD(cp)) { - case IP_VS_CONN_F_MASQ: - cp->packet_xmit = ip_vs_nat_xmit; - break; - - case IP_VS_CONN_F_TUNNEL: - cp->packet_xmit = ip_vs_tunnel_xmit; - break; - - case IP_VS_CONN_F_DROUTE: - cp->packet_xmit = ip_vs_dr_xmit; - break; - - case IP_VS_CONN_F_LOCALNODE: - cp->packet_xmit = ip_vs_null_xmit; - break; - - case IP_VS_CONN_F_BYPASS: - cp->packet_xmit = ip_vs_bypass_xmit; - break; - } -} - -#ifdef CONFIG_IP_VS_IPV6 -static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp) -{ - switch (IP_VS_FWD_METHOD(cp)) { - case IP_VS_CONN_F_MASQ: - cp->packet_xmit = ip_vs_nat_xmit_v6; - break; - - case IP_VS_CONN_F_TUNNEL: - cp->packet_xmit = ip_vs_tunnel_xmit_v6; - break; - - case IP_VS_CONN_F_DROUTE: - cp->packet_xmit = ip_vs_dr_xmit_v6; - break; - - case IP_VS_CONN_F_LOCALNODE: - cp->packet_xmit = ip_vs_null_xmit; - break; - - case IP_VS_CONN_F_BYPASS: - cp->packet_xmit = ip_vs_bypass_xmit_v6; - break; - } -} -#endif - - -static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) -{ - return atomic_read(&dest->activeconns) - + atomic_read(&dest->inactconns); -} - -/* - * Bind a connection entry with a virtual service destination - * Called just after a new connection entry is created. - */ -static inline void -ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) -{ - /* if dest is NULL, then return directly */ - if (!dest) - return; - - /* Increase the refcnt counter of the dest */ - atomic_inc(&dest->refcnt); - - /* Bind with the destination and its corresponding transmitter */ - if ((cp->flags & IP_VS_CONN_F_SYNC) && - (!(cp->flags & IP_VS_CONN_F_TEMPLATE))) - /* if the connection is not template and is created - * by sync, preserve the activity flag. - */ - cp->flags |= atomic_read(&dest->conn_flags) & - (~IP_VS_CONN_F_INACTIVE); - else - cp->flags |= atomic_read(&dest->conn_flags); - cp->dest = dest; - - IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " - "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " - "dest->refcnt:%d\n", - ip_vs_proto_name(cp->protocol), - IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), - IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), - IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), - ip_vs_fwd_tag(cp), cp->state, - cp->flags, atomic_read(&cp->refcnt), - atomic_read(&dest->refcnt)); - - /* Update the connection counters */ - if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { - /* It is a normal connection, so increase the inactive - connection counter because it is in TCP SYNRECV - state (inactive) or other protocol inacive state */ - if ((cp->flags & IP_VS_CONN_F_SYNC) && - (!(cp->flags & IP_VS_CONN_F_INACTIVE))) - atomic_inc(&dest->activeconns); - else - atomic_inc(&dest->inactconns); - } else { - /* It is a persistent connection/template, so increase - the peristent connection counter */ - atomic_inc(&dest->persistconns); - } - - if (dest->u_threshold != 0 && - ip_vs_dest_totalconns(dest) >= dest->u_threshold) - dest->flags |= IP_VS_DEST_F_OVERLOAD; -} - - -/* - * Check if there is a destination for the connection, if so - * bind the connection to the destination. - */ -struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) -{ - struct ip_vs_dest *dest; - - if ((cp) && (!cp->dest)) { - dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport, - &cp->vaddr, cp->vport, - cp->protocol); - ip_vs_bind_dest(cp, dest); - return dest; - } else - return NULL; -} - - -/* - * Unbind a connection entry with its VS destination - * Called by the ip_vs_conn_expire function. - */ -static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) -{ - struct ip_vs_dest *dest = cp->dest; - - if (!dest) - return; - - IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d " - "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " - "dest->refcnt:%d\n", - ip_vs_proto_name(cp->protocol), - IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), - IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), - IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), - ip_vs_fwd_tag(cp), cp->state, - cp->flags, atomic_read(&cp->refcnt), - atomic_read(&dest->refcnt)); - - /* Update the connection counters */ - if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { - /* It is a normal connection, so decrease the inactconns - or activeconns counter */ - if (cp->flags & IP_VS_CONN_F_INACTIVE) { - atomic_dec(&dest->inactconns); - } else { - atomic_dec(&dest->activeconns); - } - } else { - /* It is a persistent connection/template, so decrease - the peristent connection counter */ - atomic_dec(&dest->persistconns); - } - - if (dest->l_threshold != 0) { - if (ip_vs_dest_totalconns(dest) < dest->l_threshold) - dest->flags &= ~IP_VS_DEST_F_OVERLOAD; - } else if (dest->u_threshold != 0) { - if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) - dest->flags &= ~IP_VS_DEST_F_OVERLOAD; - } else { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - dest->flags &= ~IP_VS_DEST_F_OVERLOAD; - } - - /* - * Simply decrease the refcnt of the dest, because the - * dest will be either in service's destination list - * or in the trash. - */ - atomic_dec(&dest->refcnt); -} - - -/* - * Checking if the destination of a connection template is available. - * If available, return 1, otherwise invalidate this connection - * template and return 0. - */ -int ip_vs_check_template(struct ip_vs_conn *ct) -{ - struct ip_vs_dest *dest = ct->dest; - - /* - * Checking the dest server status. - */ - if ((dest == NULL) || - !(dest->flags & IP_VS_DEST_F_AVAILABLE) || - (sysctl_ip_vs_expire_quiescent_template && - (atomic_read(&dest->weight) == 0))) { - IP_VS_DBG_BUF(9, "check_template: dest not available for " - "protocol %s s:%s:%d v:%s:%d " - "-> d:%s:%d\n", - ip_vs_proto_name(ct->protocol), - IP_VS_DBG_ADDR(ct->af, &ct->caddr), - ntohs(ct->cport), - IP_VS_DBG_ADDR(ct->af, &ct->vaddr), - ntohs(ct->vport), - IP_VS_DBG_ADDR(ct->af, &ct->daddr), - ntohs(ct->dport)); - - /* - * Invalidate the connection template - */ - if (ct->vport != htons(0xffff)) { - if (ip_vs_conn_unhash(ct)) { - ct->dport = htons(0xffff); - ct->vport = htons(0xffff); - ct->cport = 0; - ip_vs_conn_hash(ct); - } - } - - /* - * Simply decrease the refcnt of the template, - * don't restart its timer. - */ - atomic_dec(&ct->refcnt); - return 0; - } - return 1; -} - -static void ip_vs_conn_expire(unsigned long data) -{ - struct ip_vs_conn *cp = (struct ip_vs_conn *)data; - - cp->timeout = 60*HZ; - - /* - * hey, I'm using it - */ - atomic_inc(&cp->refcnt); - - /* - * do I control anybody? - */ - if (atomic_read(&cp->n_control)) - goto expire_later; - - /* - * unhash it if it is hashed in the conn table - */ - if (!ip_vs_conn_unhash(cp)) - goto expire_later; - - /* - * refcnt==1 implies I'm the only one referrer - */ - if (likely(atomic_read(&cp->refcnt) == 1)) { - /* delete the timer if it is activated by other users */ - if (timer_pending(&cp->timer)) - del_timer(&cp->timer); - - /* does anybody control me? */ - if (cp->control) - ip_vs_control_del(cp); - - if (unlikely(cp->app != NULL)) - ip_vs_unbind_app(cp); - ip_vs_unbind_dest(cp); - if (cp->flags & IP_VS_CONN_F_NO_CPORT) - atomic_dec(&ip_vs_conn_no_cport_cnt); - atomic_dec(&ip_vs_conn_count); - - kmem_cache_free(ip_vs_conn_cachep, cp); - return; - } - - /* hash it back to the table */ - ip_vs_conn_hash(cp); - - expire_later: - IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", - atomic_read(&cp->refcnt)-1, - atomic_read(&cp->n_control)); - - ip_vs_conn_put(cp); -} - - -void ip_vs_conn_expire_now(struct ip_vs_conn *cp) -{ - if (del_timer(&cp->timer)) - mod_timer(&cp->timer, jiffies); -} - - -/* - * Create a new connection entry and hash it into the ip_vs_conn_tab - */ -struct ip_vs_conn * -ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, - const union nf_inet_addr *vaddr, __be16 vport, - const union nf_inet_addr *daddr, __be16 dport, unsigned flags, - struct ip_vs_dest *dest) -{ - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp = ip_vs_proto_get(proto); - - cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); - if (cp == NULL) { - IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n"); - return NULL; - } - - INIT_LIST_HEAD(&cp->c_list); - setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); - cp->af = af; - cp->protocol = proto; - ip_vs_addr_copy(af, &cp->caddr, caddr); - cp->cport = cport; - ip_vs_addr_copy(af, &cp->vaddr, vaddr); - cp->vport = vport; - ip_vs_addr_copy(af, &cp->daddr, daddr); - cp->dport = dport; - cp->flags = flags; - spin_lock_init(&cp->lock); - - /* - * Set the entry is referenced by the current thread before hashing - * it in the table, so that other thread run ip_vs_random_dropentry - * but cannot drop this entry. - */ - atomic_set(&cp->refcnt, 1); - - atomic_set(&cp->n_control, 0); - atomic_set(&cp->in_pkts, 0); - - atomic_inc(&ip_vs_conn_count); - if (flags & IP_VS_CONN_F_NO_CPORT) - atomic_inc(&ip_vs_conn_no_cport_cnt); - - /* Bind the connection with a destination server */ - ip_vs_bind_dest(cp, dest); - - /* Set its state and timeout */ - cp->state = 0; - cp->timeout = 3*HZ; - - /* Bind its packet transmitter */ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - ip_vs_bind_xmit_v6(cp); - else -#endif - ip_vs_bind_xmit(cp); - - if (unlikely(pp && atomic_read(&pp->appcnt))) - ip_vs_bind_app(cp, pp); - - /* Hash it in the ip_vs_conn_tab finally */ - ip_vs_conn_hash(cp); - - return cp; -} - - -/* - * /proc/net/ip_vs_conn entries - */ -#ifdef CONFIG_PROC_FS - -static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) -{ - int idx; - struct ip_vs_conn *cp; - - for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { - ct_read_lock_bh(idx); - list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { - if (pos-- == 0) { - seq->private = &ip_vs_conn_tab[idx]; - return cp; - } - } - ct_read_unlock_bh(idx); - } - - return NULL; -} - -static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) -{ - seq->private = NULL; - return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; -} - -static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct ip_vs_conn *cp = v; - struct list_head *e, *l = seq->private; - int idx; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ip_vs_conn_array(seq, 0); - - /* more on same hash chain? */ - if ((e = cp->c_list.next) != l) - return list_entry(e, struct ip_vs_conn, c_list); - - idx = l - ip_vs_conn_tab; - ct_read_unlock_bh(idx); - - while (++idx < IP_VS_CONN_TAB_SIZE) { - ct_read_lock_bh(idx); - list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { - seq->private = &ip_vs_conn_tab[idx]; - return cp; - } - ct_read_unlock_bh(idx); - } - seq->private = NULL; - return NULL; -} - -static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) -{ - struct list_head *l = seq->private; - - if (l) - ct_read_unlock_bh(l - ip_vs_conn_tab); -} - -static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) -{ - - if (v == SEQ_START_TOKEN) - seq_puts(seq, - "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n"); - else { - const struct ip_vs_conn *cp = v; - -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - seq_printf(seq, - "%-3s " NIP6_FMT " %04X " NIP6_FMT - " %04X " NIP6_FMT " %04X %-11s %7lu\n", - ip_vs_proto_name(cp->protocol), - NIP6(cp->caddr.in6), ntohs(cp->cport), - NIP6(cp->vaddr.in6), ntohs(cp->vport), - NIP6(cp->daddr.in6), ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), - (cp->timer.expires-jiffies)/HZ); - else -#endif - seq_printf(seq, - "%-3s %08X %04X %08X %04X" - " %08X %04X %-11s %7lu\n", - ip_vs_proto_name(cp->protocol), - ntohl(cp->caddr.ip), ntohs(cp->cport), - ntohl(cp->vaddr.ip), ntohs(cp->vport), - ntohl(cp->daddr.ip), ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), - (cp->timer.expires-jiffies)/HZ); - } - return 0; -} - -static const struct seq_operations ip_vs_conn_seq_ops = { - .start = ip_vs_conn_seq_start, - .next = ip_vs_conn_seq_next, - .stop = ip_vs_conn_seq_stop, - .show = ip_vs_conn_seq_show, -}; - -static int ip_vs_conn_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &ip_vs_conn_seq_ops); -} - -static const struct file_operations ip_vs_conn_fops = { - .owner = THIS_MODULE, - .open = ip_vs_conn_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static const char *ip_vs_origin_name(unsigned flags) -{ - if (flags & IP_VS_CONN_F_SYNC) - return "SYNC"; - else - return "LOCAL"; -} - -static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) -{ - - if (v == SEQ_START_TOKEN) - seq_puts(seq, - "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); - else { - const struct ip_vs_conn *cp = v; - -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - seq_printf(seq, - "%-3s " NIP6_FMT " %04X " NIP6_FMT - " %04X " NIP6_FMT " %04X %-11s %-6s %7lu\n", - ip_vs_proto_name(cp->protocol), - NIP6(cp->caddr.in6), ntohs(cp->cport), - NIP6(cp->vaddr.in6), ntohs(cp->vport), - NIP6(cp->daddr.in6), ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), - ip_vs_origin_name(cp->flags), - (cp->timer.expires-jiffies)/HZ); - else -#endif - seq_printf(seq, - "%-3s %08X %04X %08X %04X " - "%08X %04X %-11s %-6s %7lu\n", - ip_vs_proto_name(cp->protocol), - ntohl(cp->caddr.ip), ntohs(cp->cport), - ntohl(cp->vaddr.ip), ntohs(cp->vport), - ntohl(cp->daddr.ip), ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), - ip_vs_origin_name(cp->flags), - (cp->timer.expires-jiffies)/HZ); - } - return 0; -} - -static const struct seq_operations ip_vs_conn_sync_seq_ops = { - .start = ip_vs_conn_seq_start, - .next = ip_vs_conn_seq_next, - .stop = ip_vs_conn_seq_stop, - .show = ip_vs_conn_sync_seq_show, -}; - -static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &ip_vs_conn_sync_seq_ops); -} - -static const struct file_operations ip_vs_conn_sync_fops = { - .owner = THIS_MODULE, - .open = ip_vs_conn_sync_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -#endif - - -/* - * Randomly drop connection entries before running out of memory - */ -static inline int todrop_entry(struct ip_vs_conn *cp) -{ - /* - * The drop rate array needs tuning for real environments. - * Called from timer bh only => no locking - */ - static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; - static char todrop_counter[9] = {0}; - int i; - - /* if the conn entry hasn't lasted for 60 seconds, don't drop it. - This will leave enough time for normal connection to get - through. */ - if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) - return 0; - - /* Don't drop the entry if its number of incoming packets is not - located in [0, 8] */ - i = atomic_read(&cp->in_pkts); - if (i > 8 || i < 0) return 0; - - if (!todrop_rate[i]) return 0; - if (--todrop_counter[i] > 0) return 0; - - todrop_counter[i] = todrop_rate[i]; - return 1; -} - -/* Called from keventd and must protect itself from softirqs */ -void ip_vs_random_dropentry(void) -{ - int idx; - struct ip_vs_conn *cp; - - /* - * Randomly scan 1/32 of the whole table every second - */ - for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { - unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; - - /* - * Lock is actually needed in this loop. - */ - ct_write_lock_bh(hash); - - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->flags & IP_VS_CONN_F_TEMPLATE) - /* connection template */ - continue; - - if (cp->protocol == IPPROTO_TCP) { - switch(cp->state) { - case IP_VS_TCP_S_SYN_RECV: - case IP_VS_TCP_S_SYNACK: - break; - - case IP_VS_TCP_S_ESTABLISHED: - if (todrop_entry(cp)) - break; - continue; - - default: - continue; - } - } else { - if (!todrop_entry(cp)) - continue; - } - - IP_VS_DBG(4, "del connection\n"); - ip_vs_conn_expire_now(cp); - if (cp->control) { - IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); - } - } - ct_write_unlock_bh(hash); - } -} - - -/* - * Flush all the connection entries in the ip_vs_conn_tab - */ -static void ip_vs_conn_flush(void) -{ - int idx; - struct ip_vs_conn *cp; - - flush_again: - for (idx=0; idxcontrol) { - IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); - } - } - ct_write_unlock_bh(idx); - } - - /* the counter may be not NULL, because maybe some conn entries - are run by slow timer handler or unhashed but still referred */ - if (atomic_read(&ip_vs_conn_count) != 0) { - schedule(); - goto flush_again; - } -} - - -int __init ip_vs_conn_init(void) -{ - int idx; - - /* - * Allocate the connection hash table and initialize its list heads - */ - ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); - if (!ip_vs_conn_tab) - return -ENOMEM; - - /* Allocate ip_vs_conn slab cache */ - ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", - sizeof(struct ip_vs_conn), 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!ip_vs_conn_cachep) { - vfree(ip_vs_conn_tab); - return -ENOMEM; - } - - IP_VS_INFO("Connection hash table configured " - "(size=%d, memory=%ldKbytes)\n", - IP_VS_CONN_TAB_SIZE, - (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); - IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", - sizeof(struct ip_vs_conn)); - - for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); - } - - for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { - rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); - } - - proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); - proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); - - /* calculate the random value for connection hash */ - get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); - - return 0; -} - - -void ip_vs_conn_cleanup(void) -{ - /* flush all the connection entries first */ - ip_vs_conn_flush(); - - /* Release the empty cache */ - kmem_cache_destroy(ip_vs_conn_cachep); - proc_net_remove(&init_net, "ip_vs_conn"); - proc_net_remove(&init_net, "ip_vs_conn_sync"); - vfree(ip_vs_conn_tab); -} diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c deleted file mode 100644 index 958abf3..0000000 --- a/net/ipv4/ipvs/ip_vs_core.c +++ /dev/null @@ -1,1542 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the Netfilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Authors: Wensong Zhang - * Peter Kese - * Julian Anastasov - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, - * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms - * and others. - * - * Changes: - * Paul `Rusty' Russell properly handle non-linear skbs - * Harald Welte don't use nfcache - * - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include /* for icmp_send */ -#include - -#include -#include - -#ifdef CONFIG_IP_VS_IPV6 -#include -#include -#endif - -#include - - -EXPORT_SYMBOL(register_ip_vs_scheduler); -EXPORT_SYMBOL(unregister_ip_vs_scheduler); -EXPORT_SYMBOL(ip_vs_skb_replace); -EXPORT_SYMBOL(ip_vs_proto_name); -EXPORT_SYMBOL(ip_vs_conn_new); -EXPORT_SYMBOL(ip_vs_conn_in_get); -EXPORT_SYMBOL(ip_vs_conn_out_get); -#ifdef CONFIG_IP_VS_PROTO_TCP -EXPORT_SYMBOL(ip_vs_tcp_conn_listen); -#endif -EXPORT_SYMBOL(ip_vs_conn_put); -#ifdef CONFIG_IP_VS_DEBUG -EXPORT_SYMBOL(ip_vs_get_debug_level); -#endif - - -/* ID used in ICMP lookups */ -#define icmp_id(icmph) (((icmph)->un).echo.id) -#define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) - -const char *ip_vs_proto_name(unsigned proto) -{ - static char buf[20]; - - switch (proto) { - case IPPROTO_IP: - return "IP"; - case IPPROTO_UDP: - return "UDP"; - case IPPROTO_TCP: - return "TCP"; - case IPPROTO_ICMP: - return "ICMP"; -#ifdef CONFIG_IP_VS_IPV6 - case IPPROTO_ICMPV6: - return "ICMPv6"; -#endif - default: - sprintf(buf, "IP_%d", proto); - return buf; - } -} - -void ip_vs_init_hash_table(struct list_head *table, int rows) -{ - while (--rows >= 0) - INIT_LIST_HEAD(&table[rows]); -} - -static inline void -ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_dest *dest = cp->dest; - if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - spin_lock(&dest->stats.lock); - dest->stats.ustats.inpkts++; - dest->stats.ustats.inbytes += skb->len; - spin_unlock(&dest->stats.lock); - - spin_lock(&dest->svc->stats.lock); - dest->svc->stats.ustats.inpkts++; - dest->svc->stats.ustats.inbytes += skb->len; - spin_unlock(&dest->svc->stats.lock); - - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.ustats.inpkts++; - ip_vs_stats.ustats.inbytes += skb->len; - spin_unlock(&ip_vs_stats.lock); - } -} - - -static inline void -ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_dest *dest = cp->dest; - if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - spin_lock(&dest->stats.lock); - dest->stats.ustats.outpkts++; - dest->stats.ustats.outbytes += skb->len; - spin_unlock(&dest->stats.lock); - - spin_lock(&dest->svc->stats.lock); - dest->svc->stats.ustats.outpkts++; - dest->svc->stats.ustats.outbytes += skb->len; - spin_unlock(&dest->svc->stats.lock); - - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.ustats.outpkts++; - ip_vs_stats.ustats.outbytes += skb->len; - spin_unlock(&ip_vs_stats.lock); - } -} - - -static inline void -ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) -{ - spin_lock(&cp->dest->stats.lock); - cp->dest->stats.ustats.conns++; - spin_unlock(&cp->dest->stats.lock); - - spin_lock(&svc->stats.lock); - svc->stats.ustats.conns++; - spin_unlock(&svc->stats.lock); - - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.ustats.conns++; - spin_unlock(&ip_vs_stats.lock); -} - - -static inline int -ip_vs_set_state(struct ip_vs_conn *cp, int direction, - const struct sk_buff *skb, - struct ip_vs_protocol *pp) -{ - if (unlikely(!pp->state_transition)) - return 0; - return pp->state_transition(cp, direction, skb, pp); -} - - -/* - * IPVS persistent scheduling function - * It creates a connection entry according to its template if exists, - * or selects a server and creates a connection entry plus a template. - * Locking: we are svc user (svc->refcnt), so we hold all dests too - * Protocols supported: TCP, UDP - */ -static struct ip_vs_conn * -ip_vs_sched_persist(struct ip_vs_service *svc, - const struct sk_buff *skb, - __be16 ports[2]) -{ - struct ip_vs_conn *cp = NULL; - struct ip_vs_iphdr iph; - struct ip_vs_dest *dest; - struct ip_vs_conn *ct; - __be16 dport; /* destination port to forward */ - union nf_inet_addr snet; /* source network of the client, - after masking */ - - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - - /* Mask saddr with the netmask to adjust template granularity */ -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) - ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask); - else -#endif - snet.ip = iph.saddr.ip & svc->netmask; - - IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " - "mnet %s\n", - IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]), - IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]), - IP_VS_DBG_ADDR(svc->af, &snet)); - - /* - * As far as we know, FTP is a very complicated network protocol, and - * it uses control connection and data connections. For active FTP, - * FTP server initialize data connection to the client, its source port - * is often 20. For passive FTP, FTP server tells the clients the port - * that it passively listens to, and the client issues the data - * connection. In the tunneling or direct routing mode, the load - * balancer is on the client-to-server half of connection, the port - * number is unknown to the load balancer. So, a conn template like - * is created for persistent FTP - * service, and a template like - * is created for other persistent services. - */ - if (ports[1] == svc->port) { - /* Check if a template already exists */ - if (svc->port != FTPPORT) - ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, - &iph.daddr, ports[1]); - else - ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, - &iph.daddr, 0); - - if (!ct || !ip_vs_check_template(ct)) { - /* - * No template found or the dest of the connection - * template is not available. - */ - dest = svc->scheduler->schedule(svc, skb); - if (dest == NULL) { - IP_VS_DBG(1, "p-schedule: no dest found.\n"); - return NULL; - } - - /* - * Create a template like for non-ftp service, - * and - * for ftp service. - */ - if (svc->port != FTPPORT) - ct = ip_vs_conn_new(svc->af, iph.protocol, - &snet, 0, - &iph.daddr, - ports[1], - &dest->addr, dest->port, - IP_VS_CONN_F_TEMPLATE, - dest); - else - ct = ip_vs_conn_new(svc->af, iph.protocol, - &snet, 0, - &iph.daddr, 0, - &dest->addr, 0, - IP_VS_CONN_F_TEMPLATE, - dest); - if (ct == NULL) - return NULL; - - ct->timeout = svc->timeout; - } else { - /* set destination with the found template */ - dest = ct->dest; - } - dport = dest->port; - } else { - /* - * Note: persistent fwmark-based services and persistent - * port zero service are handled here. - * fwmark template: - * port zero template: - */ - if (svc->fwmark) { - union nf_inet_addr fwmark = { - .all = { 0, 0, 0, htonl(svc->fwmark) } - }; - - ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0, - &fwmark, 0); - } else - ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, - &iph.daddr, 0); - - if (!ct || !ip_vs_check_template(ct)) { - /* - * If it is not persistent port zero, return NULL, - * otherwise create a connection template. - */ - if (svc->port) - return NULL; - - dest = svc->scheduler->schedule(svc, skb); - if (dest == NULL) { - IP_VS_DBG(1, "p-schedule: no dest found.\n"); - return NULL; - } - - /* - * Create a template according to the service - */ - if (svc->fwmark) { - union nf_inet_addr fwmark = { - .all = { 0, 0, 0, htonl(svc->fwmark) } - }; - - ct = ip_vs_conn_new(svc->af, IPPROTO_IP, - &snet, 0, - &fwmark, 0, - &dest->addr, 0, - IP_VS_CONN_F_TEMPLATE, - dest); - } else - ct = ip_vs_conn_new(svc->af, iph.protocol, - &snet, 0, - &iph.daddr, 0, - &dest->addr, 0, - IP_VS_CONN_F_TEMPLATE, - dest); - if (ct == NULL) - return NULL; - - ct->timeout = svc->timeout; - } else { - /* set destination with the found template */ - dest = ct->dest; - } - dport = ports[1]; - } - - /* - * Create a new connection according to the template - */ - cp = ip_vs_conn_new(svc->af, iph.protocol, - &iph.saddr, ports[0], - &iph.daddr, ports[1], - &dest->addr, dport, - 0, - dest); - if (cp == NULL) { - ip_vs_conn_put(ct); - return NULL; - } - - /* - * Add its control - */ - ip_vs_control_add(cp, ct); - ip_vs_conn_put(ct); - - ip_vs_conn_stats(cp, svc); - return cp; -} - - -/* - * IPVS main scheduling function - * It selects a server according to the virtual service, and - * creates a connection entry. - * Protocols supported: TCP, UDP - */ -struct ip_vs_conn * -ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_conn *cp = NULL; - struct ip_vs_iphdr iph; - struct ip_vs_dest *dest; - __be16 _ports[2], *pptr; - - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - /* - * Persistent service - */ - if (svc->flags & IP_VS_SVC_F_PERSISTENT) - return ip_vs_sched_persist(svc, skb, pptr); - - /* - * Non-persistent service - */ - if (!svc->fwmark && pptr[1] != svc->port) { - if (!svc->port) - IP_VS_ERR("Schedule: port zero only supported " - "in persistent services, " - "check your ipvs configuration\n"); - return NULL; - } - - dest = svc->scheduler->schedule(svc, skb); - if (dest == NULL) { - IP_VS_DBG(1, "Schedule: no dest found.\n"); - return NULL; - } - - /* - * Create a connection entry. - */ - cp = ip_vs_conn_new(svc->af, iph.protocol, - &iph.saddr, pptr[0], - &iph.daddr, pptr[1], - &dest->addr, dest->port ? dest->port : pptr[1], - 0, - dest); - if (cp == NULL) - return NULL; - - IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " - "d:%s:%u conn->flags:%X conn->refcnt:%d\n", - ip_vs_fwd_tag(cp), - IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport), - IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport), - IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport), - cp->flags, atomic_read(&cp->refcnt)); - - ip_vs_conn_stats(cp, svc); - return cp; -} - - -/* - * Pass or drop the packet. - * Called by ip_vs_in, when the virtual service is available but - * no destination is available for a new connection. - */ -int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, - struct ip_vs_protocol *pp) -{ - __be16 _ports[2], *pptr; - struct ip_vs_iphdr iph; - int unicast; - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - - pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); - if (pptr == NULL) { - ip_vs_service_put(svc); - return NF_DROP; - } - -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) - unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST; - else -#endif - unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST); - - /* if it is fwmark-based service, the cache_bypass sysctl is up - and the destination is a non-local unicast, then create - a cache_bypass connection entry */ - if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) { - int ret, cs; - struct ip_vs_conn *cp; - union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; - - ip_vs_service_put(svc); - - /* create a new connection entry */ - IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); - cp = ip_vs_conn_new(svc->af, iph.protocol, - &iph.saddr, pptr[0], - &iph.daddr, pptr[1], - &daddr, 0, - IP_VS_CONN_F_BYPASS, - NULL); - if (cp == NULL) - return NF_DROP; - - /* statistics */ - ip_vs_in_stats(cp, skb); - - /* set state */ - cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); - - /* transmit the first SYN packet */ - ret = cp->packet_xmit(skb, cp, pp); - /* do not touch skb anymore */ - - atomic_inc(&cp->in_pkts); - ip_vs_conn_put(cp); - return ret; - } - - /* - * When the virtual ftp service is presented, packets destined - * for other services on the VIP may get here (except services - * listed in the ipvs table), pass the packets, because it is - * not ipvs job to decide to drop the packets. - */ - if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) { - ip_vs_service_put(svc); - return NF_ACCEPT; - } - - ip_vs_service_put(svc); - - /* - * Notify the client that the destination is unreachable, and - * release the socket buffer. - * Since it is in IP layer, the TCP socket is not actually - * created, the TCP RST packet cannot be sent, instead that - * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ - */ -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, - skb->dev); - else -#endif - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); - - return NF_DROP; -} - - -/* - * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING - * chain, and is used for VS/NAT. - * It detects packets for VS/NAT connections and sends the packets - * immediately. This can avoid that iptable_nat mangles the packets - * for VS/NAT. - */ -static unsigned int ip_vs_post_routing(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - if (!skb->ipvs_property) - return NF_ACCEPT; - /* The packet was sent from IPVS, exit this chain */ - return NF_STOP; -} - -__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) -{ - return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); -} - -static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) -{ - int err = ip_defrag(skb, user); - - if (!err) - ip_send_check(ip_hdr(skb)); - - return err; -} - -#ifdef CONFIG_IP_VS_IPV6 -static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user) -{ - /* TODO IPv6: Find out what to do here for IPv6 */ - return 0; -} -#endif - -/* - * Packet has been made sufficiently writable in caller - * - inout: 1=in->out, 0=out->in - */ -void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, - struct ip_vs_conn *cp, int inout) -{ - struct iphdr *iph = ip_hdr(skb); - unsigned int icmp_offset = iph->ihl*4; - struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + - icmp_offset); - struct iphdr *ciph = (struct iphdr *)(icmph + 1); - - if (inout) { - iph->saddr = cp->vaddr.ip; - ip_send_check(iph); - ciph->daddr = cp->vaddr.ip; - ip_send_check(ciph); - } else { - iph->daddr = cp->daddr.ip; - ip_send_check(iph); - ciph->saddr = cp->daddr.ip; - ip_send_check(ciph); - } - - /* the TCP/UDP port */ - if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) { - __be16 *ports = (void *)ciph + ciph->ihl*4; - - if (inout) - ports[1] = cp->vport; - else - ports[0] = cp->dport; - } - - /* And finally the ICMP checksum */ - icmph->checksum = 0; - icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); - skb->ip_summed = CHECKSUM_UNNECESSARY; - - if (inout) - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered outgoing ICMP"); - else - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered incoming ICMP"); -} - -#ifdef CONFIG_IP_VS_IPV6 -void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, - struct ip_vs_conn *cp, int inout) -{ - struct ipv6hdr *iph = ipv6_hdr(skb); - unsigned int icmp_offset = sizeof(struct ipv6hdr); - struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) + - icmp_offset); - struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1); - - if (inout) { - iph->saddr = cp->vaddr.in6; - ciph->daddr = cp->vaddr.in6; - } else { - iph->daddr = cp->daddr.in6; - ciph->saddr = cp->daddr.in6; - } - - /* the TCP/UDP port */ - if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) { - __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr); - - if (inout) - ports[1] = cp->vport; - else - ports[0] = cp->dport; - } - - /* And finally the ICMP checksum */ - icmph->icmp6_cksum = 0; - /* TODO IPv6: is this correct for ICMPv6? */ - ip_vs_checksum_complete(skb, icmp_offset); - skb->ip_summed = CHECKSUM_UNNECESSARY; - - if (inout) - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered outgoing ICMPv6"); - else - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered incoming ICMPv6"); -} -#endif - -/* Handle relevant response ICMP messages - forward to the right - * destination host. Used for NAT and local client. - */ -static int handle_response_icmp(int af, struct sk_buff *skb, - union nf_inet_addr *snet, - __u8 protocol, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, - unsigned int offset, unsigned int ihl) -{ - unsigned int verdict = NF_DROP; - - if (IP_VS_FWD_METHOD(cp) != 0) { - IP_VS_ERR("shouldn't reach here, because the box is on the " - "half connection in the tun/dr module.\n"); - } - - /* Ensure the checksum is correct */ - if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { - /* Failed checksum! */ - IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", - IP_VS_DBG_ADDR(af, snet)); - goto out; - } - - if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol) - offset += 2 * sizeof(__u16); - if (!skb_make_writable(skb, offset)) - goto out; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - ip_vs_nat_icmp_v6(skb, pp, cp, 1); - else -#endif - ip_vs_nat_icmp(skb, pp, cp, 1); - - /* do the statistics and put it back */ - ip_vs_out_stats(cp, skb); - - skb->ipvs_property = 1; - verdict = NF_ACCEPT; - -out: - __ip_vs_conn_put(cp); - - return verdict; -} - -/* - * Handle ICMP messages in the inside-to-outside direction (outgoing). - * Find any that might be relevant, check against existing connections. - * Currently handles error types - unreachable, quench, ttl exceeded. - */ -static int ip_vs_out_icmp(struct sk_buff *skb, int *related) -{ - struct iphdr *iph; - struct icmphdr _icmph, *ic; - struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ - struct ip_vs_iphdr ciph; - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - unsigned int offset, ihl; - union nf_inet_addr snet; - - *related = 1; - - /* reassemble IP fragments */ - if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) - return NF_STOLEN; - } - - iph = ip_hdr(skb); - offset = ihl = iph->ihl * 4; - ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); - if (ic == NULL) - return NF_DROP; - - IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", - ic->type, ntohs(icmp_id(ic)), - NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); - - /* - * Work through seeing if this is for us. - * These checks are supposed to be in an order that means easy - * things are checked first to speed up processing.... however - * this means that some packets will manage to get a long way - * down this stack and then be rejected, but that's life. - */ - if ((ic->type != ICMP_DEST_UNREACH) && - (ic->type != ICMP_SOURCE_QUENCH) && - (ic->type != ICMP_TIME_EXCEEDED)) { - *related = 0; - return NF_ACCEPT; - } - - /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) - return NF_ACCEPT; /* The packet looks wrong, ignore */ - - pp = ip_vs_proto_get(cih->protocol); - if (!pp) - return NF_ACCEPT; - - /* Is the embedded protocol header present? */ - if (unlikely(cih->frag_off & htons(IP_OFFSET) && - pp->dont_defrag)) - return NF_ACCEPT; - - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for"); - - offset += cih->ihl * 4; - - ip_vs_fill_iphdr(AF_INET, cih, &ciph); - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); - if (!cp) - return NF_ACCEPT; - - snet.ip = iph->saddr; - return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, - pp, offset, ihl); -} - -#ifdef CONFIG_IP_VS_IPV6 -static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) -{ - struct ipv6hdr *iph; - struct icmp6hdr _icmph, *ic; - struct ipv6hdr _ciph, *cih; /* The ip header contained - within the ICMP */ - struct ip_vs_iphdr ciph; - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - unsigned int offset; - union nf_inet_addr snet; - - *related = 1; - - /* reassemble IP fragments */ - if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { - if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT)) - return NF_STOLEN; - } - - iph = ipv6_hdr(skb); - offset = sizeof(struct ipv6hdr); - ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); - if (ic == NULL) - return NF_DROP; - - IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n", - ic->icmp6_type, ntohs(icmpv6_id(ic)), - NIP6(iph->saddr), NIP6(iph->daddr)); - - /* - * Work through seeing if this is for us. - * These checks are supposed to be in an order that means easy - * things are checked first to speed up processing.... however - * this means that some packets will manage to get a long way - * down this stack and then be rejected, but that's life. - */ - if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && - (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && - (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { - *related = 0; - return NF_ACCEPT; - } - - /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) - return NF_ACCEPT; /* The packet looks wrong, ignore */ - - pp = ip_vs_proto_get(cih->nexthdr); - if (!pp) - return NF_ACCEPT; - - /* Is the embedded protocol header present? */ - /* TODO: we don't support fragmentation at the moment anyways */ - if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) - return NF_ACCEPT; - - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for"); - - offset += sizeof(struct ipv6hdr); - - ip_vs_fill_iphdr(AF_INET6, cih, &ciph); - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); - if (!cp) - return NF_ACCEPT; - - ipv6_addr_copy(&snet.in6, &iph->saddr); - return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp, - pp, offset, sizeof(struct ipv6hdr)); -} -#endif - -static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) -{ - struct tcphdr _tcph, *th; - - th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph); - if (th == NULL) - return 0; - return th->rst; -} - -/* Handle response packets: rewrite addresses and send away... - * Used for NAT and local client. - */ -static unsigned int -handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, - struct ip_vs_conn *cp, int ihl) -{ - IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); - - if (!skb_make_writable(skb, ihl)) - goto drop; - - /* mangle the packet */ - if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) - goto drop; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - ipv6_hdr(skb)->saddr = cp->vaddr.in6; - else -#endif - { - ip_hdr(skb)->saddr = cp->vaddr.ip; - ip_send_check(ip_hdr(skb)); - } - - /* For policy routing, packets originating from this - * machine itself may be routed differently to packets - * passing through. We want this packet to be routed as - * if it came from this machine itself. So re-compute - * the routing information. - */ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (ip6_route_me_harder(skb) != 0) - goto drop; - } else -#endif - if (ip_route_me_harder(skb, RTN_LOCAL) != 0) - goto drop; - - IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); - - ip_vs_out_stats(cp, skb); - ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); - ip_vs_conn_put(cp); - - skb->ipvs_property = 1; - - LeaveFunction(11); - return NF_ACCEPT; - -drop: - ip_vs_conn_put(cp); - kfree_skb(skb); - return NF_STOLEN; -} - -/* - * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. - * Check if outgoing packet belongs to the established ip_vs_conn. - */ -static unsigned int -ip_vs_out(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct ip_vs_iphdr iph; - struct ip_vs_protocol *pp; - struct ip_vs_conn *cp; - int af; - - EnterFunction(11); - - af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; - - if (skb->ipvs_property) - return NF_ACCEPT; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { - int related, verdict = ip_vs_out_icmp_v6(skb, &related); - - if (related) - return verdict; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - } - } else -#endif - if (unlikely(iph.protocol == IPPROTO_ICMP)) { - int related, verdict = ip_vs_out_icmp(skb, &related); - - if (related) - return verdict; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - } - - pp = ip_vs_proto_get(iph.protocol); - if (unlikely(!pp)) - return NF_ACCEPT; - - /* reassemble IP fragments */ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { - int related, verdict = ip_vs_out_icmp_v6(skb, &related); - - if (related) - return verdict; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - } - } else -#endif - if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && - !pp->dont_defrag)) { - if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) - return NF_STOLEN; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - } - - /* - * Check if the packet belongs to an existing entry - */ - cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); - - if (unlikely(!cp)) { - if (sysctl_ip_vs_nat_icmp_send && - (pp->protocol == IPPROTO_TCP || - pp->protocol == IPPROTO_UDP)) { - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, iph.len, - sizeof(_ports), _ports); - if (pptr == NULL) - return NF_ACCEPT; /* Not for me */ - if (ip_vs_lookup_real_service(af, iph.protocol, - &iph.saddr, - pptr[0])) { - /* - * Notify the real server: there is no - * existing entry if it is not RST - * packet or not TCP packet. - */ - if (iph.protocol != IPPROTO_TCP - || !is_tcp_reset(skb, iph.len)) { -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - icmpv6_send(skb, - ICMPV6_DEST_UNREACH, - ICMPV6_PORT_UNREACH, - 0, skb->dev); - else -#endif - icmp_send(skb, - ICMP_DEST_UNREACH, - ICMP_PORT_UNREACH, 0); - return NF_DROP; - } - } - } - IP_VS_DBG_PKT(12, pp, skb, 0, - "packet continues traversal as normal"); - return NF_ACCEPT; - } - - return handle_response(af, skb, pp, cp, iph.len); -} - - -/* - * Handle ICMP messages in the outside-to-inside direction (incoming). - * Find any that might be relevant, check against existing connections, - * forward to the right destination host if relevant. - * Currently handles error types - unreachable, quench, ttl exceeded. - */ -static int -ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) -{ - struct iphdr *iph; - struct icmphdr _icmph, *ic; - struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ - struct ip_vs_iphdr ciph; - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - unsigned int offset, ihl, verdict; - union nf_inet_addr snet; - - *related = 1; - - /* reassemble IP fragments */ - if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ? - IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD)) - return NF_STOLEN; - } - - iph = ip_hdr(skb); - offset = ihl = iph->ihl * 4; - ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); - if (ic == NULL) - return NF_DROP; - - IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", - ic->type, ntohs(icmp_id(ic)), - NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); - - /* - * Work through seeing if this is for us. - * These checks are supposed to be in an order that means easy - * things are checked first to speed up processing.... however - * this means that some packets will manage to get a long way - * down this stack and then be rejected, but that's life. - */ - if ((ic->type != ICMP_DEST_UNREACH) && - (ic->type != ICMP_SOURCE_QUENCH) && - (ic->type != ICMP_TIME_EXCEEDED)) { - *related = 0; - return NF_ACCEPT; - } - - /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) - return NF_ACCEPT; /* The packet looks wrong, ignore */ - - pp = ip_vs_proto_get(cih->protocol); - if (!pp) - return NF_ACCEPT; - - /* Is the embedded protocol header present? */ - if (unlikely(cih->frag_off & htons(IP_OFFSET) && - pp->dont_defrag)) - return NF_ACCEPT; - - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for"); - - offset += cih->ihl * 4; - - ip_vs_fill_iphdr(AF_INET, cih, &ciph); - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1); - if (!cp) { - /* The packet could also belong to a local client */ - cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); - if (cp) { - snet.ip = iph->saddr; - return handle_response_icmp(AF_INET, skb, &snet, - cih->protocol, cp, pp, - offset, ihl); - } - return NF_ACCEPT; - } - - verdict = NF_DROP; - - /* Ensure the checksum is correct */ - if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { - /* Failed checksum! */ - IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n", - NIPQUAD(iph->saddr)); - goto out; - } - - /* do the statistics and put it back */ - ip_vs_in_stats(cp, skb); - if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) - offset += 2 * sizeof(__u16); - verdict = ip_vs_icmp_xmit(skb, cp, pp, offset); - /* do not touch skb anymore */ - - out: - __ip_vs_conn_put(cp); - - return verdict; -} - -#ifdef CONFIG_IP_VS_IPV6 -static int -ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) -{ - struct ipv6hdr *iph; - struct icmp6hdr _icmph, *ic; - struct ipv6hdr _ciph, *cih; /* The ip header contained - within the ICMP */ - struct ip_vs_iphdr ciph; - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - unsigned int offset, verdict; - union nf_inet_addr snet; - - *related = 1; - - /* reassemble IP fragments */ - if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { - if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ? - IP_DEFRAG_VS_IN : - IP_DEFRAG_VS_FWD)) - return NF_STOLEN; - } - - iph = ipv6_hdr(skb); - offset = sizeof(struct ipv6hdr); - ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); - if (ic == NULL) - return NF_DROP; - - IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n", - ic->icmp6_type, ntohs(icmpv6_id(ic)), - NIP6(iph->saddr), NIP6(iph->daddr)); - - /* - * Work through seeing if this is for us. - * These checks are supposed to be in an order that means easy - * things are checked first to speed up processing.... however - * this means that some packets will manage to get a long way - * down this stack and then be rejected, but that's life. - */ - if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && - (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && - (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { - *related = 0; - return NF_ACCEPT; - } - - /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) - return NF_ACCEPT; /* The packet looks wrong, ignore */ - - pp = ip_vs_proto_get(cih->nexthdr); - if (!pp) - return NF_ACCEPT; - - /* Is the embedded protocol header present? */ - /* TODO: we don't support fragmentation at the moment anyways */ - if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) - return NF_ACCEPT; - - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for"); - - offset += sizeof(struct ipv6hdr); - - ip_vs_fill_iphdr(AF_INET6, cih, &ciph); - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1); - if (!cp) { - /* The packet could also belong to a local client */ - cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); - if (cp) { - ipv6_addr_copy(&snet.in6, &iph->saddr); - return handle_response_icmp(AF_INET6, skb, &snet, - cih->nexthdr, - cp, pp, offset, - sizeof(struct ipv6hdr)); - } - return NF_ACCEPT; - } - - verdict = NF_DROP; - - /* do the statistics and put it back */ - ip_vs_in_stats(cp, skb); - if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr) - offset += 2 * sizeof(__u16); - verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset); - /* do not touch skb anymore */ - - __ip_vs_conn_put(cp); - - return verdict; -} -#endif - - -/* - * Check if it's for virtual services, look it up, - * and send it on its way... - */ -static unsigned int -ip_vs_in(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct ip_vs_iphdr iph; - struct ip_vs_protocol *pp; - struct ip_vs_conn *cp; - int ret, restart, af; - - af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - - /* - * Big tappo: only PACKET_HOST, including loopback for local client - * Don't handle local packets on IPv6 for now - */ - if (unlikely(skb->pkt_type != PACKET_HOST)) { - IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n", - skb->pkt_type, - iph.protocol, - IP_VS_DBG_ADDR(af, &iph.daddr)); - return NF_ACCEPT; - } - - if (unlikely(iph.protocol == IPPROTO_ICMP)) { - int related, verdict = ip_vs_in_icmp(skb, &related, hooknum); - - if (related) - return verdict; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - } - - /* Protocol supported? */ - pp = ip_vs_proto_get(iph.protocol); - if (unlikely(!pp)) - return NF_ACCEPT; - - /* - * Check if the packet belongs to an existing connection entry - */ - cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0); - - if (unlikely(!cp)) { - int v; - - /* For local client packets, it could be a response */ - cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); - if (cp) - return handle_response(af, skb, pp, cp, iph.len); - - if (!pp->conn_schedule(af, skb, pp, &v, &cp)) - return v; - } - - if (unlikely(!cp)) { - /* sorry, all this trouble for a no-hit :) */ - IP_VS_DBG_PKT(12, pp, skb, 0, - "packet continues traversal as normal"); - return NF_ACCEPT; - } - - IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet"); - - /* Check the server status */ - if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { - /* the destination server is not available */ - - if (sysctl_ip_vs_expire_nodest_conn) { - /* try to expire the connection immediately */ - ip_vs_conn_expire_now(cp); - } - /* don't restart its timer, and silently - drop the packet. */ - __ip_vs_conn_put(cp); - return NF_DROP; - } - - ip_vs_in_stats(cp, skb); - restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); - if (cp->packet_xmit) - ret = cp->packet_xmit(skb, cp, pp); - /* do not touch skb anymore */ - else { - IP_VS_DBG_RL("warning: packet_xmit is null"); - ret = NF_ACCEPT; - } - - /* Increase its packet counter and check if it is needed - * to be synchronized - * - * Sync connection if it is about to close to - * encorage the standby servers to update the connections timeout - */ - atomic_inc(&cp->in_pkts); - if (af == AF_INET && - (ip_vs_sync_state & IP_VS_STATE_MASTER) && - (((cp->protocol != IPPROTO_TCP || - cp->state == IP_VS_TCP_S_ESTABLISHED) && - (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] - == sysctl_ip_vs_sync_threshold[0])) || - ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && - ((cp->state == IP_VS_TCP_S_FIN_WAIT) || - (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || - (cp->state == IP_VS_TCP_S_TIME_WAIT))))) - ip_vs_sync_conn(cp); - cp->old_state = cp->state; - - ip_vs_conn_put(cp); - return ret; -} - - -/* - * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP - * related packets destined for 0.0.0.0/0. - * When fwmark-based virtual service is used, such as transparent - * cache cluster, TCP packets can be marked and routed to ip_vs_in, - * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and - * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain - * and send them to ip_vs_in_icmp. - */ -static unsigned int -ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - int r; - - if (ip_hdr(skb)->protocol != IPPROTO_ICMP) - return NF_ACCEPT; - - return ip_vs_in_icmp(skb, &r, hooknum); -} - -#ifdef CONFIG_IP_VS_IPV6 -static unsigned int -ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - int r; - - if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) - return NF_ACCEPT; - - return ip_vs_in_icmp_v6(skb, &r, hooknum); -} -#endif - - -static struct nf_hook_ops ip_vs_ops[] __read_mostly = { - /* After packet filtering, forward packet through VS/DR, VS/TUN, - * or VS/NAT(change destination), so that filtering rules can be - * applied to IPVS. */ - { - .hook = ip_vs_in, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_LOCAL_IN, - .priority = 100, - }, - /* After packet filtering, change source only for VS/NAT */ - { - .hook = ip_vs_out, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = 100, - }, - /* After packet filtering (but before ip_vs_out_icmp), catch icmp - * destined for 0.0.0.0/0, which is for incoming IPVS connections */ - { - .hook = ip_vs_forward_icmp, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = 99, - }, - /* Before the netfilter connection tracking, exit from POST_ROUTING */ - { - .hook = ip_vs_post_routing, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_NAT_SRC-1, - }, -#ifdef CONFIG_IP_VS_IPV6 - /* After packet filtering, forward packet through VS/DR, VS/TUN, - * or VS/NAT(change destination), so that filtering rules can be - * applied to IPVS. */ - { - .hook = ip_vs_in, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_LOCAL_IN, - .priority = 100, - }, - /* After packet filtering, change source only for VS/NAT */ - { - .hook = ip_vs_out, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_FORWARD, - .priority = 100, - }, - /* After packet filtering (but before ip_vs_out_icmp), catch icmp - * destined for 0.0.0.0/0, which is for incoming IPVS connections */ - { - .hook = ip_vs_forward_icmp_v6, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_FORWARD, - .priority = 99, - }, - /* Before the netfilter connection tracking, exit from POST_ROUTING */ - { - .hook = ip_vs_post_routing, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP6_PRI_NAT_SRC-1, - }, -#endif -}; - - -/* - * Initialize IP Virtual Server - */ -static int __init ip_vs_init(void) -{ - int ret; - - ip_vs_estimator_init(); - - ret = ip_vs_control_init(); - if (ret < 0) { - IP_VS_ERR("can't setup control.\n"); - goto cleanup_estimator; - } - - ip_vs_protocol_init(); - - ret = ip_vs_app_init(); - if (ret < 0) { - IP_VS_ERR("can't setup application helper.\n"); - goto cleanup_protocol; - } - - ret = ip_vs_conn_init(); - if (ret < 0) { - IP_VS_ERR("can't setup connection table.\n"); - goto cleanup_app; - } - - ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - if (ret < 0) { - IP_VS_ERR("can't register hooks.\n"); - goto cleanup_conn; - } - - IP_VS_INFO("ipvs loaded.\n"); - return ret; - - cleanup_conn: - ip_vs_conn_cleanup(); - cleanup_app: - ip_vs_app_cleanup(); - cleanup_protocol: - ip_vs_protocol_cleanup(); - ip_vs_control_cleanup(); - cleanup_estimator: - ip_vs_estimator_cleanup(); - return ret; -} - -static void __exit ip_vs_cleanup(void) -{ - nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - ip_vs_conn_cleanup(); - ip_vs_app_cleanup(); - ip_vs_protocol_cleanup(); - ip_vs_control_cleanup(); - ip_vs_estimator_cleanup(); - IP_VS_INFO("ipvs unloaded.\n"); -} - -module_init(ip_vs_init); -module_exit(ip_vs_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c deleted file mode 100644 index 0302cf3..0000000 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ /dev/null @@ -1,3443 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the NetFilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Authors: Wensong Zhang - * Peter Kese - * Julian Anastasov - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#ifdef CONFIG_IP_VS_IPV6 -#include -#include -#endif -#include -#include -#include - -#include - -#include - -/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ -static DEFINE_MUTEX(__ip_vs_mutex); - -/* lock for service table */ -static DEFINE_RWLOCK(__ip_vs_svc_lock); - -/* lock for table with the real services */ -static DEFINE_RWLOCK(__ip_vs_rs_lock); - -/* lock for state and timeout tables */ -static DEFINE_RWLOCK(__ip_vs_securetcp_lock); - -/* lock for drop entry handling */ -static DEFINE_SPINLOCK(__ip_vs_dropentry_lock); - -/* lock for drop packet handling */ -static DEFINE_SPINLOCK(__ip_vs_droppacket_lock); - -/* 1/rate drop and drop-entry variables */ -int ip_vs_drop_rate = 0; -int ip_vs_drop_counter = 0; -static atomic_t ip_vs_dropentry = ATOMIC_INIT(0); - -/* number of virtual services */ -static int ip_vs_num_services = 0; - -/* sysctl variables */ -static int sysctl_ip_vs_drop_entry = 0; -static int sysctl_ip_vs_drop_packet = 0; -static int sysctl_ip_vs_secure_tcp = 0; -static int sysctl_ip_vs_amemthresh = 1024; -static int sysctl_ip_vs_am_droprate = 10; -int sysctl_ip_vs_cache_bypass = 0; -int sysctl_ip_vs_expire_nodest_conn = 0; -int sysctl_ip_vs_expire_quiescent_template = 0; -int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; -int sysctl_ip_vs_nat_icmp_send = 0; - - -#ifdef CONFIG_IP_VS_DEBUG -static int sysctl_ip_vs_debug_level = 0; - -int ip_vs_get_debug_level(void) -{ - return sysctl_ip_vs_debug_level; -} -#endif - -#ifdef CONFIG_IP_VS_IPV6 -/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ -static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr) -{ - struct rt6_info *rt; - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = *addr, - .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, - }; - - rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); - if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK)) - return 1; - - return 0; -} -#endif -/* - * update_defense_level is called from keventd and from sysctl, - * so it needs to protect itself from softirqs - */ -static void update_defense_level(void) -{ - struct sysinfo i; - static int old_secure_tcp = 0; - int availmem; - int nomem; - int to_change = -1; - - /* we only count free and buffered memory (in pages) */ - si_meminfo(&i); - availmem = i.freeram + i.bufferram; - /* however in linux 2.5 the i.bufferram is total page cache size, - we need adjust it */ - /* si_swapinfo(&i); */ - /* availmem = availmem - (i.totalswap - i.freeswap); */ - - nomem = (availmem < sysctl_ip_vs_amemthresh); - - local_bh_disable(); - - /* drop_entry */ - spin_lock(&__ip_vs_dropentry_lock); - switch (sysctl_ip_vs_drop_entry) { - case 0: - atomic_set(&ip_vs_dropentry, 0); - break; - case 1: - if (nomem) { - atomic_set(&ip_vs_dropentry, 1); - sysctl_ip_vs_drop_entry = 2; - } else { - atomic_set(&ip_vs_dropentry, 0); - } - break; - case 2: - if (nomem) { - atomic_set(&ip_vs_dropentry, 1); - } else { - atomic_set(&ip_vs_dropentry, 0); - sysctl_ip_vs_drop_entry = 1; - }; - break; - case 3: - atomic_set(&ip_vs_dropentry, 1); - break; - } - spin_unlock(&__ip_vs_dropentry_lock); - - /* drop_packet */ - spin_lock(&__ip_vs_droppacket_lock); - switch (sysctl_ip_vs_drop_packet) { - case 0: - ip_vs_drop_rate = 0; - break; - case 1: - if (nomem) { - ip_vs_drop_rate = ip_vs_drop_counter - = sysctl_ip_vs_amemthresh / - (sysctl_ip_vs_amemthresh-availmem); - sysctl_ip_vs_drop_packet = 2; - } else { - ip_vs_drop_rate = 0; - } - break; - case 2: - if (nomem) { - ip_vs_drop_rate = ip_vs_drop_counter - = sysctl_ip_vs_amemthresh / - (sysctl_ip_vs_amemthresh-availmem); - } else { - ip_vs_drop_rate = 0; - sysctl_ip_vs_drop_packet = 1; - } - break; - case 3: - ip_vs_drop_rate = sysctl_ip_vs_am_droprate; - break; - } - spin_unlock(&__ip_vs_droppacket_lock); - - /* secure_tcp */ - write_lock(&__ip_vs_securetcp_lock); - switch (sysctl_ip_vs_secure_tcp) { - case 0: - if (old_secure_tcp >= 2) - to_change = 0; - break; - case 1: - if (nomem) { - if (old_secure_tcp < 2) - to_change = 1; - sysctl_ip_vs_secure_tcp = 2; - } else { - if (old_secure_tcp >= 2) - to_change = 0; - } - break; - case 2: - if (nomem) { - if (old_secure_tcp < 2) - to_change = 1; - } else { - if (old_secure_tcp >= 2) - to_change = 0; - sysctl_ip_vs_secure_tcp = 1; - } - break; - case 3: - if (old_secure_tcp < 2) - to_change = 1; - break; - } - old_secure_tcp = sysctl_ip_vs_secure_tcp; - if (to_change >= 0) - ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); - write_unlock(&__ip_vs_securetcp_lock); - - local_bh_enable(); -} - - -/* - * Timer for checking the defense - */ -#define DEFENSE_TIMER_PERIOD 1*HZ -static void defense_work_handler(struct work_struct *work); -static DECLARE_DELAYED_WORK(defense_work, defense_work_handler); - -static void defense_work_handler(struct work_struct *work) -{ - update_defense_level(); - if (atomic_read(&ip_vs_dropentry)) - ip_vs_random_dropentry(); - - schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); -} - -int -ip_vs_use_count_inc(void) -{ - return try_module_get(THIS_MODULE); -} - -void -ip_vs_use_count_dec(void) -{ - module_put(THIS_MODULE); -} - - -/* - * Hash table: for virtual service lookups - */ -#define IP_VS_SVC_TAB_BITS 8 -#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) -#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) - -/* the service table hashed by */ -static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; -/* the service table hashed by fwmark */ -static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; - -/* - * Hash table: for real service lookups - */ -#define IP_VS_RTAB_BITS 4 -#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) -#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) - -static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE]; - -/* - * Trash for destinations - */ -static LIST_HEAD(ip_vs_dest_trash); - -/* - * FTP & NULL virtual service counters - */ -static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0); -static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); - - -/* - * Returns hash value for virtual service - */ -static __inline__ unsigned -ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr, - __be16 port) -{ - register unsigned porth = ntohs(port); - __be32 addr_fold = addr->ip; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - addr_fold = addr->ip6[0]^addr->ip6[1]^ - addr->ip6[2]^addr->ip6[3]; -#endif - - return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth) - & IP_VS_SVC_TAB_MASK; -} - -/* - * Returns hash value of fwmark for virtual service lookup - */ -static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) -{ - return fwmark & IP_VS_SVC_TAB_MASK; -} - -/* - * Hashes a service in the ip_vs_svc_table by - * or in the ip_vs_svc_fwm_table by fwmark. - * Should be called with locked tables. - */ -static int ip_vs_svc_hash(struct ip_vs_service *svc) -{ - unsigned hash; - - if (svc->flags & IP_VS_SVC_F_HASHED) { - IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - if (svc->fwmark == 0) { - /* - * Hash it by in ip_vs_svc_table - */ - hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr, - svc->port); - list_add(&svc->s_list, &ip_vs_svc_table[hash]); - } else { - /* - * Hash it by fwmark in ip_vs_svc_fwm_table - */ - hash = ip_vs_svc_fwm_hashkey(svc->fwmark); - list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); - } - - svc->flags |= IP_VS_SVC_F_HASHED; - /* increase its refcnt because it is referenced by the svc table */ - atomic_inc(&svc->refcnt); - return 1; -} - - -/* - * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table. - * Should be called with locked tables. - */ -static int ip_vs_svc_unhash(struct ip_vs_service *svc) -{ - if (!(svc->flags & IP_VS_SVC_F_HASHED)) { - IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - if (svc->fwmark == 0) { - /* Remove it from the ip_vs_svc_table table */ - list_del(&svc->s_list); - } else { - /* Remove it from the ip_vs_svc_fwm_table table */ - list_del(&svc->f_list); - } - - svc->flags &= ~IP_VS_SVC_F_HASHED; - atomic_dec(&svc->refcnt); - return 1; -} - - -/* - * Get service by {proto,addr,port} in the service table. - */ -static inline struct ip_vs_service * -__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr, - __be16 vport) -{ - unsigned hash; - struct ip_vs_service *svc; - - /* Check for "full" addressed entries */ - hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport); - - list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ - if ((svc->af == af) - && ip_vs_addr_equal(af, &svc->addr, vaddr) - && (svc->port == vport) - && (svc->protocol == protocol)) { - /* HIT */ - atomic_inc(&svc->usecnt); - return svc; - } - } - - return NULL; -} - - -/* - * Get service by {fwmark} in the service table. - */ -static inline struct ip_vs_service * -__ip_vs_svc_fwm_get(int af, __u32 fwmark) -{ - unsigned hash; - struct ip_vs_service *svc; - - /* Check for fwmark addressed entries */ - hash = ip_vs_svc_fwm_hashkey(fwmark); - - list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { - if (svc->fwmark == fwmark && svc->af == af) { - /* HIT */ - atomic_inc(&svc->usecnt); - return svc; - } - } - - return NULL; -} - -struct ip_vs_service * -ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, - const union nf_inet_addr *vaddr, __be16 vport) -{ - struct ip_vs_service *svc; - - read_lock(&__ip_vs_svc_lock); - - /* - * Check the table hashed by fwmark first - */ - if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark))) - goto out; - - /* - * Check the table hashed by - * for "full" addressed entries - */ - svc = __ip_vs_service_get(af, protocol, vaddr, vport); - - if (svc == NULL - && protocol == IPPROTO_TCP - && atomic_read(&ip_vs_ftpsvc_counter) - && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { - /* - * Check if ftp service entry exists, the packet - * might belong to FTP data connections. - */ - svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT); - } - - if (svc == NULL - && atomic_read(&ip_vs_nullsvc_counter)) { - /* - * Check if the catch-all port (port zero) exists - */ - svc = __ip_vs_service_get(af, protocol, vaddr, 0); - } - - out: - read_unlock(&__ip_vs_svc_lock); - - IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", - fwmark, ip_vs_proto_name(protocol), - IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), - svc ? "hit" : "not hit"); - - return svc; -} - - -static inline void -__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) -{ - atomic_inc(&svc->refcnt); - dest->svc = svc; -} - -static inline void -__ip_vs_unbind_svc(struct ip_vs_dest *dest) -{ - struct ip_vs_service *svc = dest->svc; - - dest->svc = NULL; - if (atomic_dec_and_test(&svc->refcnt)) - kfree(svc); -} - - -/* - * Returns hash value for real service - */ -static inline unsigned ip_vs_rs_hashkey(int af, - const union nf_inet_addr *addr, - __be16 port) -{ - register unsigned porth = ntohs(port); - __be32 addr_fold = addr->ip; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - addr_fold = addr->ip6[0]^addr->ip6[1]^ - addr->ip6[2]^addr->ip6[3]; -#endif - - return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth) - & IP_VS_RTAB_MASK; -} - -/* - * Hashes ip_vs_dest in ip_vs_rtable by . - * should be called with locked tables. - */ -static int ip_vs_rs_hash(struct ip_vs_dest *dest) -{ - unsigned hash; - - if (!list_empty(&dest->d_list)) { - return 0; - } - - /* - * Hash by proto,addr,port, - * which are the parameters of the real service. - */ - hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); - - list_add(&dest->d_list, &ip_vs_rtable[hash]); - - return 1; -} - -/* - * UNhashes ip_vs_dest from ip_vs_rtable. - * should be called with locked tables. - */ -static int ip_vs_rs_unhash(struct ip_vs_dest *dest) -{ - /* - * Remove it from the ip_vs_rtable table. - */ - if (!list_empty(&dest->d_list)) { - list_del(&dest->d_list); - INIT_LIST_HEAD(&dest->d_list); - } - - return 1; -} - -/* - * Lookup real service by in the real service table. - */ -struct ip_vs_dest * -ip_vs_lookup_real_service(int af, __u16 protocol, - const union nf_inet_addr *daddr, - __be16 dport) -{ - unsigned hash; - struct ip_vs_dest *dest; - - /* - * Check for "full" addressed entries - * Return the first found entry - */ - hash = ip_vs_rs_hashkey(af, daddr, dport); - - read_lock(&__ip_vs_rs_lock); - list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { - if ((dest->af == af) - && ip_vs_addr_equal(af, &dest->addr, daddr) - && (dest->port == dport) - && ((dest->protocol == protocol) || - dest->vfwmark)) { - /* HIT */ - read_unlock(&__ip_vs_rs_lock); - return dest; - } - } - read_unlock(&__ip_vs_rs_lock); - - return NULL; -} - -/* - * Lookup destination by {addr,port} in the given service - */ -static struct ip_vs_dest * -ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, - __be16 dport) -{ - struct ip_vs_dest *dest; - - /* - * Find the destination for the given service - */ - list_for_each_entry(dest, &svc->destinations, n_list) { - if ((dest->af == svc->af) - && ip_vs_addr_equal(svc->af, &dest->addr, daddr) - && (dest->port == dport)) { - /* HIT */ - return dest; - } - } - - return NULL; -} - -/* - * Find destination by {daddr,dport,vaddr,protocol} - * Cretaed to be used in ip_vs_process_message() in - * the backup synchronization daemon. It finds the - * destination to be bound to the received connection - * on the backup. - * - * ip_vs_lookup_real_service() looked promissing, but - * seems not working as expected. - */ -struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr, - __be16 dport, - const union nf_inet_addr *vaddr, - __be16 vport, __u16 protocol) -{ - struct ip_vs_dest *dest; - struct ip_vs_service *svc; - - svc = ip_vs_service_get(af, 0, protocol, vaddr, vport); - if (!svc) - return NULL; - dest = ip_vs_lookup_dest(svc, daddr, dport); - if (dest) - atomic_inc(&dest->refcnt); - ip_vs_service_put(svc); - return dest; -} - -/* - * Lookup dest by {svc,addr,port} in the destination trash. - * The destination trash is used to hold the destinations that are removed - * from the service table but are still referenced by some conn entries. - * The reason to add the destination trash is when the dest is temporary - * down (either by administrator or by monitor program), the dest can be - * picked back from the trash, the remaining connections to the dest can - * continue, and the counting information of the dest is also useful for - * scheduling. - */ -static struct ip_vs_dest * -ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, - __be16 dport) -{ - struct ip_vs_dest *dest, *nxt; - - /* - * Find the destination in trash - */ - list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { - IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " - "dest->refcnt=%d\n", - dest->vfwmark, - IP_VS_DBG_ADDR(svc->af, &dest->addr), - ntohs(dest->port), - atomic_read(&dest->refcnt)); - if (dest->af == svc->af && - ip_vs_addr_equal(svc->af, &dest->addr, daddr) && - dest->port == dport && - dest->vfwmark == svc->fwmark && - dest->protocol == svc->protocol && - (svc->fwmark || - (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && - dest->vport == svc->port))) { - /* HIT */ - return dest; - } - - /* - * Try to purge the destination from trash if not referenced - */ - if (atomic_read(&dest->refcnt) == 1) { - IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u " - "from trash\n", - dest->vfwmark, - IP_VS_DBG_ADDR(svc->af, &dest->addr), - ntohs(dest->port)); - list_del(&dest->n_list); - ip_vs_dst_reset(dest); - __ip_vs_unbind_svc(dest); - kfree(dest); - } - } - - return NULL; -} - - -/* - * Clean up all the destinations in the trash - * Called by the ip_vs_control_cleanup() - * - * When the ip_vs_control_clearup is activated by ipvs module exit, - * the service tables must have been flushed and all the connections - * are expired, and the refcnt of each destination in the trash must - * be 1, so we simply release them here. - */ -static void ip_vs_trash_cleanup(void) -{ - struct ip_vs_dest *dest, *nxt; - - list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { - list_del(&dest->n_list); - ip_vs_dst_reset(dest); - __ip_vs_unbind_svc(dest); - kfree(dest); - } -} - - -static void -ip_vs_zero_stats(struct ip_vs_stats *stats) -{ - spin_lock_bh(&stats->lock); - - memset(&stats->ustats, 0, sizeof(stats->ustats)); - ip_vs_zero_estimator(stats); - - spin_unlock_bh(&stats->lock); -} - -/* - * Update a destination in the given service - */ -static void -__ip_vs_update_dest(struct ip_vs_service *svc, - struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest) -{ - int conn_flags; - - /* set the weight and the flags */ - atomic_set(&dest->weight, udest->weight); - conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; - - /* check if local node and update the flags */ -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) { - if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) { - conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) - | IP_VS_CONN_F_LOCALNODE; - } - } else -#endif - if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) { - conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) - | IP_VS_CONN_F_LOCALNODE; - } - - /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ - if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) { - conn_flags |= IP_VS_CONN_F_NOOUTPUT; - } else { - /* - * Put the real service in ip_vs_rtable if not present. - * For now only for NAT! - */ - write_lock_bh(&__ip_vs_rs_lock); - ip_vs_rs_hash(dest); - write_unlock_bh(&__ip_vs_rs_lock); - } - atomic_set(&dest->conn_flags, conn_flags); - - /* bind the service */ - if (!dest->svc) { - __ip_vs_bind_svc(dest, svc); - } else { - if (dest->svc != svc) { - __ip_vs_unbind_svc(dest); - ip_vs_zero_stats(&dest->stats); - __ip_vs_bind_svc(dest, svc); - } - } - - /* set the dest status flags */ - dest->flags |= IP_VS_DEST_F_AVAILABLE; - - if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) - dest->flags &= ~IP_VS_DEST_F_OVERLOAD; - dest->u_threshold = udest->u_threshold; - dest->l_threshold = udest->l_threshold; -} - - -/* - * Create a destination for the given service - */ -static int -ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, - struct ip_vs_dest **dest_p) -{ - struct ip_vs_dest *dest; - unsigned atype; - - EnterFunction(2); - -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) { - atype = ipv6_addr_type(&udest->addr.in6); - if ((!(atype & IPV6_ADDR_UNICAST) || - atype & IPV6_ADDR_LINKLOCAL) && - !__ip_vs_addr_is_local_v6(&udest->addr.in6)) - return -EINVAL; - } else -#endif - { - atype = inet_addr_type(&init_net, udest->addr.ip); - if (atype != RTN_LOCAL && atype != RTN_UNICAST) - return -EINVAL; - } - - dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); - if (dest == NULL) { - IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n"); - return -ENOMEM; - } - - dest->af = svc->af; - dest->protocol = svc->protocol; - dest->vaddr = svc->addr; - dest->vport = svc->port; - dest->vfwmark = svc->fwmark; - ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr); - dest->port = udest->port; - - atomic_set(&dest->activeconns, 0); - atomic_set(&dest->inactconns, 0); - atomic_set(&dest->persistconns, 0); - atomic_set(&dest->refcnt, 0); - - INIT_LIST_HEAD(&dest->d_list); - spin_lock_init(&dest->dst_lock); - spin_lock_init(&dest->stats.lock); - __ip_vs_update_dest(svc, dest, udest); - ip_vs_new_estimator(&dest->stats); - - *dest_p = dest; - - LeaveFunction(2); - return 0; -} - - -/* - * Add a destination into an existing service - */ -static int -ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) -{ - struct ip_vs_dest *dest; - union nf_inet_addr daddr; - __be16 dport = udest->port; - int ret; - - EnterFunction(2); - - if (udest->weight < 0) { - IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n"); - return -ERANGE; - } - - if (udest->l_threshold > udest->u_threshold) { - IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than " - "upper threshold\n"); - return -ERANGE; - } - - ip_vs_addr_copy(svc->af, &daddr, &udest->addr); - - /* - * Check if the dest already exists in the list - */ - dest = ip_vs_lookup_dest(svc, &daddr, dport); - - if (dest != NULL) { - IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); - return -EEXIST; - } - - /* - * Check if the dest already exists in the trash and - * is from the same service - */ - dest = ip_vs_trash_get_dest(svc, &daddr, dport); - - if (dest != NULL) { - IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " - "dest->refcnt=%d, service %u/%s:%u\n", - IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport), - atomic_read(&dest->refcnt), - dest->vfwmark, - IP_VS_DBG_ADDR(svc->af, &dest->vaddr), - ntohs(dest->vport)); - - __ip_vs_update_dest(svc, dest, udest); - - /* - * Get the destination from the trash - */ - list_del(&dest->n_list); - - ip_vs_new_estimator(&dest->stats); - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - list_add(&dest->n_list, &svc->destinations); - svc->num_dests++; - - /* call the update_service function of its scheduler */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - return 0; - } - - /* - * Allocate and initialize the dest structure - */ - ret = ip_vs_new_dest(svc, udest, &dest); - if (ret) { - return ret; - } - - /* - * Add the dest entry into the list - */ - atomic_inc(&dest->refcnt); - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - list_add(&dest->n_list, &svc->destinations); - svc->num_dests++; - - /* call the update_service function of its scheduler */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - - LeaveFunction(2); - - return 0; -} - - -/* - * Edit a destination in the given service - */ -static int -ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) -{ - struct ip_vs_dest *dest; - union nf_inet_addr daddr; - __be16 dport = udest->port; - - EnterFunction(2); - - if (udest->weight < 0) { - IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n"); - return -ERANGE; - } - - if (udest->l_threshold > udest->u_threshold) { - IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than " - "upper threshold\n"); - return -ERANGE; - } - - ip_vs_addr_copy(svc->af, &daddr, &udest->addr); - - /* - * Lookup the destination list - */ - dest = ip_vs_lookup_dest(svc, &daddr, dport); - - if (dest == NULL) { - IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); - return -ENOENT; - } - - __ip_vs_update_dest(svc, dest, udest); - - write_lock_bh(&__ip_vs_svc_lock); - - /* Wait until all other svc users go away */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - /* call the update_service, because server weight may be changed */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - - LeaveFunction(2); - - return 0; -} - - -/* - * Delete a destination (must be already unlinked from the service) - */ -static void __ip_vs_del_dest(struct ip_vs_dest *dest) -{ - ip_vs_kill_estimator(&dest->stats); - - /* - * Remove it from the d-linked list with the real services. - */ - write_lock_bh(&__ip_vs_rs_lock); - ip_vs_rs_unhash(dest); - write_unlock_bh(&__ip_vs_rs_lock); - - /* - * Decrease the refcnt of the dest, and free the dest - * if nobody refers to it (refcnt=0). Otherwise, throw - * the destination into the trash. - */ - if (atomic_dec_and_test(&dest->refcnt)) { - ip_vs_dst_reset(dest); - /* simply decrease svc->refcnt here, let the caller check - and release the service if nobody refers to it. - Only user context can release destination and service, - and only one user context can update virtual service at a - time, so the operation here is OK */ - atomic_dec(&dest->svc->refcnt); - kfree(dest); - } else { - IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, " - "dest->refcnt=%d\n", - IP_VS_DBG_ADDR(dest->af, &dest->addr), - ntohs(dest->port), - atomic_read(&dest->refcnt)); - list_add(&dest->n_list, &ip_vs_dest_trash); - atomic_inc(&dest->refcnt); - } -} - - -/* - * Unlink a destination from the given service - */ -static void __ip_vs_unlink_dest(struct ip_vs_service *svc, - struct ip_vs_dest *dest, - int svcupd) -{ - dest->flags &= ~IP_VS_DEST_F_AVAILABLE; - - /* - * Remove it from the d-linked destination list. - */ - list_del(&dest->n_list); - svc->num_dests--; - - /* - * Call the update_service function of its scheduler - */ - if (svcupd && svc->scheduler->update_service) - svc->scheduler->update_service(svc); -} - - -/* - * Delete a destination server in the given service - */ -static int -ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) -{ - struct ip_vs_dest *dest; - __be16 dport = udest->port; - - EnterFunction(2); - - dest = ip_vs_lookup_dest(svc, &udest->addr, dport); - - if (dest == NULL) { - IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); - return -ENOENT; - } - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - /* - * Unlink dest from the service - */ - __ip_vs_unlink_dest(svc, dest, 1); - - write_unlock_bh(&__ip_vs_svc_lock); - - /* - * Delete the destination - */ - __ip_vs_del_dest(dest); - - LeaveFunction(2); - - return 0; -} - - -/* - * Add a service into the service hash table - */ -static int -ip_vs_add_service(struct ip_vs_service_user_kern *u, - struct ip_vs_service **svc_p) -{ - int ret = 0; - struct ip_vs_scheduler *sched = NULL; - struct ip_vs_service *svc = NULL; - - /* increase the module use count */ - ip_vs_use_count_inc(); - - /* Lookup the scheduler by 'u->sched_name' */ - sched = ip_vs_scheduler_get(u->sched_name); - if (sched == NULL) { - IP_VS_INFO("Scheduler module ip_vs_%s not found\n", - u->sched_name); - ret = -ENOENT; - goto out_mod_dec; - } - -#ifdef CONFIG_IP_VS_IPV6 - if (u->af == AF_INET6) { - if (!sched->supports_ipv6) { - ret = -EAFNOSUPPORT; - goto out_err; - } - if ((u->netmask < 1) || (u->netmask > 128)) { - ret = -EINVAL; - goto out_err; - } - } -#endif - - svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); - if (svc == NULL) { - IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n"); - ret = -ENOMEM; - goto out_err; - } - - /* I'm the first user of the service */ - atomic_set(&svc->usecnt, 1); - atomic_set(&svc->refcnt, 0); - - svc->af = u->af; - svc->protocol = u->protocol; - ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); - svc->port = u->port; - svc->fwmark = u->fwmark; - svc->flags = u->flags; - svc->timeout = u->timeout * HZ; - svc->netmask = u->netmask; - - INIT_LIST_HEAD(&svc->destinations); - rwlock_init(&svc->sched_lock); - spin_lock_init(&svc->stats.lock); - - /* Bind the scheduler */ - ret = ip_vs_bind_scheduler(svc, sched); - if (ret) - goto out_err; - sched = NULL; - - /* Update the virtual service counters */ - if (svc->port == FTPPORT) - atomic_inc(&ip_vs_ftpsvc_counter); - else if (svc->port == 0) - atomic_inc(&ip_vs_nullsvc_counter); - - ip_vs_new_estimator(&svc->stats); - - /* Count only IPv4 services for old get/setsockopt interface */ - if (svc->af == AF_INET) - ip_vs_num_services++; - - /* Hash the service into the service table */ - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_hash(svc); - write_unlock_bh(&__ip_vs_svc_lock); - - *svc_p = svc; - return 0; - - out_err: - if (svc != NULL) { - if (svc->scheduler) - ip_vs_unbind_scheduler(svc); - if (svc->inc) { - local_bh_disable(); - ip_vs_app_inc_put(svc->inc); - local_bh_enable(); - } - kfree(svc); - } - ip_vs_scheduler_put(sched); - - out_mod_dec: - /* decrease the module use count */ - ip_vs_use_count_dec(); - - return ret; -} - - -/* - * Edit a service and bind it with a new scheduler - */ -static int -ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) -{ - struct ip_vs_scheduler *sched, *old_sched; - int ret = 0; - - /* - * Lookup the scheduler, by 'u->sched_name' - */ - sched = ip_vs_scheduler_get(u->sched_name); - if (sched == NULL) { - IP_VS_INFO("Scheduler module ip_vs_%s not found\n", - u->sched_name); - return -ENOENT; - } - old_sched = sched; - -#ifdef CONFIG_IP_VS_IPV6 - if (u->af == AF_INET6) { - if (!sched->supports_ipv6) { - ret = -EAFNOSUPPORT; - goto out; - } - if ((u->netmask < 1) || (u->netmask > 128)) { - ret = -EINVAL; - goto out; - } - } -#endif - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - /* - * Set the flags and timeout value - */ - svc->flags = u->flags | IP_VS_SVC_F_HASHED; - svc->timeout = u->timeout * HZ; - svc->netmask = u->netmask; - - old_sched = svc->scheduler; - if (sched != old_sched) { - /* - * Unbind the old scheduler - */ - if ((ret = ip_vs_unbind_scheduler(svc))) { - old_sched = sched; - goto out_unlock; - } - - /* - * Bind the new scheduler - */ - if ((ret = ip_vs_bind_scheduler(svc, sched))) { - /* - * If ip_vs_bind_scheduler fails, restore the old - * scheduler. - * The main reason of failure is out of memory. - * - * The question is if the old scheduler can be - * restored all the time. TODO: if it cannot be - * restored some time, we must delete the service, - * otherwise the system may crash. - */ - ip_vs_bind_scheduler(svc, old_sched); - old_sched = sched; - goto out_unlock; - } - } - - out_unlock: - write_unlock_bh(&__ip_vs_svc_lock); -#ifdef CONFIG_IP_VS_IPV6 - out: -#endif - - if (old_sched) - ip_vs_scheduler_put(old_sched); - - return ret; -} - - -/* - * Delete a service from the service list - * - The service must be unlinked, unlocked and not referenced! - * - We are called under _bh lock - */ -static void __ip_vs_del_service(struct ip_vs_service *svc) -{ - struct ip_vs_dest *dest, *nxt; - struct ip_vs_scheduler *old_sched; - - /* Count only IPv4 services for old get/setsockopt interface */ - if (svc->af == AF_INET) - ip_vs_num_services--; - - ip_vs_kill_estimator(&svc->stats); - - /* Unbind scheduler */ - old_sched = svc->scheduler; - ip_vs_unbind_scheduler(svc); - if (old_sched) - ip_vs_scheduler_put(old_sched); - - /* Unbind app inc */ - if (svc->inc) { - ip_vs_app_inc_put(svc->inc); - svc->inc = NULL; - } - - /* - * Unlink the whole destination list - */ - list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { - __ip_vs_unlink_dest(svc, dest, 0); - __ip_vs_del_dest(dest); - } - - /* - * Update the virtual service counters - */ - if (svc->port == FTPPORT) - atomic_dec(&ip_vs_ftpsvc_counter); - else if (svc->port == 0) - atomic_dec(&ip_vs_nullsvc_counter); - - /* - * Free the service if nobody refers to it - */ - if (atomic_read(&svc->refcnt) == 0) - kfree(svc); - - /* decrease the module use count */ - ip_vs_use_count_dec(); -} - -/* - * Delete a service from the service list - */ -static int ip_vs_del_service(struct ip_vs_service *svc) -{ - if (svc == NULL) - return -EEXIST; - - /* - * Unhash it from the service table - */ - write_lock_bh(&__ip_vs_svc_lock); - - ip_vs_svc_unhash(svc); - - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - __ip_vs_del_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - - return 0; -} - - -/* - * Flush all the virtual services - */ -static int ip_vs_flush(void) -{ - int idx; - struct ip_vs_service *svc, *nxt; - - /* - * Flush the service table hashed by - */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) { - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_unhash(svc); - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - __ip_vs_del_service(svc); - write_unlock_bh(&__ip_vs_svc_lock); - } - } - - /* - * Flush the service table hashed by fwmark - */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, - &ip_vs_svc_fwm_table[idx], f_list) { - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_unhash(svc); - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - __ip_vs_del_service(svc); - write_unlock_bh(&__ip_vs_svc_lock); - } - } - - return 0; -} - - -/* - * Zero counters in a service or all services - */ -static int ip_vs_zero_service(struct ip_vs_service *svc) -{ - struct ip_vs_dest *dest; - - write_lock_bh(&__ip_vs_svc_lock); - list_for_each_entry(dest, &svc->destinations, n_list) { - ip_vs_zero_stats(&dest->stats); - } - ip_vs_zero_stats(&svc->stats); - write_unlock_bh(&__ip_vs_svc_lock); - return 0; -} - -static int ip_vs_zero_all(void) -{ - int idx; - struct ip_vs_service *svc; - - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - ip_vs_zero_service(svc); - } - } - - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - ip_vs_zero_service(svc); - } - } - - ip_vs_zero_stats(&ip_vs_stats); - return 0; -} - - -static int -proc_do_defense_mode(ctl_table *table, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int *valp = table->data; - int val = *valp; - int rc; - - rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); - if (write && (*valp != val)) { - if ((*valp < 0) || (*valp > 3)) { - /* Restore the correct value */ - *valp = val; - } else { - update_defense_level(); - } - } - return rc; -} - - -static int -proc_do_sync_threshold(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int *valp = table->data; - int val[2]; - int rc; - - /* backup the value first */ - memcpy(val, valp, sizeof(val)); - - rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); - if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { - /* Restore the correct value */ - memcpy(valp, val, sizeof(val)); - } - return rc; -} - - -/* - * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) - */ - -static struct ctl_table vs_vars[] = { - { - .procname = "amemthresh", - .data = &sysctl_ip_vs_amemthresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#ifdef CONFIG_IP_VS_DEBUG - { - .procname = "debug_level", - .data = &sysctl_ip_vs_debug_level, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif - { - .procname = "am_droprate", - .data = &sysctl_ip_vs_am_droprate, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "drop_entry", - .data = &sysctl_ip_vs_drop_entry, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_do_defense_mode, - }, - { - .procname = "drop_packet", - .data = &sysctl_ip_vs_drop_packet, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_do_defense_mode, - }, - { - .procname = "secure_tcp", - .data = &sysctl_ip_vs_secure_tcp, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_do_defense_mode, - }, -#if 0 - { - .procname = "timeout_established", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_synsent", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_synrecv", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_finwait", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_timewait", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_close", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_closewait", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_lastack", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_listen", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_synack", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_udp", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_icmp", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, -#endif - { - .procname = "cache_bypass", - .data = &sysctl_ip_vs_cache_bypass, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "expire_nodest_conn", - .data = &sysctl_ip_vs_expire_nodest_conn, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "expire_quiescent_template", - .data = &sysctl_ip_vs_expire_quiescent_template, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "sync_threshold", - .data = &sysctl_ip_vs_sync_threshold, - .maxlen = sizeof(sysctl_ip_vs_sync_threshold), - .mode = 0644, - .proc_handler = &proc_do_sync_threshold, - }, - { - .procname = "nat_icmp_send", - .data = &sysctl_ip_vs_nat_icmp_send, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { .ctl_name = 0 } -}; - -const struct ctl_path net_vs_ctl_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "ipv4", .ctl_name = NET_IPV4, }, - { .procname = "vs", }, - { } -}; -EXPORT_SYMBOL_GPL(net_vs_ctl_path); - -static struct ctl_table_header * sysctl_header; - -#ifdef CONFIG_PROC_FS - -struct ip_vs_iter { - struct list_head *table; - int bucket; -}; - -/* - * Write the contents of the VS rule table to a PROCfs file. - * (It is kept just for backward compatibility) - */ -static inline const char *ip_vs_fwd_name(unsigned flags) -{ - switch (flags & IP_VS_CONN_F_FWD_MASK) { - case IP_VS_CONN_F_LOCALNODE: - return "Local"; - case IP_VS_CONN_F_TUNNEL: - return "Tunnel"; - case IP_VS_CONN_F_DROUTE: - return "Route"; - default: - return "Masq"; - } -} - - -/* Get the Nth entry in the two lists */ -static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) -{ - struct ip_vs_iter *iter = seq->private; - int idx; - struct ip_vs_service *svc; - - /* look in hash by protocol */ - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - if (pos-- == 0){ - iter->table = ip_vs_svc_table; - iter->bucket = idx; - return svc; - } - } - } - - /* keep looking in fwmark */ - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (pos-- == 0) { - iter->table = ip_vs_svc_fwm_table; - iter->bucket = idx; - return svc; - } - } - } - - return NULL; -} - -static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) -__acquires(__ip_vs_svc_lock) -{ - - read_lock_bh(&__ip_vs_svc_lock); - return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; -} - - -static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct list_head *e; - struct ip_vs_iter *iter; - struct ip_vs_service *svc; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ip_vs_info_array(seq,0); - - svc = v; - iter = seq->private; - - if (iter->table == ip_vs_svc_table) { - /* next service in table hashed by protocol */ - if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket]) - return list_entry(e, struct ip_vs_service, s_list); - - - while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket], - s_list) { - return svc; - } - } - - iter->table = ip_vs_svc_fwm_table; - iter->bucket = -1; - goto scan_fwmark; - } - - /* next service in hashed by fwmark */ - if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket]) - return list_entry(e, struct ip_vs_service, f_list); - - scan_fwmark: - while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket], - f_list) - return svc; - } - - return NULL; -} - -static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) -__releases(__ip_vs_svc_lock) -{ - read_unlock_bh(&__ip_vs_svc_lock); -} - - -static int ip_vs_info_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) { - seq_printf(seq, - "IP Virtual Server version %d.%d.%d (size=%d)\n", - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); - seq_puts(seq, - "Prot LocalAddress:Port Scheduler Flags\n"); - seq_puts(seq, - " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); - } else { - const struct ip_vs_service *svc = v; - const struct ip_vs_iter *iter = seq->private; - const struct ip_vs_dest *dest; - - if (iter->table == ip_vs_svc_table) { -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) - seq_printf(seq, "%s [" NIP6_FMT "]:%04X %s ", - ip_vs_proto_name(svc->protocol), - NIP6(svc->addr.in6), - ntohs(svc->port), - svc->scheduler->name); - else -#endif - seq_printf(seq, "%s %08X:%04X %s ", - ip_vs_proto_name(svc->protocol), - ntohl(svc->addr.ip), - ntohs(svc->port), - svc->scheduler->name); - } else { - seq_printf(seq, "FWM %08X %s ", - svc->fwmark, svc->scheduler->name); - } - - if (svc->flags & IP_VS_SVC_F_PERSISTENT) - seq_printf(seq, "persistent %d %08X\n", - svc->timeout, - ntohl(svc->netmask)); - else - seq_putc(seq, '\n'); - - list_for_each_entry(dest, &svc->destinations, n_list) { -#ifdef CONFIG_IP_VS_IPV6 - if (dest->af == AF_INET6) - seq_printf(seq, - " -> [" NIP6_FMT "]:%04X" - " %-7s %-6d %-10d %-10d\n", - NIP6(dest->addr.in6), - ntohs(dest->port), - ip_vs_fwd_name(atomic_read(&dest->conn_flags)), - atomic_read(&dest->weight), - atomic_read(&dest->activeconns), - atomic_read(&dest->inactconns)); - else -#endif - seq_printf(seq, - " -> %08X:%04X " - "%-7s %-6d %-10d %-10d\n", - ntohl(dest->addr.ip), - ntohs(dest->port), - ip_vs_fwd_name(atomic_read(&dest->conn_flags)), - atomic_read(&dest->weight), - atomic_read(&dest->activeconns), - atomic_read(&dest->inactconns)); - - } - } - return 0; -} - -static const struct seq_operations ip_vs_info_seq_ops = { - .start = ip_vs_info_seq_start, - .next = ip_vs_info_seq_next, - .stop = ip_vs_info_seq_stop, - .show = ip_vs_info_seq_show, -}; - -static int ip_vs_info_open(struct inode *inode, struct file *file) -{ - return seq_open_private(file, &ip_vs_info_seq_ops, - sizeof(struct ip_vs_iter)); -} - -static const struct file_operations ip_vs_info_fops = { - .owner = THIS_MODULE, - .open = ip_vs_info_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - -#endif - -struct ip_vs_stats ip_vs_stats = { - .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock), -}; - -#ifdef CONFIG_PROC_FS -static int ip_vs_stats_show(struct seq_file *seq, void *v) -{ - -/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ - seq_puts(seq, - " Total Incoming Outgoing Incoming Outgoing\n"); - seq_printf(seq, - " Conns Packets Packets Bytes Bytes\n"); - - spin_lock_bh(&ip_vs_stats.lock); - seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns, - ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts, - (unsigned long long) ip_vs_stats.ustats.inbytes, - (unsigned long long) ip_vs_stats.ustats.outbytes); - -/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ - seq_puts(seq, - " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); - seq_printf(seq,"%8X %8X %8X %16X %16X\n", - ip_vs_stats.ustats.cps, - ip_vs_stats.ustats.inpps, - ip_vs_stats.ustats.outpps, - ip_vs_stats.ustats.inbps, - ip_vs_stats.ustats.outbps); - spin_unlock_bh(&ip_vs_stats.lock); - - return 0; -} - -static int ip_vs_stats_seq_open(struct inode *inode, struct file *file) -{ - return single_open(file, ip_vs_stats_show, NULL); -} - -static const struct file_operations ip_vs_stats_fops = { - .owner = THIS_MODULE, - .open = ip_vs_stats_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -#endif - -/* - * Set timeout values for tcp tcpfin udp in the timeout_table. - */ -static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) -{ - IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", - u->tcp_timeout, - u->tcp_fin_timeout, - u->udp_timeout); - -#ifdef CONFIG_IP_VS_PROTO_TCP - if (u->tcp_timeout) { - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] - = u->tcp_timeout * HZ; - } - - if (u->tcp_fin_timeout) { - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] - = u->tcp_fin_timeout * HZ; - } -#endif - -#ifdef CONFIG_IP_VS_PROTO_UDP - if (u->udp_timeout) { - ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] - = u->udp_timeout * HZ; - } -#endif - return 0; -} - - -#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) -#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user)) -#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \ - sizeof(struct ip_vs_dest_user)) -#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) -#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user)) -#define MAX_ARG_LEN SVCDEST_ARG_LEN - -static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { - [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0, - [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN, -}; - -static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, - struct ip_vs_service_user *usvc_compat) -{ - usvc->af = AF_INET; - usvc->protocol = usvc_compat->protocol; - usvc->addr.ip = usvc_compat->addr; - usvc->port = usvc_compat->port; - usvc->fwmark = usvc_compat->fwmark; - - /* Deep copy of sched_name is not needed here */ - usvc->sched_name = usvc_compat->sched_name; - - usvc->flags = usvc_compat->flags; - usvc->timeout = usvc_compat->timeout; - usvc->netmask = usvc_compat->netmask; -} - -static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, - struct ip_vs_dest_user *udest_compat) -{ - udest->addr.ip = udest_compat->addr; - udest->port = udest_compat->port; - udest->conn_flags = udest_compat->conn_flags; - udest->weight = udest_compat->weight; - udest->u_threshold = udest_compat->u_threshold; - udest->l_threshold = udest_compat->l_threshold; -} - -static int -do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) -{ - int ret; - unsigned char arg[MAX_ARG_LEN]; - struct ip_vs_service_user *usvc_compat; - struct ip_vs_service_user_kern usvc; - struct ip_vs_service *svc; - struct ip_vs_dest_user *udest_compat; - struct ip_vs_dest_user_kern udest; - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if (len != set_arglen[SET_CMDID(cmd)]) { - IP_VS_ERR("set_ctl: len %u != %u\n", - len, set_arglen[SET_CMDID(cmd)]); - return -EINVAL; - } - - if (copy_from_user(arg, user, len) != 0) - return -EFAULT; - - /* increase the module use count */ - ip_vs_use_count_inc(); - - if (mutex_lock_interruptible(&__ip_vs_mutex)) { - ret = -ERESTARTSYS; - goto out_dec; - } - - if (cmd == IP_VS_SO_SET_FLUSH) { - /* Flush the virtual service */ - ret = ip_vs_flush(); - goto out_unlock; - } else if (cmd == IP_VS_SO_SET_TIMEOUT) { - /* Set timeout values for (tcp tcpfin udp) */ - ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg); - goto out_unlock; - } else if (cmd == IP_VS_SO_SET_STARTDAEMON) { - struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid); - goto out_unlock; - } else if (cmd == IP_VS_SO_SET_STOPDAEMON) { - struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - ret = stop_sync_thread(dm->state); - goto out_unlock; - } - - usvc_compat = (struct ip_vs_service_user *)arg; - udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1); - - /* We only use the new structs internally, so copy userspace compat - * structs to extended internal versions */ - ip_vs_copy_usvc_compat(&usvc, usvc_compat); - ip_vs_copy_udest_compat(&udest, udest_compat); - - if (cmd == IP_VS_SO_SET_ZERO) { - /* if no service address is set, zero counters in all */ - if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { - ret = ip_vs_zero_all(); - goto out_unlock; - } - } - - /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */ - if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP) { - IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n", - usvc.protocol, NIPQUAD(usvc.addr.ip), - ntohs(usvc.port), usvc.sched_name); - ret = -EFAULT; - goto out_unlock; - } - - /* Lookup the exact service by or fwmark */ - if (usvc.fwmark == 0) - svc = __ip_vs_service_get(usvc.af, usvc.protocol, - &usvc.addr, usvc.port); - else - svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); - - if (cmd != IP_VS_SO_SET_ADD - && (svc == NULL || svc->protocol != usvc.protocol)) { - ret = -ESRCH; - goto out_unlock; - } - - switch (cmd) { - case IP_VS_SO_SET_ADD: - if (svc != NULL) - ret = -EEXIST; - else - ret = ip_vs_add_service(&usvc, &svc); - break; - case IP_VS_SO_SET_EDIT: - ret = ip_vs_edit_service(svc, &usvc); - break; - case IP_VS_SO_SET_DEL: - ret = ip_vs_del_service(svc); - if (!ret) - goto out_unlock; - break; - case IP_VS_SO_SET_ZERO: - ret = ip_vs_zero_service(svc); - break; - case IP_VS_SO_SET_ADDDEST: - ret = ip_vs_add_dest(svc, &udest); - break; - case IP_VS_SO_SET_EDITDEST: - ret = ip_vs_edit_dest(svc, &udest); - break; - case IP_VS_SO_SET_DELDEST: - ret = ip_vs_del_dest(svc, &udest); - break; - default: - ret = -EINVAL; - } - - if (svc) - ip_vs_service_put(svc); - - out_unlock: - mutex_unlock(&__ip_vs_mutex); - out_dec: - /* decrease the module use count */ - ip_vs_use_count_dec(); - - return ret; -} - - -static void -ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) -{ - spin_lock_bh(&src->lock); - memcpy(dst, &src->ustats, sizeof(*dst)); - spin_unlock_bh(&src->lock); -} - -static void -ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) -{ - dst->protocol = src->protocol; - dst->addr = src->addr.ip; - dst->port = src->port; - dst->fwmark = src->fwmark; - strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); - dst->flags = src->flags; - dst->timeout = src->timeout / HZ; - dst->netmask = src->netmask; - dst->num_dests = src->num_dests; - ip_vs_copy_stats(&dst->stats, &src->stats); -} - -static inline int -__ip_vs_get_service_entries(const struct ip_vs_get_services *get, - struct ip_vs_get_services __user *uptr) -{ - int idx, count=0; - struct ip_vs_service *svc; - struct ip_vs_service_entry entry; - int ret = 0; - - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET) - continue; - - if (count >= get->num_services) - goto out; - memset(&entry, 0, sizeof(entry)); - ip_vs_copy_service(&entry, svc); - if (copy_to_user(&uptr->entrytable[count], - &entry, sizeof(entry))) { - ret = -EFAULT; - goto out; - } - count++; - } - } - - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET) - continue; - - if (count >= get->num_services) - goto out; - memset(&entry, 0, sizeof(entry)); - ip_vs_copy_service(&entry, svc); - if (copy_to_user(&uptr->entrytable[count], - &entry, sizeof(entry))) { - ret = -EFAULT; - goto out; - } - count++; - } - } - out: - return ret; -} - -static inline int -__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, - struct ip_vs_get_dests __user *uptr) -{ - struct ip_vs_service *svc; - union nf_inet_addr addr = { .ip = get->addr }; - int ret = 0; - - if (get->fwmark) - svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark); - else - svc = __ip_vs_service_get(AF_INET, get->protocol, &addr, - get->port); - - if (svc) { - int count = 0; - struct ip_vs_dest *dest; - struct ip_vs_dest_entry entry; - - list_for_each_entry(dest, &svc->destinations, n_list) { - if (count >= get->num_dests) - break; - - entry.addr = dest->addr.ip; - entry.port = dest->port; - entry.conn_flags = atomic_read(&dest->conn_flags); - entry.weight = atomic_read(&dest->weight); - entry.u_threshold = dest->u_threshold; - entry.l_threshold = dest->l_threshold; - entry.activeconns = atomic_read(&dest->activeconns); - entry.inactconns = atomic_read(&dest->inactconns); - entry.persistconns = atomic_read(&dest->persistconns); - ip_vs_copy_stats(&entry.stats, &dest->stats); - if (copy_to_user(&uptr->entrytable[count], - &entry, sizeof(entry))) { - ret = -EFAULT; - break; - } - count++; - } - ip_vs_service_put(svc); - } else - ret = -ESRCH; - return ret; -} - -static inline void -__ip_vs_get_timeouts(struct ip_vs_timeout_user *u) -{ -#ifdef CONFIG_IP_VS_PROTO_TCP - u->tcp_timeout = - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; - u->tcp_fin_timeout = - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; -#endif -#ifdef CONFIG_IP_VS_PROTO_UDP - u->udp_timeout = - ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ; -#endif -} - - -#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) -#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo)) -#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services)) -#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry)) -#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests)) -#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) -#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2) - -static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { - [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64, - [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN, -}; - -static int -do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) -{ - unsigned char arg[128]; - int ret = 0; - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if (*len < get_arglen[GET_CMDID(cmd)]) { - IP_VS_ERR("get_ctl: len %u < %u\n", - *len, get_arglen[GET_CMDID(cmd)]); - return -EINVAL; - } - - if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0) - return -EFAULT; - - if (mutex_lock_interruptible(&__ip_vs_mutex)) - return -ERESTARTSYS; - - switch (cmd) { - case IP_VS_SO_GET_VERSION: - { - char buf[64]; - - sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); - if (copy_to_user(user, buf, strlen(buf)+1) != 0) { - ret = -EFAULT; - goto out; - } - *len = strlen(buf)+1; - } - break; - - case IP_VS_SO_GET_INFO: - { - struct ip_vs_getinfo info; - info.version = IP_VS_VERSION_CODE; - info.size = IP_VS_CONN_TAB_SIZE; - info.num_services = ip_vs_num_services; - if (copy_to_user(user, &info, sizeof(info)) != 0) - ret = -EFAULT; - } - break; - - case IP_VS_SO_GET_SERVICES: - { - struct ip_vs_get_services *get; - int size; - - get = (struct ip_vs_get_services *)arg; - size = sizeof(*get) + - sizeof(struct ip_vs_service_entry) * get->num_services; - if (*len != size) { - IP_VS_ERR("length: %u != %u\n", *len, size); - ret = -EINVAL; - goto out; - } - ret = __ip_vs_get_service_entries(get, user); - } - break; - - case IP_VS_SO_GET_SERVICE: - { - struct ip_vs_service_entry *entry; - struct ip_vs_service *svc; - union nf_inet_addr addr; - - entry = (struct ip_vs_service_entry *)arg; - addr.ip = entry->addr; - if (entry->fwmark) - svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark); - else - svc = __ip_vs_service_get(AF_INET, entry->protocol, - &addr, entry->port); - if (svc) { - ip_vs_copy_service(entry, svc); - if (copy_to_user(user, entry, sizeof(*entry)) != 0) - ret = -EFAULT; - ip_vs_service_put(svc); - } else - ret = -ESRCH; - } - break; - - case IP_VS_SO_GET_DESTS: - { - struct ip_vs_get_dests *get; - int size; - - get = (struct ip_vs_get_dests *)arg; - size = sizeof(*get) + - sizeof(struct ip_vs_dest_entry) * get->num_dests; - if (*len != size) { - IP_VS_ERR("length: %u != %u\n", *len, size); - ret = -EINVAL; - goto out; - } - ret = __ip_vs_get_dest_entries(get, user); - } - break; - - case IP_VS_SO_GET_TIMEOUT: - { - struct ip_vs_timeout_user t; - - __ip_vs_get_timeouts(&t); - if (copy_to_user(user, &t, sizeof(t)) != 0) - ret = -EFAULT; - } - break; - - case IP_VS_SO_GET_DAEMON: - { - struct ip_vs_daemon_user d[2]; - - memset(&d, 0, sizeof(d)); - if (ip_vs_sync_state & IP_VS_STATE_MASTER) { - d[0].state = IP_VS_STATE_MASTER; - strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn)); - d[0].syncid = ip_vs_master_syncid; - } - if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { - d[1].state = IP_VS_STATE_BACKUP; - strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn)); - d[1].syncid = ip_vs_backup_syncid; - } - if (copy_to_user(user, &d, sizeof(d)) != 0) - ret = -EFAULT; - } - break; - - default: - ret = -EINVAL; - } - - out: - mutex_unlock(&__ip_vs_mutex); - return ret; -} - - -static struct nf_sockopt_ops ip_vs_sockopts = { - .pf = PF_INET, - .set_optmin = IP_VS_BASE_CTL, - .set_optmax = IP_VS_SO_SET_MAX+1, - .set = do_ip_vs_set_ctl, - .get_optmin = IP_VS_BASE_CTL, - .get_optmax = IP_VS_SO_GET_MAX+1, - .get = do_ip_vs_get_ctl, - .owner = THIS_MODULE, -}; - -/* - * Generic Netlink interface - */ - -/* IPVS genetlink family */ -static struct genl_family ip_vs_genl_family = { - .id = GENL_ID_GENERATE, - .hdrsize = 0, - .name = IPVS_GENL_NAME, - .version = IPVS_GENL_VERSION, - .maxattr = IPVS_CMD_MAX, -}; - -/* Policy used for first-level command attributes */ -static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { - [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED }, - [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED }, - [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED }, - [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 }, - [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 }, - [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 }, -}; - -/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */ -static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { - [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 }, - [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, - .len = IP_VS_IFNAME_MAXLEN }, - [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, -}; - -/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ -static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { - [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 }, - [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 }, - [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY, - .len = sizeof(union nf_inet_addr) }, - [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, - [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, - [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, - .len = IP_VS_SCHEDNAME_MAXLEN }, - [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, - .len = sizeof(struct ip_vs_flags) }, - [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, - [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 }, - [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED }, -}; - -/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */ -static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { - [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY, - .len = sizeof(union nf_inet_addr) }, - [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 }, - [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 }, - [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 }, - [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 }, - [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 }, - [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 }, - [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, - [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, - [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, -}; - -static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, - struct ip_vs_stats *stats) -{ - struct nlattr *nl_stats = nla_nest_start(skb, container_type); - if (!nl_stats) - return -EMSGSIZE; - - spin_lock_bh(&stats->lock); - - NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps); - - spin_unlock_bh(&stats->lock); - - nla_nest_end(skb, nl_stats); - - return 0; - -nla_put_failure: - spin_unlock_bh(&stats->lock); - nla_nest_cancel(skb, nl_stats); - return -EMSGSIZE; -} - -static int ip_vs_genl_fill_service(struct sk_buff *skb, - struct ip_vs_service *svc) -{ - struct nlattr *nl_service; - struct ip_vs_flags flags = { .flags = svc->flags, - .mask = ~0 }; - - nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE); - if (!nl_service) - return -EMSGSIZE; - - NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af); - - if (svc->fwmark) { - NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark); - } else { - NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol); - NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr); - NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port); - } - - NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name); - NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags); - NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ); - NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask); - - if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats)) - goto nla_put_failure; - - nla_nest_end(skb, nl_service); - - return 0; - -nla_put_failure: - nla_nest_cancel(skb, nl_service); - return -EMSGSIZE; -} - -static int ip_vs_genl_dump_service(struct sk_buff *skb, - struct ip_vs_service *svc, - struct netlink_callback *cb) -{ - void *hdr; - - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, - &ip_vs_genl_family, NLM_F_MULTI, - IPVS_CMD_NEW_SERVICE); - if (!hdr) - return -EMSGSIZE; - - if (ip_vs_genl_fill_service(skb, svc) < 0) - goto nla_put_failure; - - return genlmsg_end(skb, hdr); - -nla_put_failure: - genlmsg_cancel(skb, hdr); - return -EMSGSIZE; -} - -static int ip_vs_genl_dump_services(struct sk_buff *skb, - struct netlink_callback *cb) -{ - int idx = 0, i; - int start = cb->args[0]; - struct ip_vs_service *svc; - - mutex_lock(&__ip_vs_mutex); - for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { - if (++idx <= start) - continue; - if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { - idx--; - goto nla_put_failure; - } - } - } - - for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { - if (++idx <= start) - continue; - if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { - idx--; - goto nla_put_failure; - } - } - } - -nla_put_failure: - mutex_unlock(&__ip_vs_mutex); - cb->args[0] = idx; - - return skb->len; -} - -static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, - struct nlattr *nla, int full_entry) -{ - struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; - struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; - - /* Parse mandatory identifying service fields first */ - if (nla == NULL || - nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy)) - return -EINVAL; - - nla_af = attrs[IPVS_SVC_ATTR_AF]; - nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL]; - nla_addr = attrs[IPVS_SVC_ATTR_ADDR]; - nla_port = attrs[IPVS_SVC_ATTR_PORT]; - nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK]; - - if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) - return -EINVAL; - - usvc->af = nla_get_u16(nla_af); -#ifdef CONFIG_IP_VS_IPV6 - if (usvc->af != AF_INET && usvc->af != AF_INET6) -#else - if (usvc->af != AF_INET) -#endif - return -EAFNOSUPPORT; - - if (nla_fwmark) { - usvc->protocol = IPPROTO_TCP; - usvc->fwmark = nla_get_u32(nla_fwmark); - } else { - usvc->protocol = nla_get_u16(nla_protocol); - nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); - usvc->port = nla_get_u16(nla_port); - usvc->fwmark = 0; - } - - /* If a full entry was requested, check for the additional fields */ - if (full_entry) { - struct nlattr *nla_sched, *nla_flags, *nla_timeout, - *nla_netmask; - struct ip_vs_flags flags; - struct ip_vs_service *svc; - - nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; - nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; - nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; - nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; - - if (!(nla_sched && nla_flags && nla_timeout && nla_netmask)) - return -EINVAL; - - nla_memcpy(&flags, nla_flags, sizeof(flags)); - - /* prefill flags from service if it already exists */ - if (usvc->fwmark) - svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark); - else - svc = __ip_vs_service_get(usvc->af, usvc->protocol, - &usvc->addr, usvc->port); - if (svc) { - usvc->flags = svc->flags; - ip_vs_service_put(svc); - } else - usvc->flags = 0; - - /* set new flags from userland */ - usvc->flags = (usvc->flags & ~flags.mask) | - (flags.flags & flags.mask); - usvc->sched_name = nla_data(nla_sched); - usvc->timeout = nla_get_u32(nla_timeout); - usvc->netmask = nla_get_u32(nla_netmask); - } - - return 0; -} - -static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) -{ - struct ip_vs_service_user_kern usvc; - int ret; - - ret = ip_vs_genl_parse_service(&usvc, nla, 0); - if (ret) - return ERR_PTR(ret); - - if (usvc.fwmark) - return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); - else - return __ip_vs_service_get(usvc.af, usvc.protocol, - &usvc.addr, usvc.port); -} - -static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) -{ - struct nlattr *nl_dest; - - nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST); - if (!nl_dest) - return -EMSGSIZE; - - NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr); - NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port); - - NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD, - atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, - atomic_read(&dest->activeconns)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS, - atomic_read(&dest->inactconns)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, - atomic_read(&dest->persistconns)); - - if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) - goto nla_put_failure; - - nla_nest_end(skb, nl_dest); - - return 0; - -nla_put_failure: - nla_nest_cancel(skb, nl_dest); - return -EMSGSIZE; -} - -static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, - struct netlink_callback *cb) -{ - void *hdr; - - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, - &ip_vs_genl_family, NLM_F_MULTI, - IPVS_CMD_NEW_DEST); - if (!hdr) - return -EMSGSIZE; - - if (ip_vs_genl_fill_dest(skb, dest) < 0) - goto nla_put_failure; - - return genlmsg_end(skb, hdr); - -nla_put_failure: - genlmsg_cancel(skb, hdr); - return -EMSGSIZE; -} - -static int ip_vs_genl_dump_dests(struct sk_buff *skb, - struct netlink_callback *cb) -{ - int idx = 0; - int start = cb->args[0]; - struct ip_vs_service *svc; - struct ip_vs_dest *dest; - struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; - - mutex_lock(&__ip_vs_mutex); - - /* Try to find the service for which to dump destinations */ - if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, - IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy)) - goto out_err; - - svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]); - if (IS_ERR(svc) || svc == NULL) - goto out_err; - - /* Dump the destinations */ - list_for_each_entry(dest, &svc->destinations, n_list) { - if (++idx <= start) - continue; - if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) { - idx--; - goto nla_put_failure; - } - } - -nla_put_failure: - cb->args[0] = idx; - ip_vs_service_put(svc); - -out_err: - mutex_unlock(&__ip_vs_mutex); - - return skb->len; -} - -static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, - struct nlattr *nla, int full_entry) -{ - struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; - struct nlattr *nla_addr, *nla_port; - - /* Parse mandatory identifying destination fields first */ - if (nla == NULL || - nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy)) - return -EINVAL; - - nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; - nla_port = attrs[IPVS_DEST_ATTR_PORT]; - - if (!(nla_addr && nla_port)) - return -EINVAL; - - nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); - udest->port = nla_get_u16(nla_port); - - /* If a full entry was requested, check for the additional fields */ - if (full_entry) { - struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, - *nla_l_thresh; - - nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; - nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; - nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; - nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; - - if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) - return -EINVAL; - - udest->conn_flags = nla_get_u32(nla_fwd) - & IP_VS_CONN_F_FWD_MASK; - udest->weight = nla_get_u32(nla_weight); - udest->u_threshold = nla_get_u32(nla_u_thresh); - udest->l_threshold = nla_get_u32(nla_l_thresh); - } - - return 0; -} - -static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state, - const char *mcast_ifn, __be32 syncid) -{ - struct nlattr *nl_daemon; - - nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON); - if (!nl_daemon) - return -EMSGSIZE; - - NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state); - NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn); - NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid); - - nla_nest_end(skb, nl_daemon); - - return 0; - -nla_put_failure: - nla_nest_cancel(skb, nl_daemon); - return -EMSGSIZE; -} - -static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state, - const char *mcast_ifn, __be32 syncid, - struct netlink_callback *cb) -{ - void *hdr; - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, - &ip_vs_genl_family, NLM_F_MULTI, - IPVS_CMD_NEW_DAEMON); - if (!hdr) - return -EMSGSIZE; - - if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid)) - goto nla_put_failure; - - return genlmsg_end(skb, hdr); - -nla_put_failure: - genlmsg_cancel(skb, hdr); - return -EMSGSIZE; -} - -static int ip_vs_genl_dump_daemons(struct sk_buff *skb, - struct netlink_callback *cb) -{ - mutex_lock(&__ip_vs_mutex); - if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { - if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, - ip_vs_master_mcast_ifn, - ip_vs_master_syncid, cb) < 0) - goto nla_put_failure; - - cb->args[0] = 1; - } - - if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { - if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, - ip_vs_backup_mcast_ifn, - ip_vs_backup_syncid, cb) < 0) - goto nla_put_failure; - - cb->args[1] = 1; - } - -nla_put_failure: - mutex_unlock(&__ip_vs_mutex); - - return skb->len; -} - -static int ip_vs_genl_new_daemon(struct nlattr **attrs) -{ - if (!(attrs[IPVS_DAEMON_ATTR_STATE] && - attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && - attrs[IPVS_DAEMON_ATTR_SYNC_ID])) - return -EINVAL; - - return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), - nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), - nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); -} - -static int ip_vs_genl_del_daemon(struct nlattr **attrs) -{ - if (!attrs[IPVS_DAEMON_ATTR_STATE]) - return -EINVAL; - - return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); -} - -static int ip_vs_genl_set_config(struct nlattr **attrs) -{ - struct ip_vs_timeout_user t; - - __ip_vs_get_timeouts(&t); - - if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) - t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); - - if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]) - t.tcp_fin_timeout = - nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]); - - if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) - t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); - - return ip_vs_set_timeout(&t); -} - -static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) -{ - struct ip_vs_service *svc = NULL; - struct ip_vs_service_user_kern usvc; - struct ip_vs_dest_user_kern udest; - int ret = 0, cmd; - int need_full_svc = 0, need_full_dest = 0; - - cmd = info->genlhdr->cmd; - - mutex_lock(&__ip_vs_mutex); - - if (cmd == IPVS_CMD_FLUSH) { - ret = ip_vs_flush(); - goto out; - } else if (cmd == IPVS_CMD_SET_CONFIG) { - ret = ip_vs_genl_set_config(info->attrs); - goto out; - } else if (cmd == IPVS_CMD_NEW_DAEMON || - cmd == IPVS_CMD_DEL_DAEMON) { - - struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; - - if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || - nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX, - info->attrs[IPVS_CMD_ATTR_DAEMON], - ip_vs_daemon_policy)) { - ret = -EINVAL; - goto out; - } - - if (cmd == IPVS_CMD_NEW_DAEMON) - ret = ip_vs_genl_new_daemon(daemon_attrs); - else - ret = ip_vs_genl_del_daemon(daemon_attrs); - goto out; - } else if (cmd == IPVS_CMD_ZERO && - !info->attrs[IPVS_CMD_ATTR_SERVICE]) { - ret = ip_vs_zero_all(); - goto out; - } - - /* All following commands require a service argument, so check if we - * received a valid one. We need a full service specification when - * adding / editing a service. Only identifying members otherwise. */ - if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) - need_full_svc = 1; - - ret = ip_vs_genl_parse_service(&usvc, - info->attrs[IPVS_CMD_ATTR_SERVICE], - need_full_svc); - if (ret) - goto out; - - /* Lookup the exact service by or fwmark */ - if (usvc.fwmark == 0) - svc = __ip_vs_service_get(usvc.af, usvc.protocol, - &usvc.addr, usvc.port); - else - svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); - - /* Unless we're adding a new service, the service must already exist */ - if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { - ret = -ESRCH; - goto out; - } - - /* Destination commands require a valid destination argument. For - * adding / editing a destination, we need a full destination - * specification. */ - if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST || - cmd == IPVS_CMD_DEL_DEST) { - if (cmd != IPVS_CMD_DEL_DEST) - need_full_dest = 1; - - ret = ip_vs_genl_parse_dest(&udest, - info->attrs[IPVS_CMD_ATTR_DEST], - need_full_dest); - if (ret) - goto out; - } - - switch (cmd) { - case IPVS_CMD_NEW_SERVICE: - if (svc == NULL) - ret = ip_vs_add_service(&usvc, &svc); - else - ret = -EEXIST; - break; - case IPVS_CMD_SET_SERVICE: - ret = ip_vs_edit_service(svc, &usvc); - break; - case IPVS_CMD_DEL_SERVICE: - ret = ip_vs_del_service(svc); - break; - case IPVS_CMD_NEW_DEST: - ret = ip_vs_add_dest(svc, &udest); - break; - case IPVS_CMD_SET_DEST: - ret = ip_vs_edit_dest(svc, &udest); - break; - case IPVS_CMD_DEL_DEST: - ret = ip_vs_del_dest(svc, &udest); - break; - case IPVS_CMD_ZERO: - ret = ip_vs_zero_service(svc); - break; - default: - ret = -EINVAL; - } - -out: - if (svc) - ip_vs_service_put(svc); - mutex_unlock(&__ip_vs_mutex); - - return ret; -} - -static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) -{ - struct sk_buff *msg; - void *reply; - int ret, cmd, reply_cmd; - - cmd = info->genlhdr->cmd; - - if (cmd == IPVS_CMD_GET_SERVICE) - reply_cmd = IPVS_CMD_NEW_SERVICE; - else if (cmd == IPVS_CMD_GET_INFO) - reply_cmd = IPVS_CMD_SET_INFO; - else if (cmd == IPVS_CMD_GET_CONFIG) - reply_cmd = IPVS_CMD_SET_CONFIG; - else { - IP_VS_ERR("unknown Generic Netlink command\n"); - return -EINVAL; - } - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - mutex_lock(&__ip_vs_mutex); - - reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); - if (reply == NULL) - goto nla_put_failure; - - switch (cmd) { - case IPVS_CMD_GET_SERVICE: - { - struct ip_vs_service *svc; - - svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]); - if (IS_ERR(svc)) { - ret = PTR_ERR(svc); - goto out_err; - } else if (svc) { - ret = ip_vs_genl_fill_service(msg, svc); - ip_vs_service_put(svc); - if (ret) - goto nla_put_failure; - } else { - ret = -ESRCH; - goto out_err; - } - - break; - } - - case IPVS_CMD_GET_CONFIG: - { - struct ip_vs_timeout_user t; - - __ip_vs_get_timeouts(&t); -#ifdef CONFIG_IP_VS_PROTO_TCP - NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout); - NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, - t.tcp_fin_timeout); -#endif -#ifdef CONFIG_IP_VS_PROTO_UDP - NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout); -#endif - - break; - } - - case IPVS_CMD_GET_INFO: - NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE); - NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, - IP_VS_CONN_TAB_SIZE); - break; - } - - genlmsg_end(msg, reply); - ret = genlmsg_unicast(msg, info->snd_pid); - goto out; - -nla_put_failure: - IP_VS_ERR("not enough space in Netlink message\n"); - ret = -EMSGSIZE; - -out_err: - nlmsg_free(msg); -out: - mutex_unlock(&__ip_vs_mutex); - - return ret; -} - - -static struct genl_ops ip_vs_genl_ops[] __read_mostly = { - { - .cmd = IPVS_CMD_NEW_SERVICE, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_SET_SERVICE, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_DEL_SERVICE, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_GET_SERVICE, - .flags = GENL_ADMIN_PERM, - .doit = ip_vs_genl_get_cmd, - .dumpit = ip_vs_genl_dump_services, - .policy = ip_vs_cmd_policy, - }, - { - .cmd = IPVS_CMD_NEW_DEST, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_SET_DEST, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_DEL_DEST, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_GET_DEST, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .dumpit = ip_vs_genl_dump_dests, - }, - { - .cmd = IPVS_CMD_NEW_DAEMON, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_DEL_DAEMON, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_GET_DAEMON, - .flags = GENL_ADMIN_PERM, - .dumpit = ip_vs_genl_dump_daemons, - }, - { - .cmd = IPVS_CMD_SET_CONFIG, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_GET_CONFIG, - .flags = GENL_ADMIN_PERM, - .doit = ip_vs_genl_get_cmd, - }, - { - .cmd = IPVS_CMD_GET_INFO, - .flags = GENL_ADMIN_PERM, - .doit = ip_vs_genl_get_cmd, - }, - { - .cmd = IPVS_CMD_ZERO, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_FLUSH, - .flags = GENL_ADMIN_PERM, - .doit = ip_vs_genl_set_cmd, - }, -}; - -static int __init ip_vs_genl_register(void) -{ - int ret, i; - - ret = genl_register_family(&ip_vs_genl_family); - if (ret) - return ret; - - for (i = 0; i < ARRAY_SIZE(ip_vs_genl_ops); i++) { - ret = genl_register_ops(&ip_vs_genl_family, &ip_vs_genl_ops[i]); - if (ret) - goto err_out; - } - return 0; - -err_out: - genl_unregister_family(&ip_vs_genl_family); - return ret; -} - -static void ip_vs_genl_unregister(void) -{ - genl_unregister_family(&ip_vs_genl_family); -} - -/* End of Generic Netlink interface definitions */ - - -int __init ip_vs_control_init(void) -{ - int ret; - int idx; - - EnterFunction(2); - - ret = nf_register_sockopt(&ip_vs_sockopts); - if (ret) { - IP_VS_ERR("cannot register sockopt.\n"); - return ret; - } - - ret = ip_vs_genl_register(); - if (ret) { - IP_VS_ERR("cannot register Generic Netlink interface.\n"); - nf_unregister_sockopt(&ip_vs_sockopts); - return ret; - } - - proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); - proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops); - - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars); - - /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_svc_table[idx]); - INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); - } - for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_rtable[idx]); - } - - ip_vs_new_estimator(&ip_vs_stats); - - /* Hook the defense timer */ - schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); - - LeaveFunction(2); - return 0; -} - - -void ip_vs_control_cleanup(void) -{ - EnterFunction(2); - ip_vs_trash_cleanup(); - cancel_rearming_delayed_work(&defense_work); - cancel_work_sync(&defense_work.work); - ip_vs_kill_estimator(&ip_vs_stats); - unregister_sysctl_table(sysctl_header); - proc_net_remove(&init_net, "ip_vs_stats"); - proc_net_remove(&init_net, "ip_vs"); - ip_vs_genl_unregister(); - nf_unregister_sockopt(&ip_vs_sockopts); - LeaveFunction(2); -} diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c deleted file mode 100644 index 2eb2860..0000000 --- a/net/ipv4/ipvs/ip_vs_est.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * ip_vs_est.c: simple rate estimator for IPVS - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ -#include -#include -#include -#include -#include -#include -#include - -#include - -/* - This code is to estimate rate in a shorter interval (such as 8 - seconds) for virtual services and real servers. For measure rate in a - long interval, it is easy to implement a user level daemon which - periodically reads those statistical counters and measure rate. - - Currently, the measurement is activated by slow timer handler. Hope - this measurement will not introduce too much load. - - We measure rate during the last 8 seconds every 2 seconds: - - avgrate = avgrate*(1-W) + rate*W - - where W = 2^(-2) - - NOTES. - - * The stored value for average bps is scaled by 2^5, so that maximal - rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10. - - * A lot code is taken from net/sched/estimator.c - */ - - -static void estimation_timer(unsigned long arg); - -static LIST_HEAD(est_list); -static DEFINE_SPINLOCK(est_lock); -static DEFINE_TIMER(est_timer, estimation_timer, 0, 0); - -static void estimation_timer(unsigned long arg) -{ - struct ip_vs_estimator *e; - struct ip_vs_stats *s; - u32 n_conns; - u32 n_inpkts, n_outpkts; - u64 n_inbytes, n_outbytes; - u32 rate; - - spin_lock(&est_lock); - list_for_each_entry(e, &est_list, list) { - s = container_of(e, struct ip_vs_stats, est); - - spin_lock(&s->lock); - n_conns = s->ustats.conns; - n_inpkts = s->ustats.inpkts; - n_outpkts = s->ustats.outpkts; - n_inbytes = s->ustats.inbytes; - n_outbytes = s->ustats.outbytes; - - /* scaled by 2^10, but divided 2 seconds */ - rate = (n_conns - e->last_conns)<<9; - e->last_conns = n_conns; - e->cps += ((long)rate - (long)e->cps)>>2; - s->ustats.cps = (e->cps+0x1FF)>>10; - - rate = (n_inpkts - e->last_inpkts)<<9; - e->last_inpkts = n_inpkts; - e->inpps += ((long)rate - (long)e->inpps)>>2; - s->ustats.inpps = (e->inpps+0x1FF)>>10; - - rate = (n_outpkts - e->last_outpkts)<<9; - e->last_outpkts = n_outpkts; - e->outpps += ((long)rate - (long)e->outpps)>>2; - s->ustats.outpps = (e->outpps+0x1FF)>>10; - - rate = (n_inbytes - e->last_inbytes)<<4; - e->last_inbytes = n_inbytes; - e->inbps += ((long)rate - (long)e->inbps)>>2; - s->ustats.inbps = (e->inbps+0xF)>>5; - - rate = (n_outbytes - e->last_outbytes)<<4; - e->last_outbytes = n_outbytes; - e->outbps += ((long)rate - (long)e->outbps)>>2; - s->ustats.outbps = (e->outbps+0xF)>>5; - spin_unlock(&s->lock); - } - spin_unlock(&est_lock); - mod_timer(&est_timer, jiffies + 2*HZ); -} - -void ip_vs_new_estimator(struct ip_vs_stats *stats) -{ - struct ip_vs_estimator *est = &stats->est; - - INIT_LIST_HEAD(&est->list); - - est->last_conns = stats->ustats.conns; - est->cps = stats->ustats.cps<<10; - - est->last_inpkts = stats->ustats.inpkts; - est->inpps = stats->ustats.inpps<<10; - - est->last_outpkts = stats->ustats.outpkts; - est->outpps = stats->ustats.outpps<<10; - - est->last_inbytes = stats->ustats.inbytes; - est->inbps = stats->ustats.inbps<<5; - - est->last_outbytes = stats->ustats.outbytes; - est->outbps = stats->ustats.outbps<<5; - - spin_lock_bh(&est_lock); - list_add(&est->list, &est_list); - spin_unlock_bh(&est_lock); -} - -void ip_vs_kill_estimator(struct ip_vs_stats *stats) -{ - struct ip_vs_estimator *est = &stats->est; - - spin_lock_bh(&est_lock); - list_del(&est->list); - spin_unlock_bh(&est_lock); -} - -void ip_vs_zero_estimator(struct ip_vs_stats *stats) -{ - struct ip_vs_estimator *est = &stats->est; - - /* set counters zero, caller must hold the stats->lock lock */ - est->last_inbytes = 0; - est->last_outbytes = 0; - est->last_conns = 0; - est->last_inpkts = 0; - est->last_outpkts = 0; - est->cps = 0; - est->inpps = 0; - est->outpps = 0; - est->inbps = 0; - est->outbps = 0; -} - -int __init ip_vs_estimator_init(void) -{ - mod_timer(&est_timer, jiffies + 2 * HZ); - return 0; -} - -void ip_vs_estimator_cleanup(void) -{ - del_timer_sync(&est_timer); -} diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c deleted file mode 100644 index 2e7dbd8..0000000 --- a/net/ipv4/ipvs/ip_vs_ftp.c +++ /dev/null @@ -1,410 +0,0 @@ -/* - * ip_vs_ftp.c: IPVS ftp application module - * - * Authors: Wensong Zhang - * - * Changes: - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference - * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp. - * - * IP_MASQ_FTP ftp masquerading module - * - * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 - * - * Author: Wouter Gadeyne - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - - -#define SERVER_STRING "227 Entering Passive Mode (" -#define CLIENT_STRING "PORT " - - -/* - * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper - * First port is set to the default port. - */ -static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0}; -module_param_array(ports, ushort, NULL, 0); -MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); - - -/* Dummy variable */ -static int ip_vs_ftp_pasv; - - -static int -ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) -{ - return 0; -} - - -static int -ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) -{ - return 0; -} - - -/* - * Get from the string "xxx.xxx.xxx.xxx,ppp,ppp", started - * with the "pattern" and terminated with the "term" character. - * is in network order. - */ -static int ip_vs_ftp_get_addrport(char *data, char *data_limit, - const char *pattern, size_t plen, char term, - __be32 *addr, __be16 *port, - char **start, char **end) -{ - unsigned char p[6]; - int i = 0; - - if (data_limit - data < plen) { - /* check if there is partial match */ - if (strnicmp(data, pattern, data_limit - data) == 0) - return -1; - else - return 0; - } - - if (strnicmp(data, pattern, plen) != 0) { - return 0; - } - *start = data + plen; - - for (data = *start; *data != term; data++) { - if (data == data_limit) - return -1; - } - *end = data; - - memset(p, 0, sizeof(p)); - for (data = *start; data != *end; data++) { - if (*data >= '0' && *data <= '9') { - p[i] = p[i]*10 + *data - '0'; - } else if (*data == ',' && i < 5) { - i++; - } else { - /* unexpected character */ - return -1; - } - } - - if (i != 5) - return -1; - - *addr = get_unaligned((__be32 *)p); - *port = get_unaligned((__be16 *)(p + 4)); - return 1; -} - - -/* - * Look at outgoing ftp packets to catch the response to a PASV command - * from the server (inside-to-outside). - * When we see one, we build a connection entry with the client address, - * client port 0 (unknown at the moment), the server address and the - * server port. Mark the current connection entry as a control channel - * of the new entry. All this work is just to make the data connection - * can be scheduled to the right server later. - * - * The outgoing packet should be something like - * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". - * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. - */ -static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, - struct sk_buff *skb, int *diff) -{ - struct iphdr *iph; - struct tcphdr *th; - char *data, *data_limit; - char *start, *end; - union nf_inet_addr from; - __be16 port; - struct ip_vs_conn *n_cp; - char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ - unsigned buf_len; - int ret; - -#ifdef CONFIG_IP_VS_IPV6 - /* This application helper doesn't work with IPv6 yet, - * so turn this into a no-op for IPv6 packets - */ - if (cp->af == AF_INET6) - return 1; -#endif - - *diff = 0; - - /* Only useful for established sessions */ - if (cp->state != IP_VS_TCP_S_ESTABLISHED) - return 1; - - /* Linear packets are much easier to deal with. */ - if (!skb_make_writable(skb, skb->len)) - return 0; - - if (cp->app_data == &ip_vs_ftp_pasv) { - iph = ip_hdr(skb); - th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); - data = (char *)th + (th->doff << 2); - data_limit = skb_tail_pointer(skb); - - if (ip_vs_ftp_get_addrport(data, data_limit, - SERVER_STRING, - sizeof(SERVER_STRING)-1, ')', - &from.ip, &port, - &start, &end) != 1) - return 1; - - IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> " - "%u.%u.%u.%u:%d detected\n", - NIPQUAD(from.ip), ntohs(port), - NIPQUAD(cp->caddr.ip), 0); - - /* - * Now update or create an connection entry for it - */ - n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port, - &cp->caddr, 0); - if (!n_cp) { - n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, - &cp->caddr, 0, - &cp->vaddr, port, - &from, port, - IP_VS_CONN_F_NO_CPORT, - cp->dest); - if (!n_cp) - return 0; - - /* add its controller */ - ip_vs_control_add(n_cp, cp); - } - - /* - * Replace the old passive address with the new one - */ - from.ip = n_cp->vaddr.ip; - port = n_cp->vport; - sprintf(buf, "%d,%d,%d,%d,%d,%d", NIPQUAD(from.ip), - (ntohs(port)>>8)&255, ntohs(port)&255); - buf_len = strlen(buf); - - /* - * Calculate required delta-offset to keep TCP happy - */ - *diff = buf_len - (end-start); - - if (*diff == 0) { - /* simply replace it with new passive address */ - memcpy(start, buf, buf_len); - ret = 1; - } else { - ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start, - end-start, buf, buf_len); - } - - cp->app_data = NULL; - ip_vs_tcp_conn_listen(n_cp); - ip_vs_conn_put(n_cp); - return ret; - } - return 1; -} - - -/* - * Look at incoming ftp packets to catch the PASV/PORT command - * (outside-to-inside). - * - * The incoming packet having the PORT command should be something like - * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n". - * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number. - * In this case, we create a connection entry using the client address and - * port, so that the active ftp data connection from the server can reach - * the client. - */ -static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, - struct sk_buff *skb, int *diff) -{ - struct iphdr *iph; - struct tcphdr *th; - char *data, *data_start, *data_limit; - char *start, *end; - union nf_inet_addr to; - __be16 port; - struct ip_vs_conn *n_cp; - -#ifdef CONFIG_IP_VS_IPV6 - /* This application helper doesn't work with IPv6 yet, - * so turn this into a no-op for IPv6 packets - */ - if (cp->af == AF_INET6) - return 1; -#endif - - /* no diff required for incoming packets */ - *diff = 0; - - /* Only useful for established sessions */ - if (cp->state != IP_VS_TCP_S_ESTABLISHED) - return 1; - - /* Linear packets are much easier to deal with. */ - if (!skb_make_writable(skb, skb->len)) - return 0; - - /* - * Detecting whether it is passive - */ - iph = ip_hdr(skb); - th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); - - /* Since there may be OPTIONS in the TCP packet and the HLEN is - the length of the header in 32-bit multiples, it is accurate - to calculate data address by th+HLEN*4 */ - data = data_start = (char *)th + (th->doff << 2); - data_limit = skb_tail_pointer(skb); - - while (data <= data_limit - 6) { - if (strnicmp(data, "PASV\r\n", 6) == 0) { - /* Passive mode on */ - IP_VS_DBG(7, "got PASV at %td of %td\n", - data - data_start, - data_limit - data_start); - cp->app_data = &ip_vs_ftp_pasv; - return 1; - } - data++; - } - - /* - * To support virtual FTP server, the scenerio is as follows: - * FTP client ----> Load Balancer ----> FTP server - * First detect the port number in the application data, - * then create a new connection entry for the coming data - * connection. - */ - if (ip_vs_ftp_get_addrport(data_start, data_limit, - CLIENT_STRING, sizeof(CLIENT_STRING)-1, - '\r', &to.ip, &port, - &start, &end) != 1) - return 1; - - IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n", - NIPQUAD(to.ip), ntohs(port)); - - /* Passive mode off */ - cp->app_data = NULL; - - /* - * Now update or create a connection entry for it - */ - IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n", - ip_vs_proto_name(iph->protocol), - NIPQUAD(to.ip), ntohs(port), NIPQUAD(cp->vaddr.ip), 0); - - n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol, - &to, port, - &cp->vaddr, htons(ntohs(cp->vport)-1)); - if (!n_cp) { - n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, - &to, port, - &cp->vaddr, htons(ntohs(cp->vport)-1), - &cp->daddr, htons(ntohs(cp->dport)-1), - 0, - cp->dest); - if (!n_cp) - return 0; - - /* add its controller */ - ip_vs_control_add(n_cp, cp); - } - - /* - * Move tunnel to listen state - */ - ip_vs_tcp_conn_listen(n_cp); - ip_vs_conn_put(n_cp); - - return 1; -} - - -static struct ip_vs_app ip_vs_ftp = { - .name = "ftp", - .type = IP_VS_APP_TYPE_FTP, - .protocol = IPPROTO_TCP, - .module = THIS_MODULE, - .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list), - .init_conn = ip_vs_ftp_init_conn, - .done_conn = ip_vs_ftp_done_conn, - .bind_conn = NULL, - .unbind_conn = NULL, - .pkt_out = ip_vs_ftp_out, - .pkt_in = ip_vs_ftp_in, -}; - - -/* - * ip_vs_ftp initialization - */ -static int __init ip_vs_ftp_init(void) -{ - int i, ret; - struct ip_vs_app *app = &ip_vs_ftp; - - ret = register_ip_vs_app(app); - if (ret) - return ret; - - for (i=0; iprotocol, ports[i]); - if (ret) - break; - IP_VS_INFO("%s: loaded support on port[%d] = %d\n", - app->name, i, ports[i]); - } - - if (ret) - unregister_ip_vs_app(app); - - return ret; -} - - -/* - * ip_vs_ftp finish. - */ -static void __exit ip_vs_ftp_exit(void) -{ - unregister_ip_vs_app(&ip_vs_ftp); -} - - -module_init(ip_vs_ftp_init); -module_exit(ip_vs_ftp_exit); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c deleted file mode 100644 index 6ecef35..0000000 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ /dev/null @@ -1,555 +0,0 @@ -/* - * IPVS: Locality-Based Least-Connection scheduling module - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Martin Hamilton : fixed the terrible locking bugs - * *lock(tbl->lock) ==> *lock(&tbl->lock) - * Wensong Zhang : fixed the uninitilized tbl->lock bug - * Wensong Zhang : added doing full expiration check to - * collect stale entries of 24+ hours when - * no partial expire check in a half hour - * Julian Anastasov : replaced del_timer call with del_timer_sync - * to avoid the possible race between timer - * handler and del_timer thread in SMP - * - */ - -/* - * The lblc algorithm is as follows (pseudo code): - * - * if cachenode[dest_ip] is null then - * n, cachenode[dest_ip] <- {weighted least-conn node}; - * else - * n <- cachenode[dest_ip]; - * if (n is dead) OR - * (n.conns>n.weight AND - * there is a node m with m.conns -#include -#include -#include -#include - -/* for sysctl */ -#include -#include - -#include - - -/* - * It is for garbage collection of stale IPVS lblc entries, - * when the table is full. - */ -#define CHECK_EXPIRE_INTERVAL (60*HZ) -#define ENTRY_TIMEOUT (6*60*HZ) - -/* - * It is for full expiration check. - * When there is no partial expiration check (garbage collection) - * in a half hour, do a full expiration check to collect stale - * entries that haven't been touched for a day. - */ -#define COUNT_FOR_FULL_EXPIRATION 30 -static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; - - -/* - * for IPVS lblc entry hash table - */ -#ifndef CONFIG_IP_VS_LBLC_TAB_BITS -#define CONFIG_IP_VS_LBLC_TAB_BITS 10 -#endif -#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS -#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) -#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) - - -/* - * IPVS lblc entry represents an association between destination - * IP address and its destination server - */ -struct ip_vs_lblc_entry { - struct list_head list; - __be32 addr; /* destination IP address */ - struct ip_vs_dest *dest; /* real server (cache) */ - unsigned long lastuse; /* last used time */ -}; - - -/* - * IPVS lblc hash table - */ -struct ip_vs_lblc_table { - struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ - atomic_t entries; /* number of entries */ - int max_size; /* maximum size of entries */ - struct timer_list periodic_timer; /* collect stale entries */ - int rover; /* rover for expire check */ - int counter; /* counter for no expire */ -}; - - -/* - * IPVS LBLC sysctl table - */ - -static ctl_table vs_vars_table[] = { - { - .procname = "lblc_expiration", - .data = &sysctl_ip_vs_lblc_expiration, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { .ctl_name = 0 } -}; - -static struct ctl_table_header * sysctl_header; - -static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) -{ - list_del(&en->list); - /* - * We don't kfree dest because it is refered either by its service - * or the trash dest list. - */ - atomic_dec(&en->dest->refcnt); - kfree(en); -} - - -/* - * Returns hash value for IPVS LBLC entry - */ -static inline unsigned ip_vs_lblc_hashkey(__be32 addr) -{ - return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK; -} - - -/* - * Hash an entry in the ip_vs_lblc_table. - * returns bool success. - */ -static void -ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) -{ - unsigned hash = ip_vs_lblc_hashkey(en->addr); - - list_add(&en->list, &tbl->bucket[hash]); - atomic_inc(&tbl->entries); -} - - -/* - * Get ip_vs_lblc_entry associated with supplied parameters. Called under read - * lock - */ -static inline struct ip_vs_lblc_entry * -ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) -{ - unsigned hash = ip_vs_lblc_hashkey(addr); - struct ip_vs_lblc_entry *en; - - list_for_each_entry(en, &tbl->bucket[hash], list) - if (en->addr == addr) - return en; - - return NULL; -} - - -/* - * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP - * address to a server. Called under write lock. - */ -static inline struct ip_vs_lblc_entry * -ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr, - struct ip_vs_dest *dest) -{ - struct ip_vs_lblc_entry *en; - - en = ip_vs_lblc_get(tbl, daddr); - if (!en) { - en = kmalloc(sizeof(*en), GFP_ATOMIC); - if (!en) { - IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); - return NULL; - } - - en->addr = daddr; - en->lastuse = jiffies; - - atomic_inc(&dest->refcnt); - en->dest = dest; - - ip_vs_lblc_hash(tbl, en); - } else if (en->dest != dest) { - atomic_dec(&en->dest->refcnt); - atomic_inc(&dest->refcnt); - en->dest = dest; - } - - return en; -} - - -/* - * Flush all the entries of the specified table. - */ -static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) -{ - struct ip_vs_lblc_entry *en, *nxt; - int i; - - for (i=0; ibucket[i], list) { - ip_vs_lblc_free(en); - atomic_dec(&tbl->entries); - } - } -} - - -static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) -{ - struct ip_vs_lblc_table *tbl = svc->sched_data; - struct ip_vs_lblc_entry *en, *nxt; - unsigned long now = jiffies; - int i, j; - - for (i=0, j=tbl->rover; isched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_before(now, - en->lastuse + sysctl_ip_vs_lblc_expiration)) - continue; - - ip_vs_lblc_free(en); - atomic_dec(&tbl->entries); - } - write_unlock(&svc->sched_lock); - } - tbl->rover = j; -} - - -/* - * Periodical timer handler for IPVS lblc table - * It is used to collect stale entries when the number of entries - * exceeds the maximum size of the table. - * - * Fixme: we probably need more complicated algorithm to collect - * entries that have not been used for a long time even - * if the number of entries doesn't exceed the maximum size - * of the table. - * The full expiration check is for this purpose now. - */ -static void ip_vs_lblc_check_expire(unsigned long data) -{ - struct ip_vs_service *svc = (struct ip_vs_service *) data; - struct ip_vs_lblc_table *tbl = svc->sched_data; - unsigned long now = jiffies; - int goal; - int i, j; - struct ip_vs_lblc_entry *en, *nxt; - - if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { - /* do full expiration check */ - ip_vs_lblc_full_check(svc); - tbl->counter = 1; - goto out; - } - - if (atomic_read(&tbl->entries) <= tbl->max_size) { - tbl->counter++; - goto out; - } - - goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; - if (goal > tbl->max_size/2) - goal = tbl->max_size/2; - - for (i=0, j=tbl->rover; isched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) - continue; - - ip_vs_lblc_free(en); - atomic_dec(&tbl->entries); - goal--; - } - write_unlock(&svc->sched_lock); - if (goal <= 0) - break; - } - tbl->rover = j; - - out: - mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); -} - - -static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) -{ - int i; - struct ip_vs_lblc_table *tbl; - - /* - * Allocate the ip_vs_lblc_table for this service - */ - tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); - return -ENOMEM; - } - svc->sched_data = tbl; - IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " - "current service\n", sizeof(*tbl)); - - /* - * Initialize the hash buckets - */ - for (i=0; ibucket[i]); - } - tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; - tbl->rover = 0; - tbl->counter = 1; - - /* - * Hook periodic timer for garbage collection - */ - setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, - (unsigned long)svc); - mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); - - return 0; -} - - -static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) -{ - struct ip_vs_lblc_table *tbl = svc->sched_data; - - /* remove periodic timer */ - del_timer_sync(&tbl->periodic_timer); - - /* got to clean up table entries here */ - ip_vs_lblc_flush(tbl); - - /* release the table itself */ - kfree(tbl); - IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", - sizeof(*tbl)); - - return 0; -} - - -static inline struct ip_vs_dest * -__ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph) -{ - struct ip_vs_dest *dest, *least; - int loh, doh; - - /* - * We think the overhead of processing active connections is fifty - * times higher than that of inactive connections in average. (This - * fifty times might not be accurate, we will change it later.) We - * use the following formula to estimate the overhead: - * dest->activeconns*50 + dest->inactconns - * and the load: - * (dest overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connection. - */ - list_for_each_entry(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - if (atomic_read(&dest->weight) > 0) { - least = dest; - loh = atomic_read(&least->activeconns) * 50 - + atomic_read(&least->inactconns); - goto nextstage; - } - } - return NULL; - - /* - * Find the destination with the least load. - */ - nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr.ip), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -/* - * If this destination server is overloaded and there is a less loaded - * server, then return true. - */ -static inline int -is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) -{ - if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { - struct ip_vs_dest *d; - - list_for_each_entry(d, &svc->destinations, n_list) { - if (atomic_read(&d->activeconns)*2 - < atomic_read(&d->weight)) { - return 1; - } - } - } - return 0; -} - - -/* - * Locality-Based (weighted) Least-Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_lblc_table *tbl = svc->sched_data; - struct iphdr *iph = ip_hdr(skb); - struct ip_vs_dest *dest = NULL; - struct ip_vs_lblc_entry *en; - - IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); - - /* First look in our cache */ - read_lock(&svc->sched_lock); - en = ip_vs_lblc_get(tbl, iph->daddr); - if (en) { - /* We only hold a read lock, but this is atomic */ - en->lastuse = jiffies; - - /* - * If the destination is not available, i.e. it's in the trash, - * we must ignore it, as it may be removed from under our feet, - * if someone drops our reference count. Our caller only makes - * sure that destinations, that are not in the trash, are not - * moved to the trash, while we are scheduling. But anyone can - * free up entries from the trash at any time. - */ - - if (en->dest->flags & IP_VS_DEST_F_AVAILABLE) - dest = en->dest; - } - read_unlock(&svc->sched_lock); - - /* If the destination has a weight and is not overloaded, use it */ - if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) - goto out; - - /* No cache entry or it is invalid, time to schedule */ - dest = __ip_vs_lblc_schedule(svc, iph); - if (!dest) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - - /* If we fail to create a cache entry, we'll just use the valid dest */ - write_lock(&svc->sched_lock); - ip_vs_lblc_new(tbl, iph->daddr, dest); - write_unlock(&svc->sched_lock); - -out: - IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " - "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(iph->daddr), - NIPQUAD(dest->addr.ip), - ntohs(dest->port)); - - return dest; -} - - -/* - * IPVS LBLC Scheduler structure - */ -static struct ip_vs_scheduler ip_vs_lblc_scheduler = -{ - .name = "lblc", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 0, -#endif - .init_service = ip_vs_lblc_init_svc, - .done_service = ip_vs_lblc_done_svc, - .schedule = ip_vs_lblc_schedule, -}; - - -static int __init ip_vs_lblc_init(void) -{ - int ret; - - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); - ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); - if (ret) - unregister_sysctl_table(sysctl_header); - return ret; -} - - -static void __exit ip_vs_lblc_cleanup(void) -{ - unregister_sysctl_table(sysctl_header); - unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); -} - - -module_init(ip_vs_lblc_init); -module_exit(ip_vs_lblc_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c deleted file mode 100644 index 1f75ea8..0000000 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ /dev/null @@ -1,755 +0,0 @@ -/* - * IPVS: Locality-Based Least-Connection with Replication scheduler - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Julian Anastasov : Added the missing (dest->weight>0) - * condition in the ip_vs_dest_set_max. - * - */ - -/* - * The lblc/r algorithm is as follows (pseudo code): - * - * if serverSet[dest_ip] is null then - * n, serverSet[dest_ip] <- {weighted least-conn node}; - * else - * n <- {least-conn (alive) node in serverSet[dest_ip]}; - * if (n is null) OR - * (n.conns>n.weight AND - * there is a node m with m.conns 1 AND - * now - serverSet[dest_ip].lastMod > T then - * m <- {most conn node in serverSet[dest_ip]}; - * remove m from serverSet[dest_ip]; - * if serverSet[dest_ip] changed then - * serverSet[dest_ip].lastMod <- now; - * - * return n; - * - */ - -#include -#include -#include -#include -#include - -/* for sysctl */ -#include -#include -#include - -#include - - -/* - * It is for garbage collection of stale IPVS lblcr entries, - * when the table is full. - */ -#define CHECK_EXPIRE_INTERVAL (60*HZ) -#define ENTRY_TIMEOUT (6*60*HZ) - -/* - * It is for full expiration check. - * When there is no partial expiration check (garbage collection) - * in a half hour, do a full expiration check to collect stale - * entries that haven't been touched for a day. - */ -#define COUNT_FOR_FULL_EXPIRATION 30 -static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; - - -/* - * for IPVS lblcr entry hash table - */ -#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS -#define CONFIG_IP_VS_LBLCR_TAB_BITS 10 -#endif -#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS -#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) -#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) - - -/* - * IPVS destination set structure and operations - */ -struct ip_vs_dest_list { - struct ip_vs_dest_list *next; /* list link */ - struct ip_vs_dest *dest; /* destination server */ -}; - -struct ip_vs_dest_set { - atomic_t size; /* set size */ - unsigned long lastmod; /* last modified time */ - struct ip_vs_dest_list *list; /* destination list */ - rwlock_t lock; /* lock for this list */ -}; - - -static struct ip_vs_dest_list * -ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) -{ - struct ip_vs_dest_list *e; - - for (e=set->list; e!=NULL; e=e->next) { - if (e->dest == dest) - /* already existed */ - return NULL; - } - - e = kmalloc(sizeof(*e), GFP_ATOMIC); - if (e == NULL) { - IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n"); - return NULL; - } - - atomic_inc(&dest->refcnt); - e->dest = dest; - - /* link it to the list */ - e->next = set->list; - set->list = e; - atomic_inc(&set->size); - - set->lastmod = jiffies; - return e; -} - -static void -ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) -{ - struct ip_vs_dest_list *e, **ep; - - for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { - if (e->dest == dest) { - /* HIT */ - *ep = e->next; - atomic_dec(&set->size); - set->lastmod = jiffies; - atomic_dec(&e->dest->refcnt); - kfree(e); - break; - } - ep = &e->next; - } -} - -static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) -{ - struct ip_vs_dest_list *e, **ep; - - write_lock(&set->lock); - for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { - *ep = e->next; - /* - * We don't kfree dest because it is refered either - * by its service or by the trash dest list. - */ - atomic_dec(&e->dest->refcnt); - kfree(e); - } - write_unlock(&set->lock); -} - -/* get weighted least-connection node in the destination set */ -static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) -{ - register struct ip_vs_dest_list *e; - struct ip_vs_dest *dest, *least; - int loh, doh; - - if (set == NULL) - return NULL; - - /* select the first destination server, whose weight > 0 */ - for (e=set->list; e!=NULL; e=e->next) { - least = e->dest; - if (least->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - if ((atomic_read(&least->weight) > 0) - && (least->flags & IP_VS_DEST_F_AVAILABLE)) { - loh = atomic_read(&least->activeconns) * 50 - + atomic_read(&least->inactconns); - goto nextstage; - } - } - return NULL; - - /* find the destination with the weighted least load */ - nextstage: - for (e=e->next; e!=NULL; e=e->next) { - dest = e->dest; - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - if ((loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) - && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr.ip), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - return least; -} - - -/* get weighted most-connection node in the destination set */ -static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) -{ - register struct ip_vs_dest_list *e; - struct ip_vs_dest *dest, *most; - int moh, doh; - - if (set == NULL) - return NULL; - - /* select the first destination server, whose weight > 0 */ - for (e=set->list; e!=NULL; e=e->next) { - most = e->dest; - if (atomic_read(&most->weight) > 0) { - moh = atomic_read(&most->activeconns) * 50 - + atomic_read(&most->inactconns); - goto nextstage; - } - } - return NULL; - - /* find the destination with the weighted most load */ - nextstage: - for (e=e->next; e!=NULL; e=e->next) { - dest = e->dest; - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ - if ((moh * atomic_read(&dest->weight) < - doh * atomic_read(&most->weight)) - && (atomic_read(&dest->weight) > 0)) { - most = dest; - moh = doh; - } - } - - IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(most->addr.ip), ntohs(most->port), - atomic_read(&most->activeconns), - atomic_read(&most->refcnt), - atomic_read(&most->weight), moh); - return most; -} - - -/* - * IPVS lblcr entry represents an association between destination - * IP address and its destination server set - */ -struct ip_vs_lblcr_entry { - struct list_head list; - __be32 addr; /* destination IP address */ - struct ip_vs_dest_set set; /* destination server set */ - unsigned long lastuse; /* last used time */ -}; - - -/* - * IPVS lblcr hash table - */ -struct ip_vs_lblcr_table { - struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ - atomic_t entries; /* number of entries */ - int max_size; /* maximum size of entries */ - struct timer_list periodic_timer; /* collect stale entries */ - int rover; /* rover for expire check */ - int counter; /* counter for no expire */ -}; - - -/* - * IPVS LBLCR sysctl table - */ - -static ctl_table vs_vars_table[] = { - { - .procname = "lblcr_expiration", - .data = &sysctl_ip_vs_lblcr_expiration, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { .ctl_name = 0 } -}; - -static struct ctl_table_header * sysctl_header; - -static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) -{ - list_del(&en->list); - ip_vs_dest_set_eraseall(&en->set); - kfree(en); -} - - -/* - * Returns hash value for IPVS LBLCR entry - */ -static inline unsigned ip_vs_lblcr_hashkey(__be32 addr) -{ - return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; -} - - -/* - * Hash an entry in the ip_vs_lblcr_table. - * returns bool success. - */ -static void -ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) -{ - unsigned hash = ip_vs_lblcr_hashkey(en->addr); - - list_add(&en->list, &tbl->bucket[hash]); - atomic_inc(&tbl->entries); -} - - -/* - * Get ip_vs_lblcr_entry associated with supplied parameters. Called under - * read lock. - */ -static inline struct ip_vs_lblcr_entry * -ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr) -{ - unsigned hash = ip_vs_lblcr_hashkey(addr); - struct ip_vs_lblcr_entry *en; - - list_for_each_entry(en, &tbl->bucket[hash], list) - if (en->addr == addr) - return en; - - return NULL; -} - - -/* - * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination - * IP address to a server. Called under write lock. - */ -static inline struct ip_vs_lblcr_entry * -ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, __be32 daddr, - struct ip_vs_dest *dest) -{ - struct ip_vs_lblcr_entry *en; - - en = ip_vs_lblcr_get(tbl, daddr); - if (!en) { - en = kmalloc(sizeof(*en), GFP_ATOMIC); - if (!en) { - IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); - return NULL; - } - - en->addr = daddr; - en->lastuse = jiffies; - - /* initilize its dest set */ - atomic_set(&(en->set.size), 0); - en->set.list = NULL; - rwlock_init(&en->set.lock); - - ip_vs_lblcr_hash(tbl, en); - } - - write_lock(&en->set.lock); - ip_vs_dest_set_insert(&en->set, dest); - write_unlock(&en->set.lock); - - return en; -} - - -/* - * Flush all the entries of the specified table. - */ -static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) -{ - int i; - struct ip_vs_lblcr_entry *en, *nxt; - - /* No locking required, only called during cleanup. */ - for (i=0; ibucket[i], list) { - ip_vs_lblcr_free(en); - } - } -} - - -static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) -{ - struct ip_vs_lblcr_table *tbl = svc->sched_data; - unsigned long now = jiffies; - int i, j; - struct ip_vs_lblcr_entry *en, *nxt; - - for (i=0, j=tbl->rover; isched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration, - now)) - continue; - - ip_vs_lblcr_free(en); - atomic_dec(&tbl->entries); - } - write_unlock(&svc->sched_lock); - } - tbl->rover = j; -} - - -/* - * Periodical timer handler for IPVS lblcr table - * It is used to collect stale entries when the number of entries - * exceeds the maximum size of the table. - * - * Fixme: we probably need more complicated algorithm to collect - * entries that have not been used for a long time even - * if the number of entries doesn't exceed the maximum size - * of the table. - * The full expiration check is for this purpose now. - */ -static void ip_vs_lblcr_check_expire(unsigned long data) -{ - struct ip_vs_service *svc = (struct ip_vs_service *) data; - struct ip_vs_lblcr_table *tbl = svc->sched_data; - unsigned long now = jiffies; - int goal; - int i, j; - struct ip_vs_lblcr_entry *en, *nxt; - - if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { - /* do full expiration check */ - ip_vs_lblcr_full_check(svc); - tbl->counter = 1; - goto out; - } - - if (atomic_read(&tbl->entries) <= tbl->max_size) { - tbl->counter++; - goto out; - } - - goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; - if (goal > tbl->max_size/2) - goal = tbl->max_size/2; - - for (i=0, j=tbl->rover; isched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) - continue; - - ip_vs_lblcr_free(en); - atomic_dec(&tbl->entries); - goal--; - } - write_unlock(&svc->sched_lock); - if (goal <= 0) - break; - } - tbl->rover = j; - - out: - mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); -} - -static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) -{ - int i; - struct ip_vs_lblcr_table *tbl; - - /* - * Allocate the ip_vs_lblcr_table for this service - */ - tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n"); - return -ENOMEM; - } - svc->sched_data = tbl; - IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " - "current service\n", sizeof(*tbl)); - - /* - * Initialize the hash buckets - */ - for (i=0; ibucket[i]); - } - tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; - tbl->rover = 0; - tbl->counter = 1; - - /* - * Hook periodic timer for garbage collection - */ - setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire, - (unsigned long)svc); - mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); - - return 0; -} - - -static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) -{ - struct ip_vs_lblcr_table *tbl = svc->sched_data; - - /* remove periodic timer */ - del_timer_sync(&tbl->periodic_timer); - - /* got to clean up table entries here */ - ip_vs_lblcr_flush(tbl); - - /* release the table itself */ - kfree(tbl); - IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", - sizeof(*tbl)); - - return 0; -} - - -static inline struct ip_vs_dest * -__ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph) -{ - struct ip_vs_dest *dest, *least; - int loh, doh; - - /* - * We think the overhead of processing active connections is fifty - * times higher than that of inactive connections in average. (This - * fifty times might not be accurate, we will change it later.) We - * use the following formula to estimate the overhead: - * dest->activeconns*50 + dest->inactconns - * and the load: - * (dest overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connection. - */ - list_for_each_entry(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - if (atomic_read(&dest->weight) > 0) { - least = dest; - loh = atomic_read(&least->activeconns) * 50 - + atomic_read(&least->inactconns); - goto nextstage; - } - } - return NULL; - - /* - * Find the destination with the least load. - */ - nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr.ip), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -/* - * If this destination server is overloaded and there is a less loaded - * server, then return true. - */ -static inline int -is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) -{ - if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { - struct ip_vs_dest *d; - - list_for_each_entry(d, &svc->destinations, n_list) { - if (atomic_read(&d->activeconns)*2 - < atomic_read(&d->weight)) { - return 1; - } - } - } - return 0; -} - - -/* - * Locality-Based (weighted) Least-Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_lblcr_table *tbl = svc->sched_data; - struct iphdr *iph = ip_hdr(skb); - struct ip_vs_dest *dest = NULL; - struct ip_vs_lblcr_entry *en; - - IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); - - /* First look in our cache */ - read_lock(&svc->sched_lock); - en = ip_vs_lblcr_get(tbl, iph->daddr); - if (en) { - /* We only hold a read lock, but this is atomic */ - en->lastuse = jiffies; - - /* Get the least loaded destination */ - read_lock(&en->set.lock); - dest = ip_vs_dest_set_min(&en->set); - read_unlock(&en->set.lock); - - /* More than one destination + enough time passed by, cleanup */ - if (atomic_read(&en->set.size) > 1 && - time_after(jiffies, en->set.lastmod + - sysctl_ip_vs_lblcr_expiration)) { - struct ip_vs_dest *m; - - write_lock(&en->set.lock); - m = ip_vs_dest_set_max(&en->set); - if (m) - ip_vs_dest_set_erase(&en->set, m); - write_unlock(&en->set.lock); - } - - /* If the destination is not overloaded, use it */ - if (dest && !is_overloaded(dest, svc)) { - read_unlock(&svc->sched_lock); - goto out; - } - - /* The cache entry is invalid, time to schedule */ - dest = __ip_vs_lblcr_schedule(svc, iph); - if (!dest) { - IP_VS_DBG(1, "no destination available\n"); - read_unlock(&svc->sched_lock); - return NULL; - } - - /* Update our cache entry */ - write_lock(&en->set.lock); - ip_vs_dest_set_insert(&en->set, dest); - write_unlock(&en->set.lock); - } - read_unlock(&svc->sched_lock); - - if (dest) - goto out; - - /* No cache entry, time to schedule */ - dest = __ip_vs_lblcr_schedule(svc, iph); - if (!dest) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - - /* If we fail to create a cache entry, we'll just use the valid dest */ - write_lock(&svc->sched_lock); - ip_vs_lblcr_new(tbl, iph->daddr, dest); - write_unlock(&svc->sched_lock); - -out: - IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " - "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(iph->daddr), - NIPQUAD(dest->addr.ip), - ntohs(dest->port)); - - return dest; -} - - -/* - * IPVS LBLCR Scheduler structure - */ -static struct ip_vs_scheduler ip_vs_lblcr_scheduler = -{ - .name = "lblcr", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 0, -#endif - .init_service = ip_vs_lblcr_init_svc, - .done_service = ip_vs_lblcr_done_svc, - .schedule = ip_vs_lblcr_schedule, -}; - - -static int __init ip_vs_lblcr_init(void) -{ - int ret; - - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); - ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); - if (ret) - unregister_sysctl_table(sysctl_header); - return ret; -} - - -static void __exit ip_vs_lblcr_cleanup(void) -{ - unregister_sysctl_table(sysctl_header); - unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); -} - - -module_init(ip_vs_lblcr_init); -module_exit(ip_vs_lblcr_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c deleted file mode 100644 index b69f808..0000000 --- a/net/ipv4/ipvs/ip_vs_lc.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * IPVS: Least-Connection Scheduling module - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Wensong Zhang : added the ip_vs_lc_update_svc - * Wensong Zhang : added any dest with weight=0 is quiesced - * - */ - -#include -#include - -#include - - -static inline unsigned int -ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We think the overhead of processing active connections is 256 - * times higher than that of inactive connections in average. (This - * 256 times might not be accurate, we will change it later) We - * use the following formula to estimate the overhead now: - * dest->activeconns*256 + dest->inactconns - */ - return (atomic_read(&dest->activeconns) << 8) + - atomic_read(&dest->inactconns); -} - - -/* - * Least Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest, *least = NULL; - unsigned int loh = 0, doh; - - IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n"); - - /* - * Simply select the server with the least number of - * (activeconns<<5) + inactconns - * Except whose weight is equal to zero. - * If the weight is equal to zero, it means that the server is - * quiesced, the existing connections to the server still get - * served, but no new connection is assigned to the server. - */ - - list_for_each_entry(dest, &svc->destinations, n_list) { - if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || - atomic_read(&dest->weight) == 0) - continue; - doh = ip_vs_lc_dest_overhead(dest); - if (!least || doh < loh) { - least = dest; - loh = doh; - } - } - - if (least) - IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d inactconns %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->inactconns)); - - return least; -} - - -static struct ip_vs_scheduler ip_vs_lc_scheduler = { - .name = "lc", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 1, -#endif - .schedule = ip_vs_lc_schedule, -}; - - -static int __init ip_vs_lc_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ; -} - -static void __exit ip_vs_lc_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); -} - -module_init(ip_vs_lc_init); -module_exit(ip_vs_lc_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c deleted file mode 100644 index 9a2d803..0000000 --- a/net/ipv4/ipvs/ip_vs_nq.c +++ /dev/null @@ -1,138 +0,0 @@ -/* - * IPVS: Never Queue scheduling module - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -/* - * The NQ algorithm adopts a two-speed model. When there is an idle server - * available, the job will be sent to the idle server, instead of waiting - * for a fast one. When there is no idle server available, the job will be - * sent to the server that minimize its expected delay (The Shortest - * Expected Delay scheduling algorithm). - * - * See the following paper for more information: - * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing - * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, - * pages 986-994, 1988. - * - * Thanks must go to Marko Buuri for talking NQ to me. - * - * The difference between NQ and SED is that NQ can improve overall - * system utilization. - * - */ - -#include -#include - -#include - - -static inline unsigned int -ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We only use the active connection number in the cost - * calculation here. - */ - return atomic_read(&dest->activeconns) + 1; -} - - -/* - * Weighted Least Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest, *least = NULL; - unsigned int loh = 0, doh; - - IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n"); - - /* - * We calculate the load of each dest server as follows: - * (server expected overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connections. - */ - - list_for_each_entry(dest, &svc->destinations, n_list) { - - if (dest->flags & IP_VS_DEST_F_OVERLOAD || - !atomic_read(&dest->weight)) - continue; - - doh = ip_vs_nq_dest_overhead(dest); - - /* return the server directly if it is idle */ - if (atomic_read(&dest->activeconns) == 0) { - least = dest; - loh = doh; - goto out; - } - - if (!least || - (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight))) { - least = dest; - loh = doh; - } - } - - if (!least) - return NULL; - - out: - IP_VS_DBG_BUF(6, "NQ: server %s:%u " - "activeconns %d refcnt %d weight %d overhead %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -static struct ip_vs_scheduler ip_vs_nq_scheduler = -{ - .name = "nq", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 1, -#endif - .schedule = ip_vs_nq_schedule, -}; - - -static int __init ip_vs_nq_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_nq_scheduler); -} - -static void __exit ip_vs_nq_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); -} - -module_init(ip_vs_nq_init); -module_exit(ip_vs_nq_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c deleted file mode 100644 index 0791f9e..0000000 --- a/net/ipv4/ipvs/ip_vs_proto.c +++ /dev/null @@ -1,288 +0,0 @@ -/* - * ip_vs_proto.c: transport protocol load balancing support for IPVS - * - * Authors: Wensong Zhang - * Julian Anastasov - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - - -/* - * IPVS protocols can only be registered/unregistered when the ipvs - * module is loaded/unloaded, so no lock is needed in accessing the - * ipvs protocol table. - */ - -#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ -#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1)) - -static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; - - -/* - * register an ipvs protocol - */ -static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) -{ - unsigned hash = IP_VS_PROTO_HASH(pp->protocol); - - pp->next = ip_vs_proto_table[hash]; - ip_vs_proto_table[hash] = pp; - - if (pp->init != NULL) - pp->init(pp); - - return 0; -} - - -/* - * unregister an ipvs protocol - */ -static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) -{ - struct ip_vs_protocol **pp_p; - unsigned hash = IP_VS_PROTO_HASH(pp->protocol); - - pp_p = &ip_vs_proto_table[hash]; - for (; *pp_p; pp_p = &(*pp_p)->next) { - if (*pp_p == pp) { - *pp_p = pp->next; - if (pp->exit != NULL) - pp->exit(pp); - return 0; - } - } - - return -ESRCH; -} - - -/* - * get ip_vs_protocol object by its proto. - */ -struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) -{ - struct ip_vs_protocol *pp; - unsigned hash = IP_VS_PROTO_HASH(proto); - - for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { - if (pp->protocol == proto) - return pp; - } - - return NULL; -} - - -/* - * Propagate event for state change to all protocols - */ -void ip_vs_protocol_timeout_change(int flags) -{ - struct ip_vs_protocol *pp; - int i; - - for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { - for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) { - if (pp->timeout_change) - pp->timeout_change(pp, flags); - } - } -} - - -int * -ip_vs_create_timeout_table(int *table, int size) -{ - return kmemdup(table, size, GFP_ATOMIC); -} - - -/* - * Set timeout value for state specified by name - */ -int -ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to) -{ - int i; - - if (!table || !name || !to) - return -EINVAL; - - for (i = 0; i < num; i++) { - if (strcmp(names[i], name)) - continue; - table[i] = to * HZ; - return 0; - } - return -ENOENT; -} - - -const char * ip_vs_state_name(__u16 proto, int state) -{ - struct ip_vs_protocol *pp = ip_vs_proto_get(proto); - - if (pp == NULL || pp->state_name == NULL) - return (IPPROTO_IP == proto) ? "NONE" : "ERR!"; - return pp->state_name(state); -} - - -static void -ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, - const struct sk_buff *skb, - int offset, - const char *msg) -{ - char buf[128]; - struct iphdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else if (ih->frag_off & htons(IP_OFFSET)) - sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", - pp->name, NIPQUAD(ih->saddr), - NIPQUAD(ih->daddr)); - else { - __be16 _ports[2], *pptr -; - pptr = skb_header_pointer(skb, offset + ih->ihl*4, - sizeof(_ports), _ports); - if (pptr == NULL) - sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u", - pp->name, - NIPQUAD(ih->saddr), - NIPQUAD(ih->daddr)); - else - sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u", - pp->name, - NIPQUAD(ih->saddr), - ntohs(pptr[0]), - NIPQUAD(ih->daddr), - ntohs(pptr[1])); - } - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} - -#ifdef CONFIG_IP_VS_IPV6 -static void -ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, - const struct sk_buff *skb, - int offset, - const char *msg) -{ - char buf[192]; - struct ipv6hdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else if (ih->nexthdr == IPPROTO_FRAGMENT) - sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT " frag", - pp->name, NIP6(ih->saddr), - NIP6(ih->daddr)); - else { - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr), - sizeof(_ports), _ports); - if (pptr == NULL) - sprintf(buf, "%s TRUNCATED " NIP6_FMT "->" NIP6_FMT, - pp->name, - NIP6(ih->saddr), - NIP6(ih->daddr)); - else - sprintf(buf, "%s " NIP6_FMT ":%u->" NIP6_FMT ":%u", - pp->name, - NIP6(ih->saddr), - ntohs(pptr[0]), - NIP6(ih->daddr), - ntohs(pptr[1])); - } - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} -#endif - - -void -ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, - const struct sk_buff *skb, - int offset, - const char *msg) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (skb->protocol == htons(ETH_P_IPV6)) - ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); - else -#endif - ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg); -} - - -int __init ip_vs_protocol_init(void) -{ - char protocols[64]; -#define REGISTER_PROTOCOL(p) \ - do { \ - register_ip_vs_protocol(p); \ - strcat(protocols, ", "); \ - strcat(protocols, (p)->name); \ - } while (0) - - protocols[0] = '\0'; - protocols[2] = '\0'; -#ifdef CONFIG_IP_VS_PROTO_TCP - REGISTER_PROTOCOL(&ip_vs_protocol_tcp); -#endif -#ifdef CONFIG_IP_VS_PROTO_UDP - REGISTER_PROTOCOL(&ip_vs_protocol_udp); -#endif -#ifdef CONFIG_IP_VS_PROTO_AH - REGISTER_PROTOCOL(&ip_vs_protocol_ah); -#endif -#ifdef CONFIG_IP_VS_PROTO_ESP - REGISTER_PROTOCOL(&ip_vs_protocol_esp); -#endif - IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]); - - return 0; -} - - -void ip_vs_protocol_cleanup(void) -{ - struct ip_vs_protocol *pp; - int i; - - /* unregister all the ipvs protocols */ - for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { - while ((pp = ip_vs_proto_table[i]) != NULL) - unregister_ip_vs_protocol(pp); - } -} diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c deleted file mode 100644 index 80ab0c8..0000000 --- a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c +++ /dev/null @@ -1,235 +0,0 @@ -/* - * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS - * - * Authors: Julian Anastasov , February 2002 - * Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation; - * - */ - -#include -#include -#include -#include -#include -#include - -#include - - -/* TODO: - -struct isakmp_hdr { - __u8 icookie[8]; - __u8 rcookie[8]; - __u8 np; - __u8 version; - __u8 xchgtype; - __u8 flags; - __u32 msgid; - __u32 length; -}; - -*/ - -#define PORT_ISAKMP 500 - - -static struct ip_vs_conn * -ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(af, IPPROTO_UDP, - &iph->saddr, - htons(PORT_ISAKMP), - &iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_in_get(af, IPPROTO_UDP, - &iph->daddr, - htons(PORT_ISAKMP), - &iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - /* - * We are not sure if the packet is from our - * service, so our conn_schedule hook should return NF_ACCEPT - */ - IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " - "%s%s %s->%s\n", - inverse ? "ICMP+" : "", - pp->name, - IP_VS_DBG_ADDR(af, &iph->saddr), - IP_VS_DBG_ADDR(af, &iph->daddr)); - } - - return cp; -} - - -static struct ip_vs_conn * -ah_esp_conn_out_get(int af, const struct sk_buff *skb, - struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, - unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(af, IPPROTO_UDP, - &iph->saddr, - htons(PORT_ISAKMP), - &iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_out_get(af, IPPROTO_UDP, - &iph->daddr, - htons(PORT_ISAKMP), - &iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " - "%s%s %s->%s\n", - inverse ? "ICMP+" : "", - pp->name, - IP_VS_DBG_ADDR(af, &iph->saddr), - IP_VS_DBG_ADDR(af, &iph->daddr)); - } - - return cp; -} - - -static int -ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) -{ - /* - * AH/ESP is only related traffic. Pass the packet to IP stack. - */ - *verdict = NF_ACCEPT; - return 0; -} - - -static void -ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct iphdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else - sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", - pp->name, NIPQUAD(ih->saddr), - NIPQUAD(ih->daddr)); - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} - -#ifdef CONFIG_IP_VS_IPV6 -static void -ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct ipv6hdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else - sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT, - pp->name, NIP6(ih->saddr), - NIP6(ih->daddr)); - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} -#endif - -static void -ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (skb->protocol == htons(ETH_P_IPV6)) - ah_esp_debug_packet_v6(pp, skb, offset, msg); - else -#endif - ah_esp_debug_packet_v4(pp, skb, offset, msg); -} - - -static void ah_esp_init(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -static void ah_esp_exit(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -#ifdef CONFIG_IP_VS_PROTO_AH -struct ip_vs_protocol ip_vs_protocol_ah = { - .name = "AH", - .protocol = IPPROTO_AH, - .num_states = 1, - .dont_defrag = 1, - .init = ah_esp_init, - .exit = ah_esp_exit, - .conn_schedule = ah_esp_conn_schedule, - .conn_in_get = ah_esp_conn_in_get, - .conn_out_get = ah_esp_conn_out_get, - .snat_handler = NULL, - .dnat_handler = NULL, - .csum_check = NULL, - .state_transition = NULL, - .register_app = NULL, - .unregister_app = NULL, - .app_conn_bind = NULL, - .debug_packet = ah_esp_debug_packet, - .timeout_change = NULL, /* ISAKMP */ - .set_state_timeout = NULL, -}; -#endif - -#ifdef CONFIG_IP_VS_PROTO_ESP -struct ip_vs_protocol ip_vs_protocol_esp = { - .name = "ESP", - .protocol = IPPROTO_ESP, - .num_states = 1, - .dont_defrag = 1, - .init = ah_esp_init, - .exit = ah_esp_exit, - .conn_schedule = ah_esp_conn_schedule, - .conn_in_get = ah_esp_conn_in_get, - .conn_out_get = ah_esp_conn_out_get, - .snat_handler = NULL, - .dnat_handler = NULL, - .csum_check = NULL, - .state_transition = NULL, - .register_app = NULL, - .unregister_app = NULL, - .app_conn_bind = NULL, - .debug_packet = ah_esp_debug_packet, - .timeout_change = NULL, /* ISAKMP */ -}; -#endif diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c deleted file mode 100644 index dd4566e..0000000 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ /dev/null @@ -1,732 +0,0 @@ -/* - * ip_vs_proto_tcp.c: TCP load balancing support for IPVS - * - * Authors: Wensong Zhang - * Julian Anastasov - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include -#include -#include /* for tcphdr */ -#include -#include /* for csum_tcpudp_magic */ -#include -#include -#include - -#include - - -static struct ip_vs_conn * -tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - return ip_vs_conn_in_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - return ip_vs_conn_in_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } -} - -static struct ip_vs_conn * -tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - return ip_vs_conn_out_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - return ip_vs_conn_out_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } -} - - -static int -tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) -{ - struct ip_vs_service *svc; - struct tcphdr _tcph, *th; - struct ip_vs_iphdr iph; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - - th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph); - if (th == NULL) { - *verdict = NF_DROP; - return 0; - } - - if (th->syn && - (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, - th->dest))) { - if (ip_vs_todrop()) { - /* - * It seems that we are very loaded. - * We have to drop this packet :( - */ - ip_vs_service_put(svc); - *verdict = NF_DROP; - return 0; - } - - /* - * Let the virtual server select a real server for the - * incoming connection, and create a connection entry. - */ - *cpp = ip_vs_schedule(svc, skb); - if (!*cpp) { - *verdict = ip_vs_leave(svc, skb, pp); - return 0; - } - ip_vs_service_put(svc); - } - return 1; -} - - -static inline void -tcp_fast_csum_update(int af, struct tcphdr *tcph, - const union nf_inet_addr *oldip, - const union nf_inet_addr *newip, - __be16 oldport, __be16 newport) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - tcph->check = - csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, - ip_vs_check_diff2(oldport, newport, - ~csum_unfold(tcph->check)))); - else -#endif - tcph->check = - csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, - ip_vs_check_diff2(oldport, newport, - ~csum_unfold(tcph->check)))); -} - - -static inline void -tcp_partial_csum_update(int af, struct tcphdr *tcph, - const union nf_inet_addr *oldip, - const union nf_inet_addr *newip, - __be16 oldlen, __be16 newlen) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - tcph->check = - csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, - ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(tcph->check)))); - else -#endif - tcph->check = - csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, - ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(tcph->check)))); -} - - -static int -tcp_snat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) -{ - struct tcphdr *tcph; - unsigned int tcphoff; - int oldlen; - -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - tcphoff = sizeof(struct ipv6hdr); - else -#endif - tcphoff = ip_hdrlen(skb); - oldlen = skb->len - tcphoff; - - /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) - return 0; - - if (unlikely(cp->app != NULL)) { - /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) - return 0; - - /* Call application helper if needed */ - if (!ip_vs_app_pkt_out(cp, skb)) - return 0; - } - - tcph = (void *)skb_network_header(skb) + tcphoff; - tcph->source = cp->vport; - - /* Adjust TCP checksums */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, - htonl(oldlen), - htonl(skb->len - tcphoff)); - } else if (!cp->app) { - /* Only port and addr are changed, do fast csum update */ - tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, - cp->dport, cp->vport); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - } else { - /* full checksum calculation */ - tcph->check = 0; - skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - tcph->check = csum_ipv6_magic(&cp->vaddr.in6, - &cp->caddr.in6, - skb->len - tcphoff, - cp->protocol, skb->csum); - else -#endif - tcph->check = csum_tcpudp_magic(cp->vaddr.ip, - cp->caddr.ip, - skb->len - tcphoff, - cp->protocol, - skb->csum); - - IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", - pp->name, tcph->check, - (char*)&(tcph->check) - (char*)tcph); - } - return 1; -} - - -static int -tcp_dnat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) -{ - struct tcphdr *tcph; - unsigned int tcphoff; - int oldlen; - -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - tcphoff = sizeof(struct ipv6hdr); - else -#endif - tcphoff = ip_hdrlen(skb); - oldlen = skb->len - tcphoff; - - /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) - return 0; - - if (unlikely(cp->app != NULL)) { - /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) - return 0; - - /* - * Attempt ip_vs_app call. - * It will fix ip_vs_conn and iph ack_seq stuff - */ - if (!ip_vs_app_pkt_in(cp, skb)) - return 0; - } - - tcph = (void *)skb_network_header(skb) + tcphoff; - tcph->dest = cp->dport; - - /* - * Adjust TCP checksums - */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, - htonl(oldlen), - htonl(skb->len - tcphoff)); - } else if (!cp->app) { - /* Only port and addr are changed, do fast csum update */ - tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, - cp->vport, cp->dport); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - } else { - /* full checksum calculation */ - tcph->check = 0; - skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - tcph->check = csum_ipv6_magic(&cp->caddr.in6, - &cp->daddr.in6, - skb->len - tcphoff, - cp->protocol, skb->csum); - else -#endif - tcph->check = csum_tcpudp_magic(cp->caddr.ip, - cp->daddr.ip, - skb->len - tcphoff, - cp->protocol, - skb->csum); - skb->ip_summed = CHECKSUM_UNNECESSARY; - } - return 1; -} - - -static int -tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) -{ - unsigned int tcphoff; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - tcphoff = sizeof(struct ipv6hdr); - else -#endif - tcphoff = ip_hdrlen(skb); - - switch (skb->ip_summed) { - case CHECKSUM_NONE: - skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); - case CHECKSUM_COMPLETE: -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, - skb->len - tcphoff, - ipv6_hdr(skb)->nexthdr, - skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "Failed checksum for"); - return 0; - } - } else -#endif - if (csum_tcpudp_magic(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, - skb->len - tcphoff, - ip_hdr(skb)->protocol, - skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "Failed checksum for"); - return 0; - } - break; - default: - /* No need to checksum. */ - break; - } - - return 1; -} - - -#define TCP_DIR_INPUT 0 -#define TCP_DIR_OUTPUT 4 -#define TCP_DIR_INPUT_ONLY 8 - -static const int tcp_state_off[IP_VS_DIR_LAST] = { - [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, - [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, - [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, -}; - -/* - * Timeout table[state] - */ -static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { - [IP_VS_TCP_S_NONE] = 2*HZ, - [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, - [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, - [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, - [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, - [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, - [IP_VS_TCP_S_CLOSE] = 10*HZ, - [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, - [IP_VS_TCP_S_LAST_ACK] = 30*HZ, - [IP_VS_TCP_S_LISTEN] = 2*60*HZ, - [IP_VS_TCP_S_SYNACK] = 120*HZ, - [IP_VS_TCP_S_LAST] = 2*HZ, -}; - -static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { - [IP_VS_TCP_S_NONE] = "NONE", - [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", - [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", - [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", - [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", - [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", - [IP_VS_TCP_S_CLOSE] = "CLOSE", - [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", - [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", - [IP_VS_TCP_S_LISTEN] = "LISTEN", - [IP_VS_TCP_S_SYNACK] = "SYNACK", - [IP_VS_TCP_S_LAST] = "BUG!", -}; - -#define sNO IP_VS_TCP_S_NONE -#define sES IP_VS_TCP_S_ESTABLISHED -#define sSS IP_VS_TCP_S_SYN_SENT -#define sSR IP_VS_TCP_S_SYN_RECV -#define sFW IP_VS_TCP_S_FIN_WAIT -#define sTW IP_VS_TCP_S_TIME_WAIT -#define sCL IP_VS_TCP_S_CLOSE -#define sCW IP_VS_TCP_S_CLOSE_WAIT -#define sLA IP_VS_TCP_S_LAST_ACK -#define sLI IP_VS_TCP_S_LISTEN -#define sSA IP_VS_TCP_S_SYNACK - -struct tcp_states_t { - int next_state[IP_VS_TCP_S_LAST]; -}; - -static const char * tcp_state_name(int state) -{ - if (state >= IP_VS_TCP_S_LAST) - return "ERR!"; - return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; -} - -static struct tcp_states_t tcp_states [] = { -/* INPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, -/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, -/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, - -/* OUTPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, -/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, -/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, -/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, - -/* INPUT-ONLY */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, -/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, -/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, -}; - -static struct tcp_states_t tcp_states_dos [] = { -/* INPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, -/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, -/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, -/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, - -/* OUTPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, -/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, -/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, -/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, - -/* INPUT-ONLY */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, -/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, -/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, -}; - -static struct tcp_states_t *tcp_state_table = tcp_states; - - -static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) -{ - int on = (flags & 1); /* secure_tcp */ - - /* - ** FIXME: change secure_tcp to independent sysctl var - ** or make it per-service or per-app because it is valid - ** for most if not for all of the applications. Something - ** like "capabilities" (flags) for each object. - */ - tcp_state_table = (on? tcp_states_dos : tcp_states); -} - -static int -tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) -{ - return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST, - tcp_state_name_table, sname, to); -} - -static inline int tcp_state_idx(struct tcphdr *th) -{ - if (th->rst) - return 3; - if (th->syn) - return 0; - if (th->fin) - return 1; - if (th->ack) - return 2; - return -1; -} - -static inline void -set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, - int direction, struct tcphdr *th) -{ - int state_idx; - int new_state = IP_VS_TCP_S_CLOSE; - int state_off = tcp_state_off[direction]; - - /* - * Update state offset to INPUT_ONLY if necessary - * or delete NO_OUTPUT flag if output packet detected - */ - if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { - if (state_off == TCP_DIR_OUTPUT) - cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; - else - state_off = TCP_DIR_INPUT_ONLY; - } - - if ((state_idx = tcp_state_idx(th)) < 0) { - IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); - goto tcp_state_out; - } - - new_state = tcp_state_table[state_off+state_idx].next_state[cp->state]; - - tcp_state_out: - if (new_state != cp->state) { - struct ip_vs_dest *dest = cp->dest; - - IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->" - "%s:%d state: %s->%s conn->refcnt:%d\n", - pp->name, - ((state_off == TCP_DIR_OUTPUT) ? - "output " : "input "), - th->syn ? 'S' : '.', - th->fin ? 'F' : '.', - th->ack ? 'A' : '.', - th->rst ? 'R' : '.', - IP_VS_DBG_ADDR(cp->af, &cp->daddr), - ntohs(cp->dport), - IP_VS_DBG_ADDR(cp->af, &cp->caddr), - ntohs(cp->cport), - tcp_state_name(cp->state), - tcp_state_name(new_state), - atomic_read(&cp->refcnt)); - - if (dest) { - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (new_state != IP_VS_TCP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags |= IP_VS_CONN_F_INACTIVE; - } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && - (new_state == IP_VS_TCP_S_ESTABLISHED)) { - atomic_inc(&dest->activeconns); - atomic_dec(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; - } - } - } - - cp->timeout = pp->timeout_table[cp->state = new_state]; -} - - -/* - * Handle state transitions - */ -static int -tcp_state_transition(struct ip_vs_conn *cp, int direction, - const struct sk_buff *skb, - struct ip_vs_protocol *pp) -{ - struct tcphdr _tcph, *th; - -#ifdef CONFIG_IP_VS_IPV6 - int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); -#else - int ihl = ip_hdrlen(skb); -#endif - - th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph); - if (th == NULL) - return 0; - - spin_lock(&cp->lock); - set_tcp_state(pp, cp, direction, th); - spin_unlock(&cp->lock); - - return 1; -} - - -/* - * Hash table for TCP application incarnations - */ -#define TCP_APP_TAB_BITS 4 -#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) -#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) - -static struct list_head tcp_apps[TCP_APP_TAB_SIZE]; -static DEFINE_SPINLOCK(tcp_app_lock); - -static inline __u16 tcp_app_hashkey(__be16 port) -{ - return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) - & TCP_APP_TAB_MASK; -} - - -static int tcp_register_app(struct ip_vs_app *inc) -{ - struct ip_vs_app *i; - __u16 hash; - __be16 port = inc->port; - int ret = 0; - - hash = tcp_app_hashkey(port); - - spin_lock_bh(&tcp_app_lock); - list_for_each_entry(i, &tcp_apps[hash], p_list) { - if (i->port == port) { - ret = -EEXIST; - goto out; - } - } - list_add(&inc->p_list, &tcp_apps[hash]); - atomic_inc(&ip_vs_protocol_tcp.appcnt); - - out: - spin_unlock_bh(&tcp_app_lock); - return ret; -} - - -static void -tcp_unregister_app(struct ip_vs_app *inc) -{ - spin_lock_bh(&tcp_app_lock); - atomic_dec(&ip_vs_protocol_tcp.appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&tcp_app_lock); -} - - -static int -tcp_app_conn_bind(struct ip_vs_conn *cp) -{ - int hash; - struct ip_vs_app *inc; - int result = 0; - - /* Default binding: bind app only for NAT */ - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) - return 0; - - /* Lookup application incarnations and bind the right one */ - hash = tcp_app_hashkey(cp->vport); - - spin_lock(&tcp_app_lock); - list_for_each_entry(inc, &tcp_apps[hash], p_list) { - if (inc->port == cp->vport) { - if (unlikely(!ip_vs_app_inc_get(inc))) - break; - spin_unlock(&tcp_app_lock); - - IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" - "%s:%u to app %s on port %u\n", - __func__, - IP_VS_DBG_ADDR(cp->af, &cp->caddr), - ntohs(cp->cport), - IP_VS_DBG_ADDR(cp->af, &cp->vaddr), - ntohs(cp->vport), - inc->name, ntohs(inc->port)); - - cp->app = inc; - if (inc->init_conn) - result = inc->init_conn(inc, cp); - goto out; - } - } - spin_unlock(&tcp_app_lock); - - out: - return result; -} - - -/* - * Set LISTEN timeout. (ip_vs_conn_put will setup timer) - */ -void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) -{ - spin_lock(&cp->lock); - cp->state = IP_VS_TCP_S_LISTEN; - cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN]; - spin_unlock(&cp->lock); -} - - -static void ip_vs_tcp_init(struct ip_vs_protocol *pp) -{ - IP_VS_INIT_HASH_TABLE(tcp_apps); - pp->timeout_table = tcp_timeouts; -} - - -static void ip_vs_tcp_exit(struct ip_vs_protocol *pp) -{ -} - - -struct ip_vs_protocol ip_vs_protocol_tcp = { - .name = "TCP", - .protocol = IPPROTO_TCP, - .num_states = IP_VS_TCP_S_LAST, - .dont_defrag = 0, - .appcnt = ATOMIC_INIT(0), - .init = ip_vs_tcp_init, - .exit = ip_vs_tcp_exit, - .register_app = tcp_register_app, - .unregister_app = tcp_unregister_app, - .conn_schedule = tcp_conn_schedule, - .conn_in_get = tcp_conn_in_get, - .conn_out_get = tcp_conn_out_get, - .snat_handler = tcp_snat_handler, - .dnat_handler = tcp_dnat_handler, - .csum_check = tcp_csum_check, - .state_name = tcp_state_name, - .state_transition = tcp_state_transition, - .app_conn_bind = tcp_app_conn_bind, - .debug_packet = ip_vs_tcpudp_debug_packet, - .timeout_change = tcp_timeout_change, - .set_state_timeout = tcp_set_state_timeout, -}; diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c deleted file mode 100644 index 6eb6039..0000000 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ /dev/null @@ -1,533 +0,0 @@ -/* - * ip_vs_proto_udp.c: UDP load balancing support for IPVS - * - * Authors: Wensong Zhang - * Julian Anastasov - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -static struct ip_vs_conn * -udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - cp = ip_vs_conn_in_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } - - return cp; -} - - -static struct ip_vs_conn * -udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - cp = ip_vs_conn_out_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } - - return cp; -} - - -static int -udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) -{ - struct ip_vs_service *svc; - struct udphdr _udph, *uh; - struct ip_vs_iphdr iph; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - - uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph); - if (uh == NULL) { - *verdict = NF_DROP; - return 0; - } - - svc = ip_vs_service_get(af, skb->mark, iph.protocol, - &iph.daddr, uh->dest); - if (svc) { - if (ip_vs_todrop()) { - /* - * It seems that we are very loaded. - * We have to drop this packet :( - */ - ip_vs_service_put(svc); - *verdict = NF_DROP; - return 0; - } - - /* - * Let the virtual server select a real server for the - * incoming connection, and create a connection entry. - */ - *cpp = ip_vs_schedule(svc, skb); - if (!*cpp) { - *verdict = ip_vs_leave(svc, skb, pp); - return 0; - } - ip_vs_service_put(svc); - } - return 1; -} - - -static inline void -udp_fast_csum_update(int af, struct udphdr *uhdr, - const union nf_inet_addr *oldip, - const union nf_inet_addr *newip, - __be16 oldport, __be16 newport) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - uhdr->check = - csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, - ip_vs_check_diff2(oldport, newport, - ~csum_unfold(uhdr->check)))); - else -#endif - uhdr->check = - csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, - ip_vs_check_diff2(oldport, newport, - ~csum_unfold(uhdr->check)))); - if (!uhdr->check) - uhdr->check = CSUM_MANGLED_0; -} - -static inline void -udp_partial_csum_update(int af, struct udphdr *uhdr, - const union nf_inet_addr *oldip, - const union nf_inet_addr *newip, - __be16 oldlen, __be16 newlen) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - uhdr->check = - csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, - ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(uhdr->check)))); - else -#endif - uhdr->check = - csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, - ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(uhdr->check)))); -} - - -static int -udp_snat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) -{ - struct udphdr *udph; - unsigned int udphoff; - int oldlen; - -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - udphoff = sizeof(struct ipv6hdr); - else -#endif - udphoff = ip_hdrlen(skb); - oldlen = skb->len - udphoff; - - /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, udphoff+sizeof(*udph))) - return 0; - - if (unlikely(cp->app != NULL)) { - /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) - return 0; - - /* - * Call application helper if needed - */ - if (!ip_vs_app_pkt_out(cp, skb)) - return 0; - } - - udph = (void *)skb_network_header(skb) + udphoff; - udph->source = cp->vport; - - /* - * Adjust UDP checksums - */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, - htonl(oldlen), - htonl(skb->len - udphoff)); - } else if (!cp->app && (udph->check != 0)) { - /* Only port and addr are changed, do fast csum update */ - udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, - cp->dport, cp->vport); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - } else { - /* full checksum calculation */ - udph->check = 0; - skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - udph->check = csum_ipv6_magic(&cp->vaddr.in6, - &cp->caddr.in6, - skb->len - udphoff, - cp->protocol, skb->csum); - else -#endif - udph->check = csum_tcpudp_magic(cp->vaddr.ip, - cp->caddr.ip, - skb->len - udphoff, - cp->protocol, - skb->csum); - if (udph->check == 0) - udph->check = CSUM_MANGLED_0; - IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", - pp->name, udph->check, - (char*)&(udph->check) - (char*)udph); - } - return 1; -} - - -static int -udp_dnat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) -{ - struct udphdr *udph; - unsigned int udphoff; - int oldlen; - -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - udphoff = sizeof(struct ipv6hdr); - else -#endif - udphoff = ip_hdrlen(skb); - oldlen = skb->len - udphoff; - - /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, udphoff+sizeof(*udph))) - return 0; - - if (unlikely(cp->app != NULL)) { - /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) - return 0; - - /* - * Attempt ip_vs_app call. - * It will fix ip_vs_conn - */ - if (!ip_vs_app_pkt_in(cp, skb)) - return 0; - } - - udph = (void *)skb_network_header(skb) + udphoff; - udph->dest = cp->dport; - - /* - * Adjust UDP checksums - */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, - htonl(oldlen), - htonl(skb->len - udphoff)); - } else if (!cp->app && (udph->check != 0)) { - /* Only port and addr are changed, do fast csum update */ - udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, - cp->vport, cp->dport); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - } else { - /* full checksum calculation */ - udph->check = 0; - skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - udph->check = csum_ipv6_magic(&cp->caddr.in6, - &cp->daddr.in6, - skb->len - udphoff, - cp->protocol, skb->csum); - else -#endif - udph->check = csum_tcpudp_magic(cp->caddr.ip, - cp->daddr.ip, - skb->len - udphoff, - cp->protocol, - skb->csum); - if (udph->check == 0) - udph->check = CSUM_MANGLED_0; - skb->ip_summed = CHECKSUM_UNNECESSARY; - } - return 1; -} - - -static int -udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) -{ - struct udphdr _udph, *uh; - unsigned int udphoff; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - udphoff = sizeof(struct ipv6hdr); - else -#endif - udphoff = ip_hdrlen(skb); - - uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); - if (uh == NULL) - return 0; - - if (uh->check != 0) { - switch (skb->ip_summed) { - case CHECKSUM_NONE: - skb->csum = skb_checksum(skb, udphoff, - skb->len - udphoff, 0); - case CHECKSUM_COMPLETE: -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, - skb->len - udphoff, - ipv6_hdr(skb)->nexthdr, - skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "Failed checksum for"); - return 0; - } - } else -#endif - if (csum_tcpudp_magic(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, - skb->len - udphoff, - ip_hdr(skb)->protocol, - skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "Failed checksum for"); - return 0; - } - break; - default: - /* No need to checksum. */ - break; - } - } - return 1; -} - - -/* - * Note: the caller guarantees that only one of register_app, - * unregister_app or app_conn_bind is called each time. - */ - -#define UDP_APP_TAB_BITS 4 -#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) -#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) - -static struct list_head udp_apps[UDP_APP_TAB_SIZE]; -static DEFINE_SPINLOCK(udp_app_lock); - -static inline __u16 udp_app_hashkey(__be16 port) -{ - return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) - & UDP_APP_TAB_MASK; -} - - -static int udp_register_app(struct ip_vs_app *inc) -{ - struct ip_vs_app *i; - __u16 hash; - __be16 port = inc->port; - int ret = 0; - - hash = udp_app_hashkey(port); - - - spin_lock_bh(&udp_app_lock); - list_for_each_entry(i, &udp_apps[hash], p_list) { - if (i->port == port) { - ret = -EEXIST; - goto out; - } - } - list_add(&inc->p_list, &udp_apps[hash]); - atomic_inc(&ip_vs_protocol_udp.appcnt); - - out: - spin_unlock_bh(&udp_app_lock); - return ret; -} - - -static void -udp_unregister_app(struct ip_vs_app *inc) -{ - spin_lock_bh(&udp_app_lock); - atomic_dec(&ip_vs_protocol_udp.appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&udp_app_lock); -} - - -static int udp_app_conn_bind(struct ip_vs_conn *cp) -{ - int hash; - struct ip_vs_app *inc; - int result = 0; - - /* Default binding: bind app only for NAT */ - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) - return 0; - - /* Lookup application incarnations and bind the right one */ - hash = udp_app_hashkey(cp->vport); - - spin_lock(&udp_app_lock); - list_for_each_entry(inc, &udp_apps[hash], p_list) { - if (inc->port == cp->vport) { - if (unlikely(!ip_vs_app_inc_get(inc))) - break; - spin_unlock(&udp_app_lock); - - IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" - "%s:%u to app %s on port %u\n", - __func__, - IP_VS_DBG_ADDR(cp->af, &cp->caddr), - ntohs(cp->cport), - IP_VS_DBG_ADDR(cp->af, &cp->vaddr), - ntohs(cp->vport), - inc->name, ntohs(inc->port)); - - cp->app = inc; - if (inc->init_conn) - result = inc->init_conn(inc, cp); - goto out; - } - } - spin_unlock(&udp_app_lock); - - out: - return result; -} - - -static int udp_timeouts[IP_VS_UDP_S_LAST+1] = { - [IP_VS_UDP_S_NORMAL] = 5*60*HZ, - [IP_VS_UDP_S_LAST] = 2*HZ, -}; - -static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = { - [IP_VS_UDP_S_NORMAL] = "UDP", - [IP_VS_UDP_S_LAST] = "BUG!", -}; - - -static int -udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) -{ - return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST, - udp_state_name_table, sname, to); -} - -static const char * udp_state_name(int state) -{ - if (state >= IP_VS_UDP_S_LAST) - return "ERR!"; - return udp_state_name_table[state] ? udp_state_name_table[state] : "?"; -} - -static int -udp_state_transition(struct ip_vs_conn *cp, int direction, - const struct sk_buff *skb, - struct ip_vs_protocol *pp) -{ - cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL]; - return 1; -} - -static void udp_init(struct ip_vs_protocol *pp) -{ - IP_VS_INIT_HASH_TABLE(udp_apps); - pp->timeout_table = udp_timeouts; -} - -static void udp_exit(struct ip_vs_protocol *pp) -{ -} - - -struct ip_vs_protocol ip_vs_protocol_udp = { - .name = "UDP", - .protocol = IPPROTO_UDP, - .num_states = IP_VS_UDP_S_LAST, - .dont_defrag = 0, - .init = udp_init, - .exit = udp_exit, - .conn_schedule = udp_conn_schedule, - .conn_in_get = udp_conn_in_get, - .conn_out_get = udp_conn_out_get, - .snat_handler = udp_snat_handler, - .dnat_handler = udp_dnat_handler, - .csum_check = udp_csum_check, - .state_transition = udp_state_transition, - .state_name = udp_state_name, - .register_app = udp_register_app, - .unregister_app = udp_unregister_app, - .app_conn_bind = udp_app_conn_bind, - .debug_packet = ip_vs_tcpudp_debug_packet, - .timeout_change = NULL, - .set_state_timeout = udp_set_state_timeout, -}; diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c deleted file mode 100644 index a22195f..0000000 --- a/net/ipv4/ipvs/ip_vs_rr.c +++ /dev/null @@ -1,112 +0,0 @@ -/* - * IPVS: Round-Robin Scheduling module - * - * Authors: Wensong Zhang - * Peter Kese - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Fixes/Changes: - * Wensong Zhang : changed the ip_vs_rr_schedule to return dest - * Julian Anastasov : fixed the NULL pointer access bug in debugging - * Wensong Zhang : changed some comestics things for debugging - * Wensong Zhang : changed for the d-linked destination list - * Wensong Zhang : added the ip_vs_rr_update_svc - * Wensong Zhang : added any dest with weight=0 is quiesced - * - */ - -#include -#include - -#include - - -static int ip_vs_rr_init_svc(struct ip_vs_service *svc) -{ - svc->sched_data = &svc->destinations; - return 0; -} - - -static int ip_vs_rr_update_svc(struct ip_vs_service *svc) -{ - svc->sched_data = &svc->destinations; - return 0; -} - - -/* - * Round-Robin Scheduling - */ -static struct ip_vs_dest * -ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct list_head *p, *q; - struct ip_vs_dest *dest; - - IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n"); - - write_lock(&svc->sched_lock); - p = (struct list_head *)svc->sched_data; - p = p->next; - q = p; - do { - /* skip list head */ - if (q == &svc->destinations) { - q = q->next; - continue; - } - - dest = list_entry(q, struct ip_vs_dest, n_list); - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) > 0) - /* HIT */ - goto out; - q = q->next; - } while (q != p); - write_unlock(&svc->sched_lock); - return NULL; - - out: - svc->sched_data = q; - write_unlock(&svc->sched_lock); - IP_VS_DBG_BUF(6, "RR: server %s:%u " - "activeconns %d refcnt %d weight %d\n", - IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), - atomic_read(&dest->activeconns), - atomic_read(&dest->refcnt), atomic_read(&dest->weight)); - - return dest; -} - - -static struct ip_vs_scheduler ip_vs_rr_scheduler = { - .name = "rr", /* name */ - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 1, -#endif - .init_service = ip_vs_rr_init_svc, - .update_service = ip_vs_rr_update_svc, - .schedule = ip_vs_rr_schedule, -}; - -static int __init ip_vs_rr_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_rr_scheduler); -} - -static void __exit ip_vs_rr_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); -} - -module_init(ip_vs_rr_init); -module_exit(ip_vs_rr_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c deleted file mode 100644 index a46ad9e..0000000 --- a/net/ipv4/ipvs/ip_vs_sched.c +++ /dev/null @@ -1,251 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the Netfilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Authors: Wensong Zhang - * Peter Kese - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include -#include -#include -#include -#include -#include - -#include - -/* - * IPVS scheduler list - */ -static LIST_HEAD(ip_vs_schedulers); - -/* lock for service table */ -static DEFINE_RWLOCK(__ip_vs_sched_lock); - - -/* - * Bind a service with a scheduler - */ -int ip_vs_bind_scheduler(struct ip_vs_service *svc, - struct ip_vs_scheduler *scheduler) -{ - int ret; - - if (svc == NULL) { - IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n"); - return -EINVAL; - } - if (scheduler == NULL) { - IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n"); - return -EINVAL; - } - - svc->scheduler = scheduler; - - if (scheduler->init_service) { - ret = scheduler->init_service(svc); - if (ret) { - IP_VS_ERR("ip_vs_bind_scheduler(): init error\n"); - return ret; - } - } - - return 0; -} - - -/* - * Unbind a service with its scheduler - */ -int ip_vs_unbind_scheduler(struct ip_vs_service *svc) -{ - struct ip_vs_scheduler *sched; - - if (svc == NULL) { - IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n"); - return -EINVAL; - } - - sched = svc->scheduler; - if (sched == NULL) { - IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n"); - return -EINVAL; - } - - if (sched->done_service) { - if (sched->done_service(svc) != 0) { - IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n"); - return -EINVAL; - } - } - - svc->scheduler = NULL; - return 0; -} - - -/* - * Get scheduler in the scheduler list by name - */ -static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) -{ - struct ip_vs_scheduler *sched; - - IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n", - sched_name); - - read_lock_bh(&__ip_vs_sched_lock); - - list_for_each_entry(sched, &ip_vs_schedulers, n_list) { - /* - * Test and get the modules atomically - */ - if (sched->module && !try_module_get(sched->module)) { - /* - * This scheduler is just deleted - */ - continue; - } - if (strcmp(sched_name, sched->name)==0) { - /* HIT */ - read_unlock_bh(&__ip_vs_sched_lock); - return sched; - } - if (sched->module) - module_put(sched->module); - } - - read_unlock_bh(&__ip_vs_sched_lock); - return NULL; -} - - -/* - * Lookup scheduler and try to load it if it doesn't exist - */ -struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) -{ - struct ip_vs_scheduler *sched; - - /* - * Search for the scheduler by sched_name - */ - sched = ip_vs_sched_getbyname(sched_name); - - /* - * If scheduler not found, load the module and search again - */ - if (sched == NULL) { - request_module("ip_vs_%s", sched_name); - sched = ip_vs_sched_getbyname(sched_name); - } - - return sched; -} - -void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) -{ - if (scheduler->module) - module_put(scheduler->module); -} - - -/* - * Register a scheduler in the scheduler list - */ -int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) -{ - struct ip_vs_scheduler *sched; - - if (!scheduler) { - IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n"); - return -EINVAL; - } - - if (!scheduler->name) { - IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n"); - return -EINVAL; - } - - /* increase the module use count */ - ip_vs_use_count_inc(); - - write_lock_bh(&__ip_vs_sched_lock); - - if (!list_empty(&scheduler->n_list)) { - write_unlock_bh(&__ip_vs_sched_lock); - ip_vs_use_count_dec(); - IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " - "already linked\n", scheduler->name); - return -EINVAL; - } - - /* - * Make sure that the scheduler with this name doesn't exist - * in the scheduler list. - */ - list_for_each_entry(sched, &ip_vs_schedulers, n_list) { - if (strcmp(scheduler->name, sched->name) == 0) { - write_unlock_bh(&__ip_vs_sched_lock); - ip_vs_use_count_dec(); - IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " - "already existed in the system\n", - scheduler->name); - return -EINVAL; - } - } - /* - * Add it into the d-linked scheduler list - */ - list_add(&scheduler->n_list, &ip_vs_schedulers); - write_unlock_bh(&__ip_vs_sched_lock); - - IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name); - - return 0; -} - - -/* - * Unregister a scheduler from the scheduler list - */ -int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) -{ - if (!scheduler) { - IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n"); - return -EINVAL; - } - - write_lock_bh(&__ip_vs_sched_lock); - if (list_empty(&scheduler->n_list)) { - write_unlock_bh(&__ip_vs_sched_lock); - IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler " - "is not in the list. failed\n", scheduler->name); - return -EINVAL; - } - - /* - * Remove it from the d-linked scheduler list - */ - list_del(&scheduler->n_list); - write_unlock_bh(&__ip_vs_sched_lock); - - /* decrease the module use count */ - ip_vs_use_count_dec(); - - IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name); - - return 0; -} diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c deleted file mode 100644 index 7d2f22f..0000000 --- a/net/ipv4/ipvs/ip_vs_sed.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * IPVS: Shortest Expected Delay scheduling module - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -/* - * The SED algorithm attempts to minimize each job's expected delay until - * completion. The expected delay that the job will experience is - * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of - * jobs on the ith server and Ui is the fixed service rate (weight) of - * the ith server. The SED algorithm adopts a greedy policy that each does - * what is in its own best interest, i.e. to join the queue which would - * minimize its expected delay of completion. - * - * See the following paper for more information: - * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing - * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, - * pages 986-994, 1988. - * - * Thanks must go to Marko Buuri for talking SED to me. - * - * The difference between SED and WLC is that SED includes the incoming - * job in the cost function (the increment of 1). SED may outperform - * WLC, while scheduling big jobs under larger heterogeneous systems - * (the server weight varies a lot). - * - */ - -#include -#include - -#include - - -static inline unsigned int -ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We only use the active connection number in the cost - * calculation here. - */ - return atomic_read(&dest->activeconns) + 1; -} - - -/* - * Weighted Least Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest, *least; - unsigned int loh, doh; - - IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n"); - - /* - * We calculate the load of each dest server as follows: - * (server expected overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connections. - */ - - list_for_each_entry(dest, &svc->destinations, n_list) { - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) > 0) { - least = dest; - loh = ip_vs_sed_dest_overhead(least); - goto nextstage; - } - } - return NULL; - - /* - * Find the destination with the least load. - */ - nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - doh = ip_vs_sed_dest_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG_BUF(6, "SED: server %s:%u " - "activeconns %d refcnt %d weight %d overhead %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -static struct ip_vs_scheduler ip_vs_sed_scheduler = -{ - .name = "sed", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 1, -#endif - .schedule = ip_vs_sed_schedule, -}; - - -static int __init ip_vs_sed_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_sed_scheduler); -} - -static void __exit ip_vs_sed_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); -} - -module_init(ip_vs_sed_init); -module_exit(ip_vs_sed_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c deleted file mode 100644 index 1d96de2..0000000 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ /dev/null @@ -1,258 +0,0 @@ -/* - * IPVS: Source Hashing scheduling module - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -/* - * The sh algorithm is to select server by the hash key of source IP - * address. The pseudo code is as follows: - * - * n <- servernode[src_ip]; - * if (n is dead) OR - * (n is overloaded) or (n.weight <= 0) then - * return NULL; - * - * return n; - * - * Notes that servernode is a 256-bucket hash table that maps the hash - * index derived from packet source IP address to the current server - * array. If the sh scheduler is used in cache cluster, it is good to - * combine it with cache_bypass feature. When the statically assigned - * server is dead or overloaded, the load balancer can bypass the cache - * server and send requests to the original server directly. - * - */ - -#include -#include -#include -#include - -#include - - -/* - * IPVS SH bucket - */ -struct ip_vs_sh_bucket { - struct ip_vs_dest *dest; /* real server (cache) */ -}; - -/* - * for IPVS SH entry hash table - */ -#ifndef CONFIG_IP_VS_SH_TAB_BITS -#define CONFIG_IP_VS_SH_TAB_BITS 8 -#endif -#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS -#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) -#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) - - -/* - * Returns hash value for IPVS SH entry - */ -static inline unsigned ip_vs_sh_hashkey(__be32 addr) -{ - return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK; -} - - -/* - * Get ip_vs_dest associated with supplied parameters. - */ -static inline struct ip_vs_dest * -ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __be32 addr) -{ - return (tbl[ip_vs_sh_hashkey(addr)]).dest; -} - - -/* - * Assign all the hash buckets of the specified table with the service. - */ -static int -ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) -{ - int i; - struct ip_vs_sh_bucket *b; - struct list_head *p; - struct ip_vs_dest *dest; - - b = tbl; - p = &svc->destinations; - for (i=0; idest = NULL; - } else { - if (p == &svc->destinations) - p = p->next; - - dest = list_entry(p, struct ip_vs_dest, n_list); - atomic_inc(&dest->refcnt); - b->dest = dest; - - p = p->next; - } - b++; - } - return 0; -} - - -/* - * Flush all the hash buckets of the specified table. - */ -static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) -{ - int i; - struct ip_vs_sh_bucket *b; - - b = tbl; - for (i=0; idest) { - atomic_dec(&b->dest->refcnt); - b->dest = NULL; - } - b++; - } -} - - -static int ip_vs_sh_init_svc(struct ip_vs_service *svc) -{ - struct ip_vs_sh_bucket *tbl; - - /* allocate the SH table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, - GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n"); - return -ENOMEM; - } - svc->sched_data = tbl; - IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " - "current service\n", - sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); - - /* assign the hash buckets with the updated service */ - ip_vs_sh_assign(tbl, svc); - - return 0; -} - - -static int ip_vs_sh_done_svc(struct ip_vs_service *svc) -{ - struct ip_vs_sh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_sh_flush(tbl); - - /* release the table itself */ - kfree(svc->sched_data); - IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", - sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); - - return 0; -} - - -static int ip_vs_sh_update_svc(struct ip_vs_service *svc) -{ - struct ip_vs_sh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_sh_flush(tbl); - - /* assign the hash buckets with the updated service */ - ip_vs_sh_assign(tbl, svc); - - return 0; -} - - -/* - * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, - * consider that the server is overloaded here. - */ -static inline int is_overloaded(struct ip_vs_dest *dest) -{ - return dest->flags & IP_VS_DEST_F_OVERLOAD; -} - - -/* - * Source Hashing scheduling - */ -static struct ip_vs_dest * -ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest; - struct ip_vs_sh_bucket *tbl; - struct iphdr *iph = ip_hdr(skb); - - IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); - - tbl = (struct ip_vs_sh_bucket *)svc->sched_data; - dest = ip_vs_sh_get(tbl, iph->saddr); - if (!dest - || !(dest->flags & IP_VS_DEST_F_AVAILABLE) - || atomic_read(&dest->weight) <= 0 - || is_overloaded(dest)) { - return NULL; - } - - IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u " - "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(iph->saddr), - NIPQUAD(dest->addr.ip), - ntohs(dest->port)); - - return dest; -} - - -/* - * IPVS SH Scheduler structure - */ -static struct ip_vs_scheduler ip_vs_sh_scheduler = -{ - .name = "sh", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 0, -#endif - .init_service = ip_vs_sh_init_svc, - .done_service = ip_vs_sh_done_svc, - .update_service = ip_vs_sh_update_svc, - .schedule = ip_vs_sh_schedule, -}; - - -static int __init ip_vs_sh_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_sh_scheduler); -} - - -static void __exit ip_vs_sh_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); -} - - -module_init(ip_vs_sh_init); -module_exit(ip_vs_sh_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c deleted file mode 100644 index de5e7e1..0000000 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ /dev/null @@ -1,942 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the NetFilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Authors: Wensong Zhang - * - * ip_vs_sync: sync connection info from master load balancer to backups - * through multicast - * - * Changes: - * Alexandre Cassen : Added master & backup support at a time. - * Alexandre Cassen : Added SyncID support for incoming sync - * messages filtering. - * Justin Ossevoort : Fix endian problem on sync message size. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include /* for ip_mc_join_group */ -#include -#include -#include -#include -#include - -#include -#include - -#include - -#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ -#define IP_VS_SYNC_PORT 8848 /* multicast port */ - - -/* - * IPVS sync connection entry - */ -struct ip_vs_sync_conn { - __u8 reserved; - - /* Protocol, addresses and port numbers */ - __u8 protocol; /* Which protocol (TCP/UDP) */ - __be16 cport; - __be16 vport; - __be16 dport; - __be32 caddr; /* client address */ - __be32 vaddr; /* virtual address */ - __be32 daddr; /* destination address */ - - /* Flags and state transition */ - __be16 flags; /* status flags */ - __be16 state; /* state info */ - - /* The sequence options start here */ -}; - -struct ip_vs_sync_conn_options { - struct ip_vs_seq in_seq; /* incoming seq. struct */ - struct ip_vs_seq out_seq; /* outgoing seq. struct */ -}; - -struct ip_vs_sync_thread_data { - struct socket *sock; - char *buf; -}; - -#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) -#define FULL_CONN_SIZE \ -(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) - - -/* - The master mulitcasts messages to the backup load balancers in the - following format. - - 0 1 2 3 - 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Count Conns | SyncID | Size | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | | - | IPVS Sync Connection (1) | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | . | - | . | - | . | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | | - | IPVS Sync Connection (n) | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -*/ - -#define SYNC_MESG_HEADER_LEN 4 -#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ - -struct ip_vs_sync_mesg { - __u8 nr_conns; - __u8 syncid; - __u16 size; - - /* ip_vs_sync_conn entries start here */ -}; - -/* the maximum length of sync (sending/receiving) message */ -static int sync_send_mesg_maxlen; -static int sync_recv_mesg_maxlen; - -struct ip_vs_sync_buff { - struct list_head list; - unsigned long firstuse; - - /* pointers for the message data */ - struct ip_vs_sync_mesg *mesg; - unsigned char *head; - unsigned char *end; -}; - - -/* the sync_buff list head and the lock */ -static LIST_HEAD(ip_vs_sync_queue); -static DEFINE_SPINLOCK(ip_vs_sync_lock); - -/* current sync_buff for accepting new conn entries */ -static struct ip_vs_sync_buff *curr_sb = NULL; -static DEFINE_SPINLOCK(curr_sb_lock); - -/* ipvs sync daemon state */ -volatile int ip_vs_sync_state = IP_VS_STATE_NONE; -volatile int ip_vs_master_syncid = 0; -volatile int ip_vs_backup_syncid = 0; - -/* multicast interface name */ -char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; -char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; - -/* sync daemon tasks */ -static struct task_struct *sync_master_thread; -static struct task_struct *sync_backup_thread; - -/* multicast addr */ -static struct sockaddr_in mcast_addr = { - .sin_family = AF_INET, - .sin_port = __constant_htons(IP_VS_SYNC_PORT), - .sin_addr.s_addr = __constant_htonl(IP_VS_SYNC_GROUP), -}; - - -static inline struct ip_vs_sync_buff *sb_dequeue(void) -{ - struct ip_vs_sync_buff *sb; - - spin_lock_bh(&ip_vs_sync_lock); - if (list_empty(&ip_vs_sync_queue)) { - sb = NULL; - } else { - sb = list_entry(ip_vs_sync_queue.next, - struct ip_vs_sync_buff, - list); - list_del(&sb->list); - } - spin_unlock_bh(&ip_vs_sync_lock); - - return sb; -} - -static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) -{ - struct ip_vs_sync_buff *sb; - - if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) - return NULL; - - if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { - kfree(sb); - return NULL; - } - sb->mesg->nr_conns = 0; - sb->mesg->syncid = ip_vs_master_syncid; - sb->mesg->size = 4; - sb->head = (unsigned char *)sb->mesg + 4; - sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; - sb->firstuse = jiffies; - return sb; -} - -static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) -{ - kfree(sb->mesg); - kfree(sb); -} - -static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) -{ - spin_lock(&ip_vs_sync_lock); - if (ip_vs_sync_state & IP_VS_STATE_MASTER) - list_add_tail(&sb->list, &ip_vs_sync_queue); - else - ip_vs_sync_buff_release(sb); - spin_unlock(&ip_vs_sync_lock); -} - -/* - * Get the current sync buffer if it has been created for more - * than the specified time or the specified time is zero. - */ -static inline struct ip_vs_sync_buff * -get_curr_sync_buff(unsigned long time) -{ - struct ip_vs_sync_buff *sb; - - spin_lock_bh(&curr_sb_lock); - if (curr_sb && (time == 0 || - time_before(jiffies - curr_sb->firstuse, time))) { - sb = curr_sb; - curr_sb = NULL; - } else - sb = NULL; - spin_unlock_bh(&curr_sb_lock); - return sb; -} - - -/* - * Add an ip_vs_conn information into the current sync_buff. - * Called by ip_vs_in. - */ -void ip_vs_sync_conn(struct ip_vs_conn *cp) -{ - struct ip_vs_sync_mesg *m; - struct ip_vs_sync_conn *s; - int len; - - spin_lock(&curr_sb_lock); - if (!curr_sb) { - if (!(curr_sb=ip_vs_sync_buff_create())) { - spin_unlock(&curr_sb_lock); - IP_VS_ERR("ip_vs_sync_buff_create failed.\n"); - return; - } - } - - len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : - SIMPLE_CONN_SIZE; - m = curr_sb->mesg; - s = (struct ip_vs_sync_conn *)curr_sb->head; - - /* copy members */ - s->protocol = cp->protocol; - s->cport = cp->cport; - s->vport = cp->vport; - s->dport = cp->dport; - s->caddr = cp->caddr.ip; - s->vaddr = cp->vaddr.ip; - s->daddr = cp->daddr.ip; - s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); - s->state = htons(cp->state); - if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { - struct ip_vs_sync_conn_options *opt = - (struct ip_vs_sync_conn_options *)&s[1]; - memcpy(opt, &cp->in_seq, sizeof(*opt)); - } - - m->nr_conns++; - m->size += len; - curr_sb->head += len; - - /* check if there is a space for next one */ - if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { - sb_queue_tail(curr_sb); - curr_sb = NULL; - } - spin_unlock(&curr_sb_lock); - - /* synchronize its controller if it has */ - if (cp->control) - ip_vs_sync_conn(cp->control); -} - - -/* - * Process received multicast message and create the corresponding - * ip_vs_conn entries. - */ -static void ip_vs_process_message(const char *buffer, const size_t buflen) -{ - struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; - struct ip_vs_sync_conn *s; - struct ip_vs_sync_conn_options *opt; - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - struct ip_vs_dest *dest; - char *p; - int i; - - if (buflen < sizeof(struct ip_vs_sync_mesg)) { - IP_VS_ERR_RL("sync message header too short\n"); - return; - } - - /* Convert size back to host byte order */ - m->size = ntohs(m->size); - - if (buflen != m->size) { - IP_VS_ERR_RL("bogus sync message size\n"); - return; - } - - /* SyncID sanity check */ - if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) { - IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", - m->syncid); - return; - } - - p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); - for (i=0; inr_conns; i++) { - unsigned flags, state; - - if (p + SIMPLE_CONN_SIZE > buffer+buflen) { - IP_VS_ERR_RL("bogus conn in sync message\n"); - return; - } - s = (struct ip_vs_sync_conn *) p; - flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; - flags &= ~IP_VS_CONN_F_HASHED; - if (flags & IP_VS_CONN_F_SEQ_MASK) { - opt = (struct ip_vs_sync_conn_options *)&s[1]; - p += FULL_CONN_SIZE; - if (p > buffer+buflen) { - IP_VS_ERR_RL("bogus conn options in sync message\n"); - return; - } - } else { - opt = NULL; - p += SIMPLE_CONN_SIZE; - } - - state = ntohs(s->state); - if (!(flags & IP_VS_CONN_F_TEMPLATE)) { - pp = ip_vs_proto_get(s->protocol); - if (!pp) { - IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n", - s->protocol); - continue; - } - if (state >= pp->num_states) { - IP_VS_DBG(2, "Invalid %s state %u in sync msg\n", - pp->name, state); - continue; - } - } else { - /* protocol in templates is not used for state/timeout */ - pp = NULL; - if (state > 0) { - IP_VS_DBG(2, "Invalid template state %u in sync msg\n", - state); - state = 0; - } - } - - if (!(flags & IP_VS_CONN_F_TEMPLATE)) - cp = ip_vs_conn_in_get(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport); - else - cp = ip_vs_ct_in_get(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport); - if (!cp) { - /* - * Find the appropriate destination for the connection. - * If it is not found the connection will remain unbound - * but still handled. - */ - dest = ip_vs_find_dest(AF_INET, - (union nf_inet_addr *)&s->daddr, - s->dport, - (union nf_inet_addr *)&s->vaddr, - s->vport, - s->protocol); - /* Set the approprite ativity flag */ - if (s->protocol == IPPROTO_TCP) { - if (state != IP_VS_TCP_S_ESTABLISHED) - flags |= IP_VS_CONN_F_INACTIVE; - else - flags &= ~IP_VS_CONN_F_INACTIVE; - } - cp = ip_vs_conn_new(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport, - (union nf_inet_addr *)&s->daddr, - s->dport, - flags, dest); - if (dest) - atomic_dec(&dest->refcnt); - if (!cp) { - IP_VS_ERR("ip_vs_conn_new failed\n"); - return; - } - } else if (!cp->dest) { - dest = ip_vs_try_bind_dest(cp); - if (dest) - atomic_dec(&dest->refcnt); - } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && - (cp->state != state)) { - /* update active/inactive flag for the connection */ - dest = cp->dest; - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (state != IP_VS_TCP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags |= IP_VS_CONN_F_INACTIVE; - } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && - (state == IP_VS_TCP_S_ESTABLISHED)) { - atomic_inc(&dest->activeconns); - atomic_dec(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; - } - } - - if (opt) - memcpy(&cp->in_seq, opt, sizeof(*opt)); - atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); - cp->state = state; - cp->old_state = cp->state; - /* - * We can not recover the right timeout for templates - * in all cases, we can not find the right fwmark - * virtual service. If needed, we can do it for - * non-fwmark persistent services. - */ - if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table) - cp->timeout = pp->timeout_table[state]; - else - cp->timeout = (3*60*HZ); - ip_vs_conn_put(cp); - } -} - - -/* - * Setup loopback of outgoing multicasts on a sending socket - */ -static void set_mcast_loop(struct sock *sk, u_char loop) -{ - struct inet_sock *inet = inet_sk(sk); - - /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ - lock_sock(sk); - inet->mc_loop = loop ? 1 : 0; - release_sock(sk); -} - -/* - * Specify TTL for outgoing multicasts on a sending socket - */ -static void set_mcast_ttl(struct sock *sk, u_char ttl) -{ - struct inet_sock *inet = inet_sk(sk); - - /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ - lock_sock(sk); - inet->mc_ttl = ttl; - release_sock(sk); -} - -/* - * Specifiy default interface for outgoing multicasts - */ -static int set_mcast_if(struct sock *sk, char *ifname) -{ - struct net_device *dev; - struct inet_sock *inet = inet_sk(sk); - - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) - return -ENODEV; - - if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) - return -EINVAL; - - lock_sock(sk); - inet->mc_index = dev->ifindex; - /* inet->mc_addr = 0; */ - release_sock(sk); - - return 0; -} - - -/* - * Set the maximum length of sync message according to the - * specified interface's MTU. - */ -static int set_sync_mesg_maxlen(int sync_state) -{ - struct net_device *dev; - int num; - - if (sync_state == IP_VS_STATE_MASTER) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) - return -ENODEV; - - num = (dev->mtu - sizeof(struct iphdr) - - sizeof(struct udphdr) - - SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; - sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN + - SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); - IP_VS_DBG(7, "setting the maximum length of sync sending " - "message %d.\n", sync_send_mesg_maxlen); - } else if (sync_state == IP_VS_STATE_BACKUP) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) - return -ENODEV; - - sync_recv_mesg_maxlen = dev->mtu - - sizeof(struct iphdr) - sizeof(struct udphdr); - IP_VS_DBG(7, "setting the maximum length of sync receiving " - "message %d.\n", sync_recv_mesg_maxlen); - } - - return 0; -} - - -/* - * Join a multicast group. - * the group is specified by a class D multicast address 224.0.0.0/8 - * in the in_addr structure passed in as a parameter. - */ -static int -join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) -{ - struct ip_mreqn mreq; - struct net_device *dev; - int ret; - - memset(&mreq, 0, sizeof(mreq)); - memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); - - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) - return -ENODEV; - if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) - return -EINVAL; - - mreq.imr_ifindex = dev->ifindex; - - lock_sock(sk); - ret = ip_mc_join_group(sk, &mreq); - release_sock(sk); - - return ret; -} - - -static int bind_mcastif_addr(struct socket *sock, char *ifname) -{ - struct net_device *dev; - __be32 addr; - struct sockaddr_in sin; - - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) - return -ENODEV; - - addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); - if (!addr) - IP_VS_ERR("You probably need to specify IP address on " - "multicast interface.\n"); - - IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n", - ifname, NIPQUAD(addr)); - - /* Now bind the socket with the address of multicast interface */ - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = addr; - sin.sin_port = 0; - - return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); -} - -/* - * Set up sending multicast socket over UDP - */ -static struct socket * make_send_sock(void) -{ - struct socket *sock; - int result; - - /* First create a socket */ - result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); - if (result < 0) { - IP_VS_ERR("Error during creation of socket; terminating\n"); - return ERR_PTR(result); - } - - result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn); - if (result < 0) { - IP_VS_ERR("Error setting outbound mcast interface\n"); - goto error; - } - - set_mcast_loop(sock->sk, 0); - set_mcast_ttl(sock->sk, 1); - - result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn); - if (result < 0) { - IP_VS_ERR("Error binding address of the mcast interface\n"); - goto error; - } - - result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, - sizeof(struct sockaddr), 0); - if (result < 0) { - IP_VS_ERR("Error connecting to the multicast addr\n"); - goto error; - } - - return sock; - - error: - sock_release(sock); - return ERR_PTR(result); -} - - -/* - * Set up receiving multicast socket over UDP - */ -static struct socket * make_receive_sock(void) -{ - struct socket *sock; - int result; - - /* First create a socket */ - result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); - if (result < 0) { - IP_VS_ERR("Error during creation of socket; terminating\n"); - return ERR_PTR(result); - } - - /* it is equivalent to the REUSEADDR option in user-space */ - sock->sk->sk_reuse = 1; - - result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, - sizeof(struct sockaddr)); - if (result < 0) { - IP_VS_ERR("Error binding to the multicast addr\n"); - goto error; - } - - /* join the multicast group */ - result = join_mcast_group(sock->sk, - (struct in_addr *) &mcast_addr.sin_addr, - ip_vs_backup_mcast_ifn); - if (result < 0) { - IP_VS_ERR("Error joining to the multicast group\n"); - goto error; - } - - return sock; - - error: - sock_release(sock); - return ERR_PTR(result); -} - - -static int -ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) -{ - struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; - struct kvec iov; - int len; - - EnterFunction(7); - iov.iov_base = (void *)buffer; - iov.iov_len = length; - - len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); - - LeaveFunction(7); - return len; -} - -static void -ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) -{ - int msize; - - msize = msg->size; - - /* Put size in network byte order */ - msg->size = htons(msg->size); - - if (ip_vs_send_async(sock, (char *)msg, msize) != msize) - IP_VS_ERR("ip_vs_send_async error\n"); -} - -static int -ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) -{ - struct msghdr msg = {NULL,}; - struct kvec iov; - int len; - - EnterFunction(7); - - /* Receive a packet */ - iov.iov_base = buffer; - iov.iov_len = (size_t)buflen; - - len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); - - if (len < 0) - return -1; - - LeaveFunction(7); - return len; -} - - -static int sync_thread_master(void *data) -{ - struct ip_vs_sync_thread_data *tinfo = data; - struct ip_vs_sync_buff *sb; - - IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, " - "syncid = %d\n", - ip_vs_master_mcast_ifn, ip_vs_master_syncid); - - while (!kthread_should_stop()) { - while ((sb = sb_dequeue())) { - ip_vs_send_sync_msg(tinfo->sock, sb->mesg); - ip_vs_sync_buff_release(sb); - } - - /* check if entries stay in curr_sb for 2 seconds */ - sb = get_curr_sync_buff(2 * HZ); - if (sb) { - ip_vs_send_sync_msg(tinfo->sock, sb->mesg); - ip_vs_sync_buff_release(sb); - } - - schedule_timeout_interruptible(HZ); - } - - /* clean up the sync_buff queue */ - while ((sb=sb_dequeue())) { - ip_vs_sync_buff_release(sb); - } - - /* clean up the current sync_buff */ - if ((sb = get_curr_sync_buff(0))) { - ip_vs_sync_buff_release(sb); - } - - /* release the sending multicast socket */ - sock_release(tinfo->sock); - kfree(tinfo); - - return 0; -} - - -static int sync_thread_backup(void *data) -{ - struct ip_vs_sync_thread_data *tinfo = data; - int len; - - IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, " - "syncid = %d\n", - ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); - - while (!kthread_should_stop()) { - wait_event_interruptible(*tinfo->sock->sk->sk_sleep, - !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) - || kthread_should_stop()); - - /* do we have data now? */ - while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { - len = ip_vs_receive(tinfo->sock, tinfo->buf, - sync_recv_mesg_maxlen); - if (len <= 0) { - IP_VS_ERR("receiving message error\n"); - break; - } - - /* disable bottom half, because it accesses the data - shared by softirq while getting/creating conns */ - local_bh_disable(); - ip_vs_process_message(tinfo->buf, len); - local_bh_enable(); - } - } - - /* release the sending multicast socket */ - sock_release(tinfo->sock); - kfree(tinfo->buf); - kfree(tinfo); - - return 0; -} - - -int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) -{ - struct ip_vs_sync_thread_data *tinfo; - struct task_struct **realtask, *task; - struct socket *sock; - char *name, *buf = NULL; - int (*threadfn)(void *data); - int result = -ENOMEM; - - IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); - IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", - sizeof(struct ip_vs_sync_conn)); - - if (state == IP_VS_STATE_MASTER) { - if (sync_master_thread) - return -EEXIST; - - strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, - sizeof(ip_vs_master_mcast_ifn)); - ip_vs_master_syncid = syncid; - realtask = &sync_master_thread; - name = "ipvs_syncmaster"; - threadfn = sync_thread_master; - sock = make_send_sock(); - } else if (state == IP_VS_STATE_BACKUP) { - if (sync_backup_thread) - return -EEXIST; - - strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, - sizeof(ip_vs_backup_mcast_ifn)); - ip_vs_backup_syncid = syncid; - realtask = &sync_backup_thread; - name = "ipvs_syncbackup"; - threadfn = sync_thread_backup; - sock = make_receive_sock(); - } else { - return -EINVAL; - } - - if (IS_ERR(sock)) { - result = PTR_ERR(sock); - goto out; - } - - set_sync_mesg_maxlen(state); - if (state == IP_VS_STATE_BACKUP) { - buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL); - if (!buf) - goto outsocket; - } - - tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); - if (!tinfo) - goto outbuf; - - tinfo->sock = sock; - tinfo->buf = buf; - - task = kthread_run(threadfn, tinfo, name); - if (IS_ERR(task)) { - result = PTR_ERR(task); - goto outtinfo; - } - - /* mark as active */ - *realtask = task; - ip_vs_sync_state |= state; - - /* increase the module use count */ - ip_vs_use_count_inc(); - - return 0; - -outtinfo: - kfree(tinfo); -outbuf: - kfree(buf); -outsocket: - sock_release(sock); -out: - return result; -} - - -int stop_sync_thread(int state) -{ - IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); - - if (state == IP_VS_STATE_MASTER) { - if (!sync_master_thread) - return -ESRCH; - - IP_VS_INFO("stopping master sync thread %d ...\n", - task_pid_nr(sync_master_thread)); - - /* - * The lock synchronizes with sb_queue_tail(), so that we don't - * add sync buffers to the queue, when we are already in - * progress of stopping the master sync daemon. - */ - - spin_lock_bh(&ip_vs_sync_lock); - ip_vs_sync_state &= ~IP_VS_STATE_MASTER; - spin_unlock_bh(&ip_vs_sync_lock); - kthread_stop(sync_master_thread); - sync_master_thread = NULL; - } else if (state == IP_VS_STATE_BACKUP) { - if (!sync_backup_thread) - return -ESRCH; - - IP_VS_INFO("stopping backup sync thread %d ...\n", - task_pid_nr(sync_backup_thread)); - - ip_vs_sync_state &= ~IP_VS_STATE_BACKUP; - kthread_stop(sync_backup_thread); - sync_backup_thread = NULL; - } else { - return -EINVAL; - } - - /* decrease the module use count */ - ip_vs_use_count_dec(); - - return 0; -} diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c deleted file mode 100644 index 8c596e7..0000000 --- a/net/ipv4/ipvs/ip_vs_wlc.c +++ /dev/null @@ -1,128 +0,0 @@ -/* - * IPVS: Weighted Least-Connection Scheduling module - * - * Authors: Wensong Zhang - * Peter Kese - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest - * Wensong Zhang : changed to use the inactconns in scheduling - * Wensong Zhang : changed some comestics things for debugging - * Wensong Zhang : changed for the d-linked destination list - * Wensong Zhang : added the ip_vs_wlc_update_svc - * Wensong Zhang : added any dest with weight=0 is quiesced - * - */ - -#include -#include - -#include - - -static inline unsigned int -ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We think the overhead of processing active connections is 256 - * times higher than that of inactive connections in average. (This - * 256 times might not be accurate, we will change it later) We - * use the following formula to estimate the overhead now: - * dest->activeconns*256 + dest->inactconns - */ - return (atomic_read(&dest->activeconns) << 8) + - atomic_read(&dest->inactconns); -} - - -/* - * Weighted Least Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest, *least; - unsigned int loh, doh; - - IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); - - /* - * We calculate the load of each dest server as follows: - * (dest overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connections. - */ - - list_for_each_entry(dest, &svc->destinations, n_list) { - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) > 0) { - least = dest; - loh = ip_vs_wlc_dest_overhead(least); - goto nextstage; - } - } - return NULL; - - /* - * Find the destination with the least load. - */ - nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - doh = ip_vs_wlc_dest_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG_BUF(6, "WLC: server %s:%u " - "activeconns %d refcnt %d weight %d overhead %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -static struct ip_vs_scheduler ip_vs_wlc_scheduler = -{ - .name = "wlc", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 1, -#endif - .schedule = ip_vs_wlc_schedule, -}; - - -static int __init ip_vs_wlc_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_wlc_scheduler); -} - -static void __exit ip_vs_wlc_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); -} - -module_init(ip_vs_wlc_init); -module_exit(ip_vs_wlc_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c deleted file mode 100644 index 7ea92fe..0000000 --- a/net/ipv4/ipvs/ip_vs_wrr.c +++ /dev/null @@ -1,237 +0,0 @@ -/* - * IPVS: Weighted Round-Robin Scheduling module - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest - * Wensong Zhang : changed some comestics things for debugging - * Wensong Zhang : changed for the d-linked destination list - * Wensong Zhang : added the ip_vs_wrr_update_svc - * Julian Anastasov : fixed the bug of returning destination - * with weight 0 when all weights are zero - * - */ - -#include -#include -#include - -#include - -/* - * current destination pointer for weighted round-robin scheduling - */ -struct ip_vs_wrr_mark { - struct list_head *cl; /* current list head */ - int cw; /* current weight */ - int mw; /* maximum weight */ - int di; /* decreasing interval */ -}; - - -/* - * Get the gcd of server weights - */ -static int gcd(int a, int b) -{ - int c; - - while ((c = a % b)) { - a = b; - b = c; - } - return b; -} - -static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) -{ - struct ip_vs_dest *dest; - int weight; - int g = 0; - - list_for_each_entry(dest, &svc->destinations, n_list) { - weight = atomic_read(&dest->weight); - if (weight > 0) { - if (g > 0) - g = gcd(weight, g); - else - g = weight; - } - } - return g ? g : 1; -} - - -/* - * Get the maximum weight of the service destinations. - */ -static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) -{ - struct ip_vs_dest *dest; - int weight = 0; - - list_for_each_entry(dest, &svc->destinations, n_list) { - if (atomic_read(&dest->weight) > weight) - weight = atomic_read(&dest->weight); - } - - return weight; -} - - -static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) -{ - struct ip_vs_wrr_mark *mark; - - /* - * Allocate the mark variable for WRR scheduling - */ - mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); - if (mark == NULL) { - IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n"); - return -ENOMEM; - } - mark->cl = &svc->destinations; - mark->cw = 0; - mark->mw = ip_vs_wrr_max_weight(svc); - mark->di = ip_vs_wrr_gcd_weight(svc); - svc->sched_data = mark; - - return 0; -} - - -static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) -{ - /* - * Release the mark variable - */ - kfree(svc->sched_data); - - return 0; -} - - -static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) -{ - struct ip_vs_wrr_mark *mark = svc->sched_data; - - mark->cl = &svc->destinations; - mark->mw = ip_vs_wrr_max_weight(svc); - mark->di = ip_vs_wrr_gcd_weight(svc); - if (mark->cw > mark->mw) - mark->cw = 0; - return 0; -} - - -/* - * Weighted Round-Robin Scheduling - */ -static struct ip_vs_dest * -ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest; - struct ip_vs_wrr_mark *mark = svc->sched_data; - struct list_head *p; - - IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n"); - - /* - * This loop will always terminate, because mark->cw in (0, max_weight] - * and at least one server has its weight equal to max_weight. - */ - write_lock(&svc->sched_lock); - p = mark->cl; - while (1) { - if (mark->cl == &svc->destinations) { - /* it is at the head of the destination list */ - - if (mark->cl == mark->cl->next) { - /* no dest entry */ - dest = NULL; - goto out; - } - - mark->cl = svc->destinations.next; - mark->cw -= mark->di; - if (mark->cw <= 0) { - mark->cw = mark->mw; - /* - * Still zero, which means no available servers. - */ - if (mark->cw == 0) { - mark->cl = &svc->destinations; - IP_VS_ERR_RL("ip_vs_wrr_schedule(): " - "no available servers\n"); - dest = NULL; - goto out; - } - } - } else - mark->cl = mark->cl->next; - - if (mark->cl != &svc->destinations) { - /* not at the head of the list */ - dest = list_entry(mark->cl, struct ip_vs_dest, n_list); - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) >= mark->cw) { - /* got it */ - break; - } - } - - if (mark->cl == p && mark->cw == mark->di) { - /* back to the start, and no dest is found. - It is only possible when all dests are OVERLOADED */ - dest = NULL; - goto out; - } - } - - IP_VS_DBG_BUF(6, "WRR: server %s:%u " - "activeconns %d refcnt %d weight %d\n", - IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), - atomic_read(&dest->activeconns), - atomic_read(&dest->refcnt), - atomic_read(&dest->weight)); - - out: - write_unlock(&svc->sched_lock); - return dest; -} - - -static struct ip_vs_scheduler ip_vs_wrr_scheduler = { - .name = "wrr", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 1, -#endif - .init_service = ip_vs_wrr_init_svc, - .done_service = ip_vs_wrr_done_svc, - .update_service = ip_vs_wrr_update_svc, - .schedule = ip_vs_wrr_schedule, -}; - -static int __init ip_vs_wrr_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; -} - -static void __exit ip_vs_wrr_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); -} - -module_init(ip_vs_wrr_init); -module_exit(ip_vs_wrr_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c deleted file mode 100644 index 02ddc2b..0000000 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ /dev/null @@ -1,1004 +0,0 @@ -/* - * ip_vs_xmit.c: various packet transmitters for IPVS - * - * Authors: Wensong Zhang - * Julian Anastasov - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include -#include /* for tcphdr */ -#include -#include /* for csum_tcpudp_magic */ -#include -#include /* for icmp_send */ -#include /* for ip_route_output */ -#include -#include -#include -#include -#include - -#include - - -/* - * Destination cache to speed up outgoing route lookup - */ -static inline void -__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) -{ - struct dst_entry *old_dst; - - old_dst = dest->dst_cache; - dest->dst_cache = dst; - dest->dst_rtos = rtos; - dst_release(old_dst); -} - -static inline struct dst_entry * -__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) -{ - struct dst_entry *dst = dest->dst_cache; - - if (!dst) - return NULL; - if ((dst->obsolete - || (dest->af == AF_INET && rtos != dest->dst_rtos)) && - dst->ops->check(dst, cookie) == NULL) { - dest->dst_cache = NULL; - dst_release(dst); - return NULL; - } - dst_hold(dst); - return dst; -} - -static struct rtable * -__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) -{ - struct rtable *rt; /* Route to the other host */ - struct ip_vs_dest *dest = cp->dest; - - if (dest) { - spin_lock(&dest->dst_lock); - if (!(rt = (struct rtable *) - __ip_vs_dst_check(dest, rtos, 0))) { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip4_u = { - .daddr = dest->addr.ip, - .saddr = 0, - .tos = rtos, } }, - }; - - if (ip_route_output_key(&init_net, &rt, &fl)) { - spin_unlock(&dest->dst_lock); - IP_VS_DBG_RL("ip_route_output error, " - "dest: %u.%u.%u.%u\n", - NIPQUAD(dest->addr.ip)); - return NULL; - } - __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); - IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n", - NIPQUAD(dest->addr.ip), - atomic_read(&rt->u.dst.__refcnt), rtos); - } - spin_unlock(&dest->dst_lock); - } else { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip4_u = { - .daddr = cp->daddr.ip, - .saddr = 0, - .tos = rtos, } }, - }; - - if (ip_route_output_key(&init_net, &rt, &fl)) { - IP_VS_DBG_RL("ip_route_output error, dest: " - "%u.%u.%u.%u\n", NIPQUAD(cp->daddr.ip)); - return NULL; - } - } - - return rt; -} - -#ifdef CONFIG_IP_VS_IPV6 -static struct rt6_info * -__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp) -{ - struct rt6_info *rt; /* Route to the other host */ - struct ip_vs_dest *dest = cp->dest; - - if (dest) { - spin_lock(&dest->dst_lock); - rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0); - if (!rt) { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = dest->addr.in6, - .saddr = { - .s6_addr32 = - { 0, 0, 0, 0 }, - }, - }, - }, - }; - - rt = (struct rt6_info *)ip6_route_output(&init_net, - NULL, &fl); - if (!rt) { - spin_unlock(&dest->dst_lock); - IP_VS_DBG_RL("ip6_route_output error, " - "dest: " NIP6_FMT "\n", - NIP6(dest->addr.in6)); - return NULL; - } - __ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst)); - IP_VS_DBG(10, "new dst " NIP6_FMT ", refcnt=%d\n", - NIP6(dest->addr.in6), - atomic_read(&rt->u.dst.__refcnt)); - } - spin_unlock(&dest->dst_lock); - } else { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = cp->daddr.in6, - .saddr = { - .s6_addr32 = { 0, 0, 0, 0 }, - }, - }, - }, - }; - - rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); - if (!rt) { - IP_VS_DBG_RL("ip6_route_output error, dest: " - NIP6_FMT "\n", NIP6(cp->daddr.in6)); - return NULL; - } - } - - return rt; -} -#endif - - -/* - * Release dest->dst_cache before a dest is removed - */ -void -ip_vs_dst_reset(struct ip_vs_dest *dest) -{ - struct dst_entry *old_dst; - - old_dst = dest->dst_cache; - dest->dst_cache = NULL; - dst_release(old_dst); -} - -#define IP_VS_XMIT(pf, skb, rt) \ -do { \ - (skb)->ipvs_property = 1; \ - skb_forward_csum(skb); \ - NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ - (rt)->u.dst.dev, dst_output); \ -} while (0) - - -/* - * NULL transmitter (do nothing except return NF_ACCEPT) - */ -int -ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - /* we do not touch skb and do not need pskb ptr */ - return NF_ACCEPT; -} - - -/* - * Bypass transmitter - * Let packets bypass the destination when the destination is not - * available, it may be only used in transparent cache cluster. - */ -int -ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rtable *rt; /* Route to the other host */ - struct iphdr *iph = ip_hdr(skb); - u8 tos = iph->tos; - int mtu; - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip4_u = { - .daddr = iph->daddr, - .saddr = 0, - .tos = RT_TOS(tos), } }, - }; - - EnterFunction(10); - - if (ip_route_output_key(&init_net, &rt, &fl)) { - IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, " - "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr)); - goto tx_error_icmp; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); - goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { - ip_rt_put(rt); - return NF_STOLEN; - } - ip_send_check(ip_hdr(skb)); - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET, skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} - -#ifdef CONFIG_IP_VS_IPV6 -int -ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rt6_info *rt; /* Route to the other host */ - struct ipv6hdr *iph = ipv6_hdr(skb); - int mtu; - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = iph->daddr, - .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, - }; - - EnterFunction(10); - - rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); - if (!rt) { - IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): ip6_route_output error, " - "dest: " NIP6_FMT "\n", NIP6(iph->daddr)); - goto tx_error_icmp; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if (skb->len > mtu) { - dst_release(&rt->u.dst); - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): frag needed\n"); - goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - skb = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(skb == NULL)) { - dst_release(&rt->u.dst); - return NF_STOLEN; - } - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET6, skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} -#endif - -/* - * NAT transmitter (only for outside-to-inside nat forwarding) - * Not used for related ICMP - */ -int -ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rtable *rt; /* Route to the other host */ - int mtu; - struct iphdr *iph = ip_hdr(skb); - - EnterFunction(10); - - /* check if it is a connection of no-client-port */ - if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { - __be16 _pt, *p; - p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); - if (p == NULL) - goto tx_error; - ip_vs_conn_fill_cport(cp, *p); - IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); - } - - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); - goto tx_error; - } - - /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, sizeof(struct iphdr))) - goto tx_error_put; - - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) - goto tx_error_put; - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* mangle the packet */ - if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) - goto tx_error; - ip_hdr(skb)->daddr = cp->daddr.ip; - ip_send_check(ip_hdr(skb)); - - IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); - - /* FIXME: when application helper enlarges the packet and the length - is larger than the MTU of outgoing device, there will be still - MTU problem. */ - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET, skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - LeaveFunction(10); - kfree_skb(skb); - return NF_STOLEN; - tx_error_put: - ip_rt_put(rt); - goto tx_error; -} - -#ifdef CONFIG_IP_VS_IPV6 -int -ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rt6_info *rt; /* Route to the other host */ - int mtu; - - EnterFunction(10); - - /* check if it is a connection of no-client-port */ - if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { - __be16 _pt, *p; - p = skb_header_pointer(skb, sizeof(struct ipv6hdr), - sizeof(_pt), &_pt); - if (p == NULL) - goto tx_error; - ip_vs_conn_fill_cport(cp, *p); - IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); - } - - rt = __ip_vs_get_out_rt_v6(cp); - if (!rt) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if (skb->len > mtu) { - dst_release(&rt->u.dst); - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "ip_vs_nat_xmit_v6(): frag needed for"); - goto tx_error; - } - - /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) - goto tx_error_put; - - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) - goto tx_error_put; - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* mangle the packet */ - if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) - goto tx_error; - ipv6_hdr(skb)->daddr = cp->daddr.in6; - - IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); - - /* FIXME: when application helper enlarges the packet and the length - is larger than the MTU of outgoing device, there will be still - MTU problem. */ - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET6, skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - -tx_error_icmp: - dst_link_failure(skb); -tx_error: - LeaveFunction(10); - kfree_skb(skb); - return NF_STOLEN; -tx_error_put: - dst_release(&rt->u.dst); - goto tx_error; -} -#endif - - -/* - * IP Tunneling transmitter - * - * This function encapsulates the packet in a new IP packet, its - * destination will be set to cp->daddr. Most code of this function - * is taken from ipip.c. - * - * It is used in VS/TUN cluster. The load balancer selects a real - * server from a cluster based on a scheduling algorithm, - * encapsulates the request packet and forwards it to the selected - * server. For example, all real servers are configured with - * "ifconfig tunl0 up". When the server receives - * the encapsulated packet, it will decapsulate the packet, processe - * the request and return the response packets directly to the client - * without passing the load balancer. This can greatly increase the - * scalability of virtual server. - * - * Used for ANY protocol - */ -int -ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ - struct iphdr *old_iph = ip_hdr(skb); - u8 tos = old_iph->tos; - __be16 df = old_iph->frag_off; - sk_buff_data_t old_transport_header = skb->transport_header; - struct iphdr *iph; /* Our new IP header */ - unsigned int max_headroom; /* The extra header space needed */ - int mtu; - - EnterFunction(10); - - if (skb->protocol != htons(ETH_P_IP)) { - IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " - "ETH_P_IP: %d, skb protocol: %d\n", - htons(ETH_P_IP), skb->protocol); - goto tx_error; - } - - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) - goto tx_error_icmp; - - tdev = rt->u.dst.dev; - - mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); - if (mtu < 68) { - ip_rt_put(rt); - IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); - goto tx_error; - } - if (skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); - - df |= (old_iph->frag_off & htons(IP_DF)); - - if ((old_iph->frag_off & htons(IP_DF)) - && mtu < ntohs(old_iph->tot_len)) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); - IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n"); - goto tx_error; - } - - /* - * Okay, now see if we can stuff it in the buffer as-is. - */ - max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); - - if (skb_headroom(skb) < max_headroom - || skb_cloned(skb) || skb_shared(skb)) { - struct sk_buff *new_skb = - skb_realloc_headroom(skb, max_headroom); - if (!new_skb) { - ip_rt_put(rt); - kfree_skb(skb); - IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n"); - return NF_STOLEN; - } - kfree_skb(skb); - skb = new_skb; - old_iph = ip_hdr(skb); - } - - skb->transport_header = old_transport_header; - - /* fix old IP header checksum */ - ip_send_check(old_iph); - - skb_push(skb, sizeof(struct iphdr)); - skb_reset_network_header(skb); - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* - * Push down and install the IPIP header. - */ - iph = ip_hdr(skb); - iph->version = 4; - iph->ihl = sizeof(struct iphdr)>>2; - iph->frag_off = df; - iph->protocol = IPPROTO_IPIP; - iph->tos = tos; - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; - iph->ttl = old_iph->ttl; - ip_select_ident(iph, &rt->u.dst, NULL); - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - ip_local_out(skb); - - LeaveFunction(10); - - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} - -#ifdef CONFIG_IP_VS_IPV6 -int -ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rt6_info *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ - struct ipv6hdr *old_iph = ipv6_hdr(skb); - sk_buff_data_t old_transport_header = skb->transport_header; - struct ipv6hdr *iph; /* Our new IP header */ - unsigned int max_headroom; /* The extra header space needed */ - int mtu; - - EnterFunction(10); - - if (skb->protocol != htons(ETH_P_IPV6)) { - IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): protocol error, " - "ETH_P_IPV6: %d, skb protocol: %d\n", - htons(ETH_P_IPV6), skb->protocol); - goto tx_error; - } - - rt = __ip_vs_get_out_rt_v6(cp); - if (!rt) - goto tx_error_icmp; - - tdev = rt->u.dst.dev; - - mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr); - /* TODO IPv6: do we need this check in IPv6? */ - if (mtu < 1280) { - dst_release(&rt->u.dst); - IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): mtu less than 1280\n"); - goto tx_error; - } - if (skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); - - if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - dst_release(&rt->u.dst); - IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): frag needed\n"); - goto tx_error; - } - - /* - * Okay, now see if we can stuff it in the buffer as-is. - */ - max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); - - if (skb_headroom(skb) < max_headroom - || skb_cloned(skb) || skb_shared(skb)) { - struct sk_buff *new_skb = - skb_realloc_headroom(skb, max_headroom); - if (!new_skb) { - dst_release(&rt->u.dst); - kfree_skb(skb); - IP_VS_ERR_RL("ip_vs_tunnel_xmit_v6(): no memory\n"); - return NF_STOLEN; - } - kfree_skb(skb); - skb = new_skb; - old_iph = ipv6_hdr(skb); - } - - skb->transport_header = old_transport_header; - - skb_push(skb, sizeof(struct ipv6hdr)); - skb_reset_network_header(skb); - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* - * Push down and install the IPIP header. - */ - iph = ipv6_hdr(skb); - iph->version = 6; - iph->nexthdr = IPPROTO_IPV6; - iph->payload_len = old_iph->payload_len + sizeof(old_iph); - iph->priority = old_iph->priority; - memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); - iph->daddr = rt->rt6i_dst.addr; - iph->saddr = cp->vaddr.in6; /* rt->rt6i_src.addr; */ - iph->hop_limit = old_iph->hop_limit; - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - ip6_local_out(skb); - - LeaveFunction(10); - - return NF_STOLEN; - -tx_error_icmp: - dst_link_failure(skb); -tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} -#endif - - -/* - * Direct Routing transmitter - * Used for ANY protocol - */ -int -ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rtable *rt; /* Route to the other host */ - struct iphdr *iph = ip_hdr(skb); - int mtu; - - EnterFunction(10); - - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); - IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n"); - goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { - ip_rt_put(rt); - return NF_STOLEN; - } - ip_send_check(ip_hdr(skb)); - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET, skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} - -#ifdef CONFIG_IP_VS_IPV6 -int -ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rt6_info *rt; /* Route to the other host */ - int mtu; - - EnterFunction(10); - - rt = __ip_vs_get_out_rt_v6(cp); - if (!rt) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if (skb->len > mtu) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - dst_release(&rt->u.dst); - IP_VS_DBG_RL("ip_vs_dr_xmit_v6(): frag needed\n"); - goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - skb = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(skb == NULL)) { - dst_release(&rt->u.dst); - return NF_STOLEN; - } - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET6, skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - -tx_error_icmp: - dst_link_failure(skb); -tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} -#endif - - -/* - * ICMP packet transmitter - * called by the ip_vs_in_icmp - */ -int -ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, int offset) -{ - struct rtable *rt; /* Route to the other host */ - int mtu; - int rc; - - EnterFunction(10); - - /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be - forwarded directly here, because there is no need to - translate address/port back */ - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { - if (cp->packet_xmit) - rc = cp->packet_xmit(skb, cp, pp); - else - rc = NF_ACCEPT; - /* do not touch skb anymore */ - atomic_inc(&cp->in_pkts); - goto out; - } - - /* - * mangle and send the packet here (only for VS/NAT) - */ - - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos)))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); - goto tx_error; - } - - /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, offset)) - goto tx_error_put; - - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) - goto tx_error_put; - - /* drop the old route when skb is not shared */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - ip_vs_nat_icmp(skb, pp, cp, 0); - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET, skb, rt); - - rc = NF_STOLEN; - goto out; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - dev_kfree_skb(skb); - rc = NF_STOLEN; - out: - LeaveFunction(10); - return rc; - tx_error_put: - ip_rt_put(rt); - goto tx_error; -} - -#ifdef CONFIG_IP_VS_IPV6 -int -ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, int offset) -{ - struct rt6_info *rt; /* Route to the other host */ - int mtu; - int rc; - - EnterFunction(10); - - /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be - forwarded directly here, because there is no need to - translate address/port back */ - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { - if (cp->packet_xmit) - rc = cp->packet_xmit(skb, cp, pp); - else - rc = NF_ACCEPT; - /* do not touch skb anymore */ - atomic_inc(&cp->in_pkts); - goto out; - } - - /* - * mangle and send the packet here (only for VS/NAT) - */ - - rt = __ip_vs_get_out_rt_v6(cp); - if (!rt) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if (skb->len > mtu) { - dst_release(&rt->u.dst); - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); - goto tx_error; - } - - /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, offset)) - goto tx_error_put; - - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) - goto tx_error_put; - - /* drop the old route when skb is not shared */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - ip_vs_nat_icmp_v6(skb, pp, cp, 0); - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET6, skb, rt); - - rc = NF_STOLEN; - goto out; - -tx_error_icmp: - dst_link_failure(skb); -tx_error: - dev_kfree_skb(skb); - rc = NF_STOLEN; -out: - LeaveFunction(10); - return rc; -tx_error_put: - dst_release(&rt->u.dst); - goto tx_error; -} -#endif diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index ee898e7..73f9378 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -838,3 +838,5 @@ config NETFILTER_XT_MATCH_HASHLIMIT endmenu +source "net/netfilter/ipvs/Kconfig" + diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 3bd2cc5..cf75055 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -83,3 +83,6 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o + +# IPVS +obj-$(CONFIG_IP_VS) += ipvs/ diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig new file mode 100644 index 0000000..de6004d --- /dev/null +++ b/net/netfilter/ipvs/Kconfig @@ -0,0 +1,239 @@ +# +# IP Virtual Server configuration +# +menuconfig IP_VS + tristate "IP virtual server support (EXPERIMENTAL)" + depends on NETFILTER + ---help--- + IP Virtual Server support will let you build a high-performance + virtual server based on cluster of two or more real servers. This + option must be enabled for at least one of the clustered computers + that will take care of intercepting incoming connections to a + single IP address and scheduling them to real servers. + + Three request dispatching techniques are implemented, they are + virtual server via NAT, virtual server via tunneling and virtual + server via direct routing. The several scheduling algorithms can + be used to choose which server the connection is directed to, + thus load balancing can be achieved among the servers. For more + information and its administration program, please visit the + following URL: . + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +if IP_VS + +config IP_VS_IPV6 + bool "IPv6 support for IPVS (DANGEROUS)" + depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6) + ---help--- + Add IPv6 support to IPVS. This is incomplete and might be dangerous. + + Say N if unsure. + +config IP_VS_DEBUG + bool "IP virtual server debugging" + ---help--- + Say Y here if you want to get additional messages useful in + debugging the IP virtual server code. You can change the debug + level in /proc/sys/net/ipv4/vs/debug_level + +config IP_VS_TAB_BITS + int "IPVS connection table size (the Nth power of 2)" + range 8 20 + default 12 + ---help--- + The IPVS connection hash table uses the chaining scheme to handle + hash collisions. Using a big IPVS connection hash table will greatly + reduce conflicts when there are hundreds of thousands of connections + in the hash table. + + Note the table size must be power of 2. The table size will be the + value of 2 to the your input number power. The number to choose is + from 8 to 20, the default number is 12, which means the table size + is 4096. Don't input the number too small, otherwise you will lose + performance on it. You can adapt the table size yourself, according + to your virtual server application. It is good to set the table size + not far less than the number of connections per second multiplying + average lasting time of connection in the table. For example, your + virtual server gets 200 connections per second, the connection lasts + for 200 seconds in average in the connection table, the table size + should be not far less than 200x200, it is good to set the table + size 32768 (2**15). + + Another note that each connection occupies 128 bytes effectively and + each hash entry uses 8 bytes, so you can estimate how much memory is + needed for your box. + +comment "IPVS transport protocol load balancing support" + +config IP_VS_PROTO_TCP + bool "TCP load balancing support" + ---help--- + This option enables support for load balancing TCP transport + protocol. Say Y if unsure. + +config IP_VS_PROTO_UDP + bool "UDP load balancing support" + ---help--- + This option enables support for load balancing UDP transport + protocol. Say Y if unsure. + +config IP_VS_PROTO_AH_ESP + bool + depends on UNDEFINED + +config IP_VS_PROTO_ESP + bool "ESP load balancing support" + select IP_VS_PROTO_AH_ESP + ---help--- + This option enables support for load balancing ESP (Encapsulation + Security Payload) transport protocol. Say Y if unsure. + +config IP_VS_PROTO_AH + bool "AH load balancing support" + select IP_VS_PROTO_AH_ESP + ---help--- + This option enables support for load balancing AH (Authentication + Header) transport protocol. Say Y if unsure. + +comment "IPVS scheduler" + +config IP_VS_RR + tristate "round-robin scheduling" + ---help--- + The robin-robin scheduling algorithm simply directs network + connections to different real servers in a round-robin manner. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_WRR + tristate "weighted round-robin scheduling" + ---help--- + The weighted robin-robin scheduling algorithm directs network + connections to different real servers based on server weights + in a round-robin manner. Servers with higher weights receive + new connections first than those with less weights, and servers + with higher weights get more connections than those with less + weights and servers with equal weights get equal connections. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_LC + tristate "least-connection scheduling" + ---help--- + The least-connection scheduling algorithm directs network + connections to the server with the least number of active + connections. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_WLC + tristate "weighted least-connection scheduling" + ---help--- + The weighted least-connection scheduling algorithm directs network + connections to the server with the least active connections + normalized by the server weight. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_LBLC + tristate "locality-based least-connection scheduling" + ---help--- + The locality-based least-connection scheduling algorithm is for + destination IP load balancing. It is usually used in cache cluster. + This algorithm usually directs packet destined for an IP address to + its server if the server is alive and under load. If the server is + overloaded (its active connection numbers is larger than its weight) + and there is a server in its half load, then allocate the weighted + least-connection server to this IP address. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_LBLCR + tristate "locality-based least-connection with replication scheduling" + ---help--- + The locality-based least-connection with replication scheduling + algorithm is also for destination IP load balancing. It is + usually used in cache cluster. It differs from the LBLC scheduling + as follows: the load balancer maintains mappings from a target + to a set of server nodes that can serve the target. Requests for + a target are assigned to the least-connection node in the target's + server set. If all the node in the server set are over loaded, + it picks up a least-connection node in the cluster and adds it + in the sever set for the target. If the server set has not been + modified for the specified time, the most loaded node is removed + from the server set, in order to avoid high degree of replication. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_DH + tristate "destination hashing scheduling" + ---help--- + The destination hashing scheduling algorithm assigns network + connections to the servers through looking up a statically assigned + hash table by their destination IP addresses. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_SH + tristate "source hashing scheduling" + ---help--- + The source hashing scheduling algorithm assigns network + connections to the servers through looking up a statically assigned + hash table by their source IP addresses. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_SED + tristate "shortest expected delay scheduling" + ---help--- + The shortest expected delay scheduling algorithm assigns network + connections to the server with the shortest expected delay. The + expected delay that the job will experience is (Ci + 1) / Ui if + sent to the ith server, in which Ci is the number of connections + on the ith server and Ui is the fixed service rate (weight) + of the ith server. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_NQ + tristate "never queue scheduling" + ---help--- + The never queue scheduling algorithm adopts a two-speed model. + When there is an idle server available, the job will be sent to + the idle server, instead of waiting for a fast one. When there + is no idle server available, the job will be sent to the server + that minimize its expected delay (The Shortest Expected Delay + scheduling algorithm). + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +comment 'IPVS application helper' + +config IP_VS_FTP + tristate "FTP protocol helper" + depends on IP_VS_PROTO_TCP + ---help--- + FTP is a protocol that transfers IP address and/or port number in + the payload. In the virtual server via Network Address Translation, + the IP address and port number of real servers cannot be sent to + clients in ftp connections directly, so FTP protocol helper is + required for tracking the connection and mangling it back to that of + virtual service. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +endif # IP_VS diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile new file mode 100644 index 0000000..73a46fe --- /dev/null +++ b/net/netfilter/ipvs/Makefile @@ -0,0 +1,33 @@ +# +# Makefile for the IPVS modules on top of IPv4. +# + +# IPVS transport protocol load balancing support +ip_vs_proto-objs-y := +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o + +ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ + ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ + ip_vs_est.o ip_vs_proto.o \ + $(ip_vs_proto-objs-y) + + +# IPVS core +obj-$(CONFIG_IP_VS) += ip_vs.o + +# IPVS schedulers +obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o +obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o +obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o +obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o +obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o +obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o +obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o +obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o +obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o +obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o + +# IPVS application helpers +obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c new file mode 100644 index 0000000..201b8ea --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -0,0 +1,622 @@ +/* + * ip_vs_app.c: Application module support for IPVS + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference + * is that ip_vs_app module handles the reverse direction (incoming requests + * and outgoing responses). + * + * IP_MASQ_APP application masquerading module + * + * Author: Juan Jose Ciarlante, + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +EXPORT_SYMBOL(register_ip_vs_app); +EXPORT_SYMBOL(unregister_ip_vs_app); +EXPORT_SYMBOL(register_ip_vs_app_inc); + +/* ipvs application list head */ +static LIST_HEAD(ip_vs_app_list); +static DEFINE_MUTEX(__ip_vs_app_mutex); + + +/* + * Get an ip_vs_app object + */ +static inline int ip_vs_app_get(struct ip_vs_app *app) +{ + return try_module_get(app->module); +} + + +static inline void ip_vs_app_put(struct ip_vs_app *app) +{ + module_put(app->module); +} + + +/* + * Allocate/initialize app incarnation and register it in proto apps. + */ +static int +ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) +{ + struct ip_vs_protocol *pp; + struct ip_vs_app *inc; + int ret; + + if (!(pp = ip_vs_proto_get(proto))) + return -EPROTONOSUPPORT; + + if (!pp->unregister_app) + return -EOPNOTSUPP; + + inc = kmemdup(app, sizeof(*inc), GFP_KERNEL); + if (!inc) + return -ENOMEM; + INIT_LIST_HEAD(&inc->p_list); + INIT_LIST_HEAD(&inc->incs_list); + inc->app = app; + inc->port = htons(port); + atomic_set(&inc->usecnt, 0); + + if (app->timeouts) { + inc->timeout_table = + ip_vs_create_timeout_table(app->timeouts, + app->timeouts_size); + if (!inc->timeout_table) { + ret = -ENOMEM; + goto out; + } + } + + ret = pp->register_app(inc); + if (ret) + goto out; + + list_add(&inc->a_list, &app->incs_list); + IP_VS_DBG(9, "%s application %s:%u registered\n", + pp->name, inc->name, inc->port); + + return 0; + + out: + kfree(inc->timeout_table); + kfree(inc); + return ret; +} + + +/* + * Release app incarnation + */ +static void +ip_vs_app_inc_release(struct ip_vs_app *inc) +{ + struct ip_vs_protocol *pp; + + if (!(pp = ip_vs_proto_get(inc->protocol))) + return; + + if (pp->unregister_app) + pp->unregister_app(inc); + + IP_VS_DBG(9, "%s App %s:%u unregistered\n", + pp->name, inc->name, inc->port); + + list_del(&inc->a_list); + + kfree(inc->timeout_table); + kfree(inc); +} + + +/* + * Get reference to app inc (only called from softirq) + * + */ +int ip_vs_app_inc_get(struct ip_vs_app *inc) +{ + int result; + + atomic_inc(&inc->usecnt); + if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) + atomic_dec(&inc->usecnt); + return result; +} + + +/* + * Put the app inc (only called from timer or net softirq) + */ +void ip_vs_app_inc_put(struct ip_vs_app *inc) +{ + ip_vs_app_put(inc->app); + atomic_dec(&inc->usecnt); +} + + +/* + * Register an application incarnation in protocol applications + */ +int +register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) +{ + int result; + + mutex_lock(&__ip_vs_app_mutex); + + result = ip_vs_app_inc_new(app, proto, port); + + mutex_unlock(&__ip_vs_app_mutex); + + return result; +} + + +/* + * ip_vs_app registration routine + */ +int register_ip_vs_app(struct ip_vs_app *app) +{ + /* increase the module use count */ + ip_vs_use_count_inc(); + + mutex_lock(&__ip_vs_app_mutex); + + list_add(&app->a_list, &ip_vs_app_list); + + mutex_unlock(&__ip_vs_app_mutex); + + return 0; +} + + +/* + * ip_vs_app unregistration routine + * We are sure there are no app incarnations attached to services + */ +void unregister_ip_vs_app(struct ip_vs_app *app) +{ + struct ip_vs_app *inc, *nxt; + + mutex_lock(&__ip_vs_app_mutex); + + list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { + ip_vs_app_inc_release(inc); + } + + list_del(&app->a_list); + + mutex_unlock(&__ip_vs_app_mutex); + + /* decrease the module use count */ + ip_vs_use_count_dec(); +} + + +/* + * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) + */ +int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp) +{ + return pp->app_conn_bind(cp); +} + + +/* + * Unbind cp from application incarnation (called by cp destructor) + */ +void ip_vs_unbind_app(struct ip_vs_conn *cp) +{ + struct ip_vs_app *inc = cp->app; + + if (!inc) + return; + + if (inc->unbind_conn) + inc->unbind_conn(inc, cp); + if (inc->done_conn) + inc->done_conn(inc, cp); + ip_vs_app_inc_put(inc); + cp->app = NULL; +} + + +/* + * Fixes th->seq based on ip_vs_seq info. + */ +static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 seq = ntohl(th->seq); + + /* + * Adjust seq with delta-offset for all packets after + * the most recent resized pkt seq and with previous_delta offset + * for all packets before most recent resized pkt seq. + */ + if (vseq->delta || vseq->previous_delta) { + if(after(seq, vseq->init_seq)) { + th->seq = htonl(seq + vseq->delta); + IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n", + vseq->delta); + } else { + th->seq = htonl(seq + vseq->previous_delta); + IP_VS_DBG(9, "vs_fix_seq(): added previous_delta " + "(%d) to seq\n", vseq->previous_delta); + } + } +} + + +/* + * Fixes th->ack_seq based on ip_vs_seq info. + */ +static inline void +vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 ack_seq = ntohl(th->ack_seq); + + /* + * Adjust ack_seq with delta-offset for + * the packets AFTER most recent resized pkt has caused a shift + * for packets before most recent resized pkt, use previous_delta + */ + if (vseq->delta || vseq->previous_delta) { + /* since ack_seq is the number of octet that is expected + to receive next, so compare it with init_seq+delta */ + if(after(ack_seq, vseq->init_seq+vseq->delta)) { + th->ack_seq = htonl(ack_seq - vseq->delta); + IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta " + "(%d) from ack_seq\n", vseq->delta); + + } else { + th->ack_seq = htonl(ack_seq - vseq->previous_delta); + IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted " + "previous_delta (%d) from ack_seq\n", + vseq->previous_delta); + } + } +} + + +/* + * Updates ip_vs_seq if pkt has been resized + * Assumes already checked proto==IPPROTO_TCP and diff!=0. + */ +static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, + unsigned flag, __u32 seq, int diff) +{ + /* spinlock is to keep updating cp->flags atomic */ + spin_lock(&cp->lock); + if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { + vseq->previous_delta = vseq->delta; + vseq->delta += diff; + vseq->init_seq = seq; + cp->flags |= flag; + } + spin_unlock(&cp->lock); +} + +static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_app *app) +{ + int diff; + const unsigned int tcp_offset = ip_hdrlen(skb); + struct tcphdr *th; + __u32 seq; + + if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) + return 0; + + th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_seq(&cp->out_seq, th); + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_ack_seq(&cp->in_seq, th); + + /* + * Call private output hook function + */ + if (app->pkt_out == NULL) + return 1; + + if (!app->pkt_out(app, cp, skb, &diff)) + return 0; + + /* + * Update ip_vs seq stuff if len has changed. + */ + if (diff != 0) + vs_seq_update(cp, &cp->out_seq, + IP_VS_CONN_F_OUT_SEQ, seq, diff); + + return 1; +} + +/* + * Output pkt hook. Will call bound ip_vs_app specific function + * called by ipvs packet handler, assumes previously checked cp!=NULL + * returns false if it can't handle packet (oom) + */ +int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_app *app; + + /* + * check if application module is bound to + * this ip_vs_conn. + */ + if ((app = cp->app) == NULL) + return 1; + + /* TCP is complicated */ + if (cp->protocol == IPPROTO_TCP) + return app_tcp_pkt_out(cp, skb, app); + + /* + * Call private output hook function + */ + if (app->pkt_out == NULL) + return 1; + + return app->pkt_out(app, cp, skb, NULL); +} + + +static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_app *app) +{ + int diff; + const unsigned int tcp_offset = ip_hdrlen(skb); + struct tcphdr *th; + __u32 seq; + + if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) + return 0; + + th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_seq(&cp->in_seq, th); + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_ack_seq(&cp->out_seq, th); + + /* + * Call private input hook function + */ + if (app->pkt_in == NULL) + return 1; + + if (!app->pkt_in(app, cp, skb, &diff)) + return 0; + + /* + * Update ip_vs seq stuff if len has changed. + */ + if (diff != 0) + vs_seq_update(cp, &cp->in_seq, + IP_VS_CONN_F_IN_SEQ, seq, diff); + + return 1; +} + +/* + * Input pkt hook. Will call bound ip_vs_app specific function + * called by ipvs packet handler, assumes previously checked cp!=NULL. + * returns false if can't handle packet (oom). + */ +int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_app *app; + + /* + * check if application module is bound to + * this ip_vs_conn. + */ + if ((app = cp->app) == NULL) + return 1; + + /* TCP is complicated */ + if (cp->protocol == IPPROTO_TCP) + return app_tcp_pkt_in(cp, skb, app); + + /* + * Call private input hook function + */ + if (app->pkt_in == NULL) + return 1; + + return app->pkt_in(app, cp, skb, NULL); +} + + +#ifdef CONFIG_PROC_FS +/* + * /proc/net/ip_vs_app entry function + */ + +static struct ip_vs_app *ip_vs_app_idx(loff_t pos) +{ + struct ip_vs_app *app, *inc; + + list_for_each_entry(app, &ip_vs_app_list, a_list) { + list_for_each_entry(inc, &app->incs_list, a_list) { + if (pos-- == 0) + return inc; + } + } + return NULL; + +} + +static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) +{ + mutex_lock(&__ip_vs_app_mutex); + + return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_vs_app *inc, *app; + struct list_head *e; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_app_idx(0); + + inc = v; + app = inc->app; + + if ((e = inc->a_list.next) != &app->incs_list) + return list_entry(e, struct ip_vs_app, a_list); + + /* go on to next application */ + for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) { + app = list_entry(e, struct ip_vs_app, a_list); + list_for_each_entry(inc, &app->incs_list, a_list) { + return inc; + } + } + return NULL; +} + +static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) +{ + mutex_unlock(&__ip_vs_app_mutex); +} + +static int ip_vs_app_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "prot port usecnt name\n"); + else { + const struct ip_vs_app *inc = v; + + seq_printf(seq, "%-3s %-7u %-6d %-17s\n", + ip_vs_proto_name(inc->protocol), + ntohs(inc->port), + atomic_read(&inc->usecnt), + inc->name); + } + return 0; +} + +static const struct seq_operations ip_vs_app_seq_ops = { + .start = ip_vs_app_seq_start, + .next = ip_vs_app_seq_next, + .stop = ip_vs_app_seq_stop, + .show = ip_vs_app_seq_show, +}; + +static int ip_vs_app_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ip_vs_app_seq_ops); +} + +static const struct file_operations ip_vs_app_fops = { + .owner = THIS_MODULE, + .open = ip_vs_app_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + + +/* + * Replace a segment of data with a new segment + */ +int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri, + char *o_buf, int o_len, char *n_buf, int n_len) +{ + int diff; + int o_offset; + int o_left; + + EnterFunction(9); + + diff = n_len - o_len; + o_offset = o_buf - (char *)skb->data; + /* The length of left data after o_buf+o_len in the skb data */ + o_left = skb->len - (o_offset + o_len); + + if (diff <= 0) { + memmove(o_buf + n_len, o_buf + o_len, o_left); + memcpy(o_buf, n_buf, n_len); + skb_trim(skb, skb->len + diff); + } else if (diff <= skb_tailroom(skb)) { + skb_put(skb, diff); + memmove(o_buf + n_len, o_buf + o_len, o_left); + memcpy(o_buf, n_buf, n_len); + } else { + if (pskb_expand_head(skb, skb_headroom(skb), diff, pri)) + return -ENOMEM; + skb_put(skb, diff); + memmove(skb->data + o_offset + n_len, + skb->data + o_offset + o_len, o_left); + skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len); + } + + /* must update the iph total length here */ + ip_hdr(skb)->tot_len = htons(skb->len); + + LeaveFunction(9); + return 0; +} + + +int __init ip_vs_app_init(void) +{ + /* we will replace it with proc_net_ipvs_create() soon */ + proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops); + return 0; +} + + +void ip_vs_app_cleanup(void) +{ + proc_net_remove(&init_net, "ip_vs_app"); +} diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c new file mode 100644 index 0000000..9a24332 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -0,0 +1,1110 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * Peter Kese + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, + * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms + * and others. Many code here is taken from IP MASQ code of kernel 2.2. + * + * Changes: + * + */ + +#include +#include +#include +#include +#include +#include +#include /* for proc_net_* */ +#include +#include +#include + +#include +#include + + +/* + * Connection hash table: for input and output packets lookups of IPVS + */ +static struct list_head *ip_vs_conn_tab; + +/* SLAB cache for IPVS connections */ +static struct kmem_cache *ip_vs_conn_cachep __read_mostly; + +/* counter for current IPVS connections */ +static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); + +/* counter for no client port connections */ +static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); + +/* random value for IPVS connection hash */ +static unsigned int ip_vs_conn_rnd; + +/* + * Fine locking granularity for big connection hash table + */ +#define CT_LOCKARRAY_BITS 4 +#define CT_LOCKARRAY_SIZE (1<ip, (__force u32)port, proto, + ip_vs_conn_rnd) + & IP_VS_CONN_TAB_MASK; +} + + +/* + * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. + * returns bool success. + */ +static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) +{ + unsigned hash; + int ret; + + /* Hash by protocol, client address and port */ + hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); + + ct_write_lock(hash); + + if (!(cp->flags & IP_VS_CONN_F_HASHED)) { + list_add(&cp->c_list, &ip_vs_conn_tab[hash]); + cp->flags |= IP_VS_CONN_F_HASHED; + atomic_inc(&cp->refcnt); + ret = 1; + } else { + IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + ret = 0; + } + + ct_write_unlock(hash); + + return ret; +} + + +/* + * UNhashes ip_vs_conn from ip_vs_conn_tab. + * returns bool success. + */ +static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) +{ + unsigned hash; + int ret; + + /* unhash it and decrease its reference counter */ + hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); + + ct_write_lock(hash); + + if (cp->flags & IP_VS_CONN_F_HASHED) { + list_del(&cp->c_list); + cp->flags &= ~IP_VS_CONN_F_HASHED; + atomic_dec(&cp->refcnt); + ret = 1; + } else + ret = 0; + + ct_write_unlock(hash); + + return ret; +} + + +/* + * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from OUTside-to-INside. + * s_addr, s_port: pkt source address (foreign host) + * d_addr, d_port: pkt dest address (load balancer) + */ +static inline struct ip_vs_conn *__ip_vs_conn_in_get +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port) +{ + unsigned hash; + struct ip_vs_conn *cp; + + hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); + + ct_read_lock(hash); + + list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (cp->af == af && + ip_vs_addr_equal(af, s_addr, &cp->caddr) && + ip_vs_addr_equal(af, d_addr, &cp->vaddr) && + s_port == cp->cport && d_port == cp->vport && + ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && + protocol == cp->protocol) { + /* HIT */ + atomic_inc(&cp->refcnt); + ct_read_unlock(hash); + return cp; + } + } + + ct_read_unlock(hash); + + return NULL; +} + +struct ip_vs_conn *ip_vs_conn_in_get +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port) +{ + struct ip_vs_conn *cp; + + cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port); + if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) + cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr, + d_port); + + IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), + IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + cp ? "hit" : "not hit"); + + return cp; +} + +/* Get reference to connection template */ +struct ip_vs_conn *ip_vs_ct_in_get +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port) +{ + unsigned hash; + struct ip_vs_conn *cp; + + hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); + + ct_read_lock(hash); + + list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (cp->af == af && + ip_vs_addr_equal(af, s_addr, &cp->caddr) && + ip_vs_addr_equal(af, d_addr, &cp->vaddr) && + s_port == cp->cport && d_port == cp->vport && + cp->flags & IP_VS_CONN_F_TEMPLATE && + protocol == cp->protocol) { + /* HIT */ + atomic_inc(&cp->refcnt); + goto out; + } + } + cp = NULL; + + out: + ct_read_unlock(hash); + + IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), + IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + cp ? "hit" : "not hit"); + + return cp; +} + +/* + * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from inside-to-OUTside. + * s_addr, s_port: pkt source address (inside host) + * d_addr, d_port: pkt dest address (foreign host) + */ +struct ip_vs_conn *ip_vs_conn_out_get +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port) +{ + unsigned hash; + struct ip_vs_conn *cp, *ret=NULL; + + /* + * Check for "full" addressed entries + */ + hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port); + + ct_read_lock(hash); + + list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (cp->af == af && + ip_vs_addr_equal(af, d_addr, &cp->caddr) && + ip_vs_addr_equal(af, s_addr, &cp->daddr) && + d_port == cp->cport && s_port == cp->dport && + protocol == cp->protocol) { + /* HIT */ + atomic_inc(&cp->refcnt); + ret = cp; + break; + } + } + + ct_read_unlock(hash); + + IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), + IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + ret ? "hit" : "not hit"); + + return ret; +} + + +/* + * Put back the conn and restart its timer with its timeout + */ +void ip_vs_conn_put(struct ip_vs_conn *cp) +{ + /* reset it expire in its timeout */ + mod_timer(&cp->timer, jiffies+cp->timeout); + + __ip_vs_conn_put(cp); +} + + +/* + * Fill a no_client_port connection with a client port number + */ +void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) +{ + if (ip_vs_conn_unhash(cp)) { + spin_lock(&cp->lock); + if (cp->flags & IP_VS_CONN_F_NO_CPORT) { + atomic_dec(&ip_vs_conn_no_cport_cnt); + cp->flags &= ~IP_VS_CONN_F_NO_CPORT; + cp->cport = cport; + } + spin_unlock(&cp->lock); + + /* hash on new dport */ + ip_vs_conn_hash(cp); + } +} + + +/* + * Bind a connection entry with the corresponding packet_xmit. + * Called by ip_vs_conn_new. + */ +static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) +{ + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + cp->packet_xmit = ip_vs_nat_xmit; + break; + + case IP_VS_CONN_F_TUNNEL: + cp->packet_xmit = ip_vs_tunnel_xmit; + break; + + case IP_VS_CONN_F_DROUTE: + cp->packet_xmit = ip_vs_dr_xmit; + break; + + case IP_VS_CONN_F_LOCALNODE: + cp->packet_xmit = ip_vs_null_xmit; + break; + + case IP_VS_CONN_F_BYPASS: + cp->packet_xmit = ip_vs_bypass_xmit; + break; + } +} + +#ifdef CONFIG_IP_VS_IPV6 +static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp) +{ + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + cp->packet_xmit = ip_vs_nat_xmit_v6; + break; + + case IP_VS_CONN_F_TUNNEL: + cp->packet_xmit = ip_vs_tunnel_xmit_v6; + break; + + case IP_VS_CONN_F_DROUTE: + cp->packet_xmit = ip_vs_dr_xmit_v6; + break; + + case IP_VS_CONN_F_LOCALNODE: + cp->packet_xmit = ip_vs_null_xmit; + break; + + case IP_VS_CONN_F_BYPASS: + cp->packet_xmit = ip_vs_bypass_xmit_v6; + break; + } +} +#endif + + +static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) +{ + return atomic_read(&dest->activeconns) + + atomic_read(&dest->inactconns); +} + +/* + * Bind a connection entry with a virtual service destination + * Called just after a new connection entry is created. + */ +static inline void +ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) +{ + /* if dest is NULL, then return directly */ + if (!dest) + return; + + /* Increase the refcnt counter of the dest */ + atomic_inc(&dest->refcnt); + + /* Bind with the destination and its corresponding transmitter */ + if ((cp->flags & IP_VS_CONN_F_SYNC) && + (!(cp->flags & IP_VS_CONN_F_TEMPLATE))) + /* if the connection is not template and is created + * by sync, preserve the activity flag. + */ + cp->flags |= atomic_read(&dest->conn_flags) & + (~IP_VS_CONN_F_INACTIVE); + else + cp->flags |= atomic_read(&dest->conn_flags); + cp->dest = dest; + + IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " + "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", + ip_vs_proto_name(cp->protocol), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); + + /* Update the connection counters */ + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { + /* It is a normal connection, so increase the inactive + connection counter because it is in TCP SYNRECV + state (inactive) or other protocol inacive state */ + if ((cp->flags & IP_VS_CONN_F_SYNC) && + (!(cp->flags & IP_VS_CONN_F_INACTIVE))) + atomic_inc(&dest->activeconns); + else + atomic_inc(&dest->inactconns); + } else { + /* It is a persistent connection/template, so increase + the peristent connection counter */ + atomic_inc(&dest->persistconns); + } + + if (dest->u_threshold != 0 && + ip_vs_dest_totalconns(dest) >= dest->u_threshold) + dest->flags |= IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Check if there is a destination for the connection, if so + * bind the connection to the destination. + */ +struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) +{ + struct ip_vs_dest *dest; + + if ((cp) && (!cp->dest)) { + dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport, + &cp->vaddr, cp->vport, + cp->protocol); + ip_vs_bind_dest(cp, dest); + return dest; + } else + return NULL; +} + + +/* + * Unbind a connection entry with its VS destination + * Called by the ip_vs_conn_expire function. + */ +static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) +{ + struct ip_vs_dest *dest = cp->dest; + + if (!dest) + return; + + IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d " + "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", + ip_vs_proto_name(cp->protocol), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); + + /* Update the connection counters */ + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { + /* It is a normal connection, so decrease the inactconns + or activeconns counter */ + if (cp->flags & IP_VS_CONN_F_INACTIVE) { + atomic_dec(&dest->inactconns); + } else { + atomic_dec(&dest->activeconns); + } + } else { + /* It is a persistent connection/template, so decrease + the peristent connection counter */ + atomic_dec(&dest->persistconns); + } + + if (dest->l_threshold != 0) { + if (ip_vs_dest_totalconns(dest) < dest->l_threshold) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } else if (dest->u_threshold != 0) { + if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } else { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } + + /* + * Simply decrease the refcnt of the dest, because the + * dest will be either in service's destination list + * or in the trash. + */ + atomic_dec(&dest->refcnt); +} + + +/* + * Checking if the destination of a connection template is available. + * If available, return 1, otherwise invalidate this connection + * template and return 0. + */ +int ip_vs_check_template(struct ip_vs_conn *ct) +{ + struct ip_vs_dest *dest = ct->dest; + + /* + * Checking the dest server status. + */ + if ((dest == NULL) || + !(dest->flags & IP_VS_DEST_F_AVAILABLE) || + (sysctl_ip_vs_expire_quiescent_template && + (atomic_read(&dest->weight) == 0))) { + IP_VS_DBG_BUF(9, "check_template: dest not available for " + "protocol %s s:%s:%d v:%s:%d " + "-> d:%s:%d\n", + ip_vs_proto_name(ct->protocol), + IP_VS_DBG_ADDR(ct->af, &ct->caddr), + ntohs(ct->cport), + IP_VS_DBG_ADDR(ct->af, &ct->vaddr), + ntohs(ct->vport), + IP_VS_DBG_ADDR(ct->af, &ct->daddr), + ntohs(ct->dport)); + + /* + * Invalidate the connection template + */ + if (ct->vport != htons(0xffff)) { + if (ip_vs_conn_unhash(ct)) { + ct->dport = htons(0xffff); + ct->vport = htons(0xffff); + ct->cport = 0; + ip_vs_conn_hash(ct); + } + } + + /* + * Simply decrease the refcnt of the template, + * don't restart its timer. + */ + atomic_dec(&ct->refcnt); + return 0; + } + return 1; +} + +static void ip_vs_conn_expire(unsigned long data) +{ + struct ip_vs_conn *cp = (struct ip_vs_conn *)data; + + cp->timeout = 60*HZ; + + /* + * hey, I'm using it + */ + atomic_inc(&cp->refcnt); + + /* + * do I control anybody? + */ + if (atomic_read(&cp->n_control)) + goto expire_later; + + /* + * unhash it if it is hashed in the conn table + */ + if (!ip_vs_conn_unhash(cp)) + goto expire_later; + + /* + * refcnt==1 implies I'm the only one referrer + */ + if (likely(atomic_read(&cp->refcnt) == 1)) { + /* delete the timer if it is activated by other users */ + if (timer_pending(&cp->timer)) + del_timer(&cp->timer); + + /* does anybody control me? */ + if (cp->control) + ip_vs_control_del(cp); + + if (unlikely(cp->app != NULL)) + ip_vs_unbind_app(cp); + ip_vs_unbind_dest(cp); + if (cp->flags & IP_VS_CONN_F_NO_CPORT) + atomic_dec(&ip_vs_conn_no_cport_cnt); + atomic_dec(&ip_vs_conn_count); + + kmem_cache_free(ip_vs_conn_cachep, cp); + return; + } + + /* hash it back to the table */ + ip_vs_conn_hash(cp); + + expire_later: + IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", + atomic_read(&cp->refcnt)-1, + atomic_read(&cp->n_control)); + + ip_vs_conn_put(cp); +} + + +void ip_vs_conn_expire_now(struct ip_vs_conn *cp) +{ + if (del_timer(&cp->timer)) + mod_timer(&cp->timer, jiffies); +} + + +/* + * Create a new connection entry and hash it into the ip_vs_conn_tab + */ +struct ip_vs_conn * +ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, + const union nf_inet_addr *vaddr, __be16 vport, + const union nf_inet_addr *daddr, __be16 dport, unsigned flags, + struct ip_vs_dest *dest) +{ + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp = ip_vs_proto_get(proto); + + cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); + if (cp == NULL) { + IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n"); + return NULL; + } + + INIT_LIST_HEAD(&cp->c_list); + setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); + cp->af = af; + cp->protocol = proto; + ip_vs_addr_copy(af, &cp->caddr, caddr); + cp->cport = cport; + ip_vs_addr_copy(af, &cp->vaddr, vaddr); + cp->vport = vport; + ip_vs_addr_copy(af, &cp->daddr, daddr); + cp->dport = dport; + cp->flags = flags; + spin_lock_init(&cp->lock); + + /* + * Set the entry is referenced by the current thread before hashing + * it in the table, so that other thread run ip_vs_random_dropentry + * but cannot drop this entry. + */ + atomic_set(&cp->refcnt, 1); + + atomic_set(&cp->n_control, 0); + atomic_set(&cp->in_pkts, 0); + + atomic_inc(&ip_vs_conn_count); + if (flags & IP_VS_CONN_F_NO_CPORT) + atomic_inc(&ip_vs_conn_no_cport_cnt); + + /* Bind the connection with a destination server */ + ip_vs_bind_dest(cp, dest); + + /* Set its state and timeout */ + cp->state = 0; + cp->timeout = 3*HZ; + + /* Bind its packet transmitter */ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_bind_xmit_v6(cp); + else +#endif + ip_vs_bind_xmit(cp); + + if (unlikely(pp && atomic_read(&pp->appcnt))) + ip_vs_bind_app(cp, pp); + + /* Hash it in the ip_vs_conn_tab finally */ + ip_vs_conn_hash(cp); + + return cp; +} + + +/* + * /proc/net/ip_vs_conn entries + */ +#ifdef CONFIG_PROC_FS + +static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) +{ + int idx; + struct ip_vs_conn *cp; + + for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + ct_read_lock_bh(idx); + list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { + if (pos-- == 0) { + seq->private = &ip_vs_conn_tab[idx]; + return cp; + } + } + ct_read_unlock_bh(idx); + } + + return NULL; +} + +static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) +{ + seq->private = NULL; + return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; +} + +static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_vs_conn *cp = v; + struct list_head *e, *l = seq->private; + int idx; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_conn_array(seq, 0); + + /* more on same hash chain? */ + if ((e = cp->c_list.next) != l) + return list_entry(e, struct ip_vs_conn, c_list); + + idx = l - ip_vs_conn_tab; + ct_read_unlock_bh(idx); + + while (++idx < IP_VS_CONN_TAB_SIZE) { + ct_read_lock_bh(idx); + list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { + seq->private = &ip_vs_conn_tab[idx]; + return cp; + } + ct_read_unlock_bh(idx); + } + seq->private = NULL; + return NULL; +} + +static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) +{ + struct list_head *l = seq->private; + + if (l) + ct_read_unlock_bh(l - ip_vs_conn_tab); +} + +static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) +{ + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n"); + else { + const struct ip_vs_conn *cp = v; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + seq_printf(seq, + "%-3s " NIP6_FMT " %04X " NIP6_FMT + " %04X " NIP6_FMT " %04X %-11s %7lu\n", + ip_vs_proto_name(cp->protocol), + NIP6(cp->caddr.in6), ntohs(cp->cport), + NIP6(cp->vaddr.in6), ntohs(cp->vport), + NIP6(cp->daddr.in6), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + (cp->timer.expires-jiffies)/HZ); + else +#endif + seq_printf(seq, + "%-3s %08X %04X %08X %04X" + " %08X %04X %-11s %7lu\n", + ip_vs_proto_name(cp->protocol), + ntohl(cp->caddr.ip), ntohs(cp->cport), + ntohl(cp->vaddr.ip), ntohs(cp->vport), + ntohl(cp->daddr.ip), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + (cp->timer.expires-jiffies)/HZ); + } + return 0; +} + +static const struct seq_operations ip_vs_conn_seq_ops = { + .start = ip_vs_conn_seq_start, + .next = ip_vs_conn_seq_next, + .stop = ip_vs_conn_seq_stop, + .show = ip_vs_conn_seq_show, +}; + +static int ip_vs_conn_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ip_vs_conn_seq_ops); +} + +static const struct file_operations ip_vs_conn_fops = { + .owner = THIS_MODULE, + .open = ip_vs_conn_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const char *ip_vs_origin_name(unsigned flags) +{ + if (flags & IP_VS_CONN_F_SYNC) + return "SYNC"; + else + return "LOCAL"; +} + +static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) +{ + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); + else { + const struct ip_vs_conn *cp = v; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + seq_printf(seq, + "%-3s " NIP6_FMT " %04X " NIP6_FMT + " %04X " NIP6_FMT " %04X %-11s %-6s %7lu\n", + ip_vs_proto_name(cp->protocol), + NIP6(cp->caddr.in6), ntohs(cp->cport), + NIP6(cp->vaddr.in6), ntohs(cp->vport), + NIP6(cp->daddr.in6), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + ip_vs_origin_name(cp->flags), + (cp->timer.expires-jiffies)/HZ); + else +#endif + seq_printf(seq, + "%-3s %08X %04X %08X %04X " + "%08X %04X %-11s %-6s %7lu\n", + ip_vs_proto_name(cp->protocol), + ntohl(cp->caddr.ip), ntohs(cp->cport), + ntohl(cp->vaddr.ip), ntohs(cp->vport), + ntohl(cp->daddr.ip), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + ip_vs_origin_name(cp->flags), + (cp->timer.expires-jiffies)/HZ); + } + return 0; +} + +static const struct seq_operations ip_vs_conn_sync_seq_ops = { + .start = ip_vs_conn_seq_start, + .next = ip_vs_conn_seq_next, + .stop = ip_vs_conn_seq_stop, + .show = ip_vs_conn_sync_seq_show, +}; + +static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ip_vs_conn_sync_seq_ops); +} + +static const struct file_operations ip_vs_conn_sync_fops = { + .owner = THIS_MODULE, + .open = ip_vs_conn_sync_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif + + +/* + * Randomly drop connection entries before running out of memory + */ +static inline int todrop_entry(struct ip_vs_conn *cp) +{ + /* + * The drop rate array needs tuning for real environments. + * Called from timer bh only => no locking + */ + static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + static char todrop_counter[9] = {0}; + int i; + + /* if the conn entry hasn't lasted for 60 seconds, don't drop it. + This will leave enough time for normal connection to get + through. */ + if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) + return 0; + + /* Don't drop the entry if its number of incoming packets is not + located in [0, 8] */ + i = atomic_read(&cp->in_pkts); + if (i > 8 || i < 0) return 0; + + if (!todrop_rate[i]) return 0; + if (--todrop_counter[i] > 0) return 0; + + todrop_counter[i] = todrop_rate[i]; + return 1; +} + +/* Called from keventd and must protect itself from softirqs */ +void ip_vs_random_dropentry(void) +{ + int idx; + struct ip_vs_conn *cp; + + /* + * Randomly scan 1/32 of the whole table every second + */ + for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { + unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; + + /* + * Lock is actually needed in this loop. + */ + ct_write_lock_bh(hash); + + list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + /* connection template */ + continue; + + if (cp->protocol == IPPROTO_TCP) { + switch(cp->state) { + case IP_VS_TCP_S_SYN_RECV: + case IP_VS_TCP_S_SYNACK: + break; + + case IP_VS_TCP_S_ESTABLISHED: + if (todrop_entry(cp)) + break; + continue; + + default: + continue; + } + } else { + if (!todrop_entry(cp)) + continue; + } + + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_expire_now(cp); + if (cp->control) { + IP_VS_DBG(4, "del conn template\n"); + ip_vs_conn_expire_now(cp->control); + } + } + ct_write_unlock_bh(hash); + } +} + + +/* + * Flush all the connection entries in the ip_vs_conn_tab + */ +static void ip_vs_conn_flush(void) +{ + int idx; + struct ip_vs_conn *cp; + + flush_again: + for (idx=0; idxcontrol) { + IP_VS_DBG(4, "del conn template\n"); + ip_vs_conn_expire_now(cp->control); + } + } + ct_write_unlock_bh(idx); + } + + /* the counter may be not NULL, because maybe some conn entries + are run by slow timer handler or unhashed but still referred */ + if (atomic_read(&ip_vs_conn_count) != 0) { + schedule(); + goto flush_again; + } +} + + +int __init ip_vs_conn_init(void) +{ + int idx; + + /* + * Allocate the connection hash table and initialize its list heads + */ + ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); + if (!ip_vs_conn_tab) + return -ENOMEM; + + /* Allocate ip_vs_conn slab cache */ + ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", + sizeof(struct ip_vs_conn), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!ip_vs_conn_cachep) { + vfree(ip_vs_conn_tab); + return -ENOMEM; + } + + IP_VS_INFO("Connection hash table configured " + "(size=%d, memory=%ldKbytes)\n", + IP_VS_CONN_TAB_SIZE, + (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); + IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", + sizeof(struct ip_vs_conn)); + + for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); + } + + for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { + rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); + } + + proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); + proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); + + /* calculate the random value for connection hash */ + get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); + + return 0; +} + + +void ip_vs_conn_cleanup(void) +{ + /* flush all the connection entries first */ + ip_vs_conn_flush(); + + /* Release the empty cache */ + kmem_cache_destroy(ip_vs_conn_cachep); + proc_net_remove(&init_net, "ip_vs_conn"); + proc_net_remove(&init_net, "ip_vs_conn_sync"); + vfree(ip_vs_conn_tab); +} diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c new file mode 100644 index 0000000..958abf3 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -0,0 +1,1542 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * Peter Kese + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, + * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms + * and others. + * + * Changes: + * Paul `Rusty' Russell properly handle non-linear skbs + * Harald Welte don't use nfcache + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include /* for icmp_send */ +#include + +#include +#include + +#ifdef CONFIG_IP_VS_IPV6 +#include +#include +#endif + +#include + + +EXPORT_SYMBOL(register_ip_vs_scheduler); +EXPORT_SYMBOL(unregister_ip_vs_scheduler); +EXPORT_SYMBOL(ip_vs_skb_replace); +EXPORT_SYMBOL(ip_vs_proto_name); +EXPORT_SYMBOL(ip_vs_conn_new); +EXPORT_SYMBOL(ip_vs_conn_in_get); +EXPORT_SYMBOL(ip_vs_conn_out_get); +#ifdef CONFIG_IP_VS_PROTO_TCP +EXPORT_SYMBOL(ip_vs_tcp_conn_listen); +#endif +EXPORT_SYMBOL(ip_vs_conn_put); +#ifdef CONFIG_IP_VS_DEBUG +EXPORT_SYMBOL(ip_vs_get_debug_level); +#endif + + +/* ID used in ICMP lookups */ +#define icmp_id(icmph) (((icmph)->un).echo.id) +#define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) + +const char *ip_vs_proto_name(unsigned proto) +{ + static char buf[20]; + + switch (proto) { + case IPPROTO_IP: + return "IP"; + case IPPROTO_UDP: + return "UDP"; + case IPPROTO_TCP: + return "TCP"; + case IPPROTO_ICMP: + return "ICMP"; +#ifdef CONFIG_IP_VS_IPV6 + case IPPROTO_ICMPV6: + return "ICMPv6"; +#endif + default: + sprintf(buf, "IP_%d", proto); + return buf; + } +} + +void ip_vs_init_hash_table(struct list_head *table, int rows) +{ + while (--rows >= 0) + INIT_LIST_HEAD(&table[rows]); +} + +static inline void +ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + spin_lock(&dest->stats.lock); + dest->stats.ustats.inpkts++; + dest->stats.ustats.inbytes += skb->len; + spin_unlock(&dest->stats.lock); + + spin_lock(&dest->svc->stats.lock); + dest->svc->stats.ustats.inpkts++; + dest->svc->stats.ustats.inbytes += skb->len; + spin_unlock(&dest->svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.ustats.inpkts++; + ip_vs_stats.ustats.inbytes += skb->len; + spin_unlock(&ip_vs_stats.lock); + } +} + + +static inline void +ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + spin_lock(&dest->stats.lock); + dest->stats.ustats.outpkts++; + dest->stats.ustats.outbytes += skb->len; + spin_unlock(&dest->stats.lock); + + spin_lock(&dest->svc->stats.lock); + dest->svc->stats.ustats.outpkts++; + dest->svc->stats.ustats.outbytes += skb->len; + spin_unlock(&dest->svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.ustats.outpkts++; + ip_vs_stats.ustats.outbytes += skb->len; + spin_unlock(&ip_vs_stats.lock); + } +} + + +static inline void +ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) +{ + spin_lock(&cp->dest->stats.lock); + cp->dest->stats.ustats.conns++; + spin_unlock(&cp->dest->stats.lock); + + spin_lock(&svc->stats.lock); + svc->stats.ustats.conns++; + spin_unlock(&svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.ustats.conns++; + spin_unlock(&ip_vs_stats.lock); +} + + +static inline int +ip_vs_set_state(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + if (unlikely(!pp->state_transition)) + return 0; + return pp->state_transition(cp, direction, skb, pp); +} + + +/* + * IPVS persistent scheduling function + * It creates a connection entry according to its template if exists, + * or selects a server and creates a connection entry plus a template. + * Locking: we are svc user (svc->refcnt), so we hold all dests too + * Protocols supported: TCP, UDP + */ +static struct ip_vs_conn * +ip_vs_sched_persist(struct ip_vs_service *svc, + const struct sk_buff *skb, + __be16 ports[2]) +{ + struct ip_vs_conn *cp = NULL; + struct ip_vs_iphdr iph; + struct ip_vs_dest *dest; + struct ip_vs_conn *ct; + __be16 dport; /* destination port to forward */ + union nf_inet_addr snet; /* source network of the client, + after masking */ + + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + + /* Mask saddr with the netmask to adjust template granularity */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask); + else +#endif + snet.ip = iph.saddr.ip & svc->netmask; + + IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " + "mnet %s\n", + IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]), + IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]), + IP_VS_DBG_ADDR(svc->af, &snet)); + + /* + * As far as we know, FTP is a very complicated network protocol, and + * it uses control connection and data connections. For active FTP, + * FTP server initialize data connection to the client, its source port + * is often 20. For passive FTP, FTP server tells the clients the port + * that it passively listens to, and the client issues the data + * connection. In the tunneling or direct routing mode, the load + * balancer is on the client-to-server half of connection, the port + * number is unknown to the load balancer. So, a conn template like + * is created for persistent FTP + * service, and a template like + * is created for other persistent services. + */ + if (ports[1] == svc->port) { + /* Check if a template already exists */ + if (svc->port != FTPPORT) + ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, + &iph.daddr, ports[1]); + else + ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, + &iph.daddr, 0); + + if (!ct || !ip_vs_check_template(ct)) { + /* + * No template found or the dest of the connection + * template is not available. + */ + dest = svc->scheduler->schedule(svc, skb); + if (dest == NULL) { + IP_VS_DBG(1, "p-schedule: no dest found.\n"); + return NULL; + } + + /* + * Create a template like for non-ftp service, + * and + * for ftp service. + */ + if (svc->port != FTPPORT) + ct = ip_vs_conn_new(svc->af, iph.protocol, + &snet, 0, + &iph.daddr, + ports[1], + &dest->addr, dest->port, + IP_VS_CONN_F_TEMPLATE, + dest); + else + ct = ip_vs_conn_new(svc->af, iph.protocol, + &snet, 0, + &iph.daddr, 0, + &dest->addr, 0, + IP_VS_CONN_F_TEMPLATE, + dest); + if (ct == NULL) + return NULL; + + ct->timeout = svc->timeout; + } else { + /* set destination with the found template */ + dest = ct->dest; + } + dport = dest->port; + } else { + /* + * Note: persistent fwmark-based services and persistent + * port zero service are handled here. + * fwmark template: + * port zero template: + */ + if (svc->fwmark) { + union nf_inet_addr fwmark = { + .all = { 0, 0, 0, htonl(svc->fwmark) } + }; + + ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0, + &fwmark, 0); + } else + ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, + &iph.daddr, 0); + + if (!ct || !ip_vs_check_template(ct)) { + /* + * If it is not persistent port zero, return NULL, + * otherwise create a connection template. + */ + if (svc->port) + return NULL; + + dest = svc->scheduler->schedule(svc, skb); + if (dest == NULL) { + IP_VS_DBG(1, "p-schedule: no dest found.\n"); + return NULL; + } + + /* + * Create a template according to the service + */ + if (svc->fwmark) { + union nf_inet_addr fwmark = { + .all = { 0, 0, 0, htonl(svc->fwmark) } + }; + + ct = ip_vs_conn_new(svc->af, IPPROTO_IP, + &snet, 0, + &fwmark, 0, + &dest->addr, 0, + IP_VS_CONN_F_TEMPLATE, + dest); + } else + ct = ip_vs_conn_new(svc->af, iph.protocol, + &snet, 0, + &iph.daddr, 0, + &dest->addr, 0, + IP_VS_CONN_F_TEMPLATE, + dest); + if (ct == NULL) + return NULL; + + ct->timeout = svc->timeout; + } else { + /* set destination with the found template */ + dest = ct->dest; + } + dport = ports[1]; + } + + /* + * Create a new connection according to the template + */ + cp = ip_vs_conn_new(svc->af, iph.protocol, + &iph.saddr, ports[0], + &iph.daddr, ports[1], + &dest->addr, dport, + 0, + dest); + if (cp == NULL) { + ip_vs_conn_put(ct); + return NULL; + } + + /* + * Add its control + */ + ip_vs_control_add(cp, ct); + ip_vs_conn_put(ct); + + ip_vs_conn_stats(cp, svc); + return cp; +} + + +/* + * IPVS main scheduling function + * It selects a server according to the virtual service, and + * creates a connection entry. + * Protocols supported: TCP, UDP + */ +struct ip_vs_conn * +ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_conn *cp = NULL; + struct ip_vs_iphdr iph; + struct ip_vs_dest *dest; + __be16 _ports[2], *pptr; + + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + /* + * Persistent service + */ + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + return ip_vs_sched_persist(svc, skb, pptr); + + /* + * Non-persistent service + */ + if (!svc->fwmark && pptr[1] != svc->port) { + if (!svc->port) + IP_VS_ERR("Schedule: port zero only supported " + "in persistent services, " + "check your ipvs configuration\n"); + return NULL; + } + + dest = svc->scheduler->schedule(svc, skb); + if (dest == NULL) { + IP_VS_DBG(1, "Schedule: no dest found.\n"); + return NULL; + } + + /* + * Create a connection entry. + */ + cp = ip_vs_conn_new(svc->af, iph.protocol, + &iph.saddr, pptr[0], + &iph.daddr, pptr[1], + &dest->addr, dest->port ? dest->port : pptr[1], + 0, + dest); + if (cp == NULL) + return NULL; + + IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " + "d:%s:%u conn->flags:%X conn->refcnt:%d\n", + ip_vs_fwd_tag(cp), + IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport), + cp->flags, atomic_read(&cp->refcnt)); + + ip_vs_conn_stats(cp, svc); + return cp; +} + + +/* + * Pass or drop the packet. + * Called by ip_vs_in, when the virtual service is available but + * no destination is available for a new connection. + */ +int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + __be16 _ports[2], *pptr; + struct ip_vs_iphdr iph; + int unicast; + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + + pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); + if (pptr == NULL) { + ip_vs_service_put(svc); + return NF_DROP; + } + +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST; + else +#endif + unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST); + + /* if it is fwmark-based service, the cache_bypass sysctl is up + and the destination is a non-local unicast, then create + a cache_bypass connection entry */ + if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) { + int ret, cs; + struct ip_vs_conn *cp; + union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; + + ip_vs_service_put(svc); + + /* create a new connection entry */ + IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); + cp = ip_vs_conn_new(svc->af, iph.protocol, + &iph.saddr, pptr[0], + &iph.daddr, pptr[1], + &daddr, 0, + IP_VS_CONN_F_BYPASS, + NULL); + if (cp == NULL) + return NF_DROP; + + /* statistics */ + ip_vs_in_stats(cp, skb); + + /* set state */ + cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); + + /* transmit the first SYN packet */ + ret = cp->packet_xmit(skb, cp, pp); + /* do not touch skb anymore */ + + atomic_inc(&cp->in_pkts); + ip_vs_conn_put(cp); + return ret; + } + + /* + * When the virtual ftp service is presented, packets destined + * for other services on the VIP may get here (except services + * listed in the ipvs table), pass the packets, because it is + * not ipvs job to decide to drop the packets. + */ + if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) { + ip_vs_service_put(svc); + return NF_ACCEPT; + } + + ip_vs_service_put(svc); + + /* + * Notify the client that the destination is unreachable, and + * release the socket buffer. + * Since it is in IP layer, the TCP socket is not actually + * created, the TCP RST packet cannot be sent, instead that + * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ + */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, + skb->dev); + else +#endif + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + return NF_DROP; +} + + +/* + * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING + * chain, and is used for VS/NAT. + * It detects packets for VS/NAT connections and sends the packets + * immediately. This can avoid that iptable_nat mangles the packets + * for VS/NAT. + */ +static unsigned int ip_vs_post_routing(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + if (!skb->ipvs_property) + return NF_ACCEPT; + /* The packet was sent from IPVS, exit this chain */ + return NF_STOP; +} + +__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) +{ + return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); +} + +static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) +{ + int err = ip_defrag(skb, user); + + if (!err) + ip_send_check(ip_hdr(skb)); + + return err; +} + +#ifdef CONFIG_IP_VS_IPV6 +static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user) +{ + /* TODO IPv6: Find out what to do here for IPv6 */ + return 0; +} +#endif + +/* + * Packet has been made sufficiently writable in caller + * - inout: 1=in->out, 0=out->in + */ +void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int inout) +{ + struct iphdr *iph = ip_hdr(skb); + unsigned int icmp_offset = iph->ihl*4; + struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + + icmp_offset); + struct iphdr *ciph = (struct iphdr *)(icmph + 1); + + if (inout) { + iph->saddr = cp->vaddr.ip; + ip_send_check(iph); + ciph->daddr = cp->vaddr.ip; + ip_send_check(ciph); + } else { + iph->daddr = cp->daddr.ip; + ip_send_check(iph); + ciph->saddr = cp->daddr.ip; + ip_send_check(ciph); + } + + /* the TCP/UDP port */ + if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) { + __be16 *ports = (void *)ciph + ciph->ihl*4; + + if (inout) + ports[1] = cp->vport; + else + ports[0] = cp->dport; + } + + /* And finally the ICMP checksum */ + icmph->checksum = 0; + icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (inout) + IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMP"); + else + IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMP"); +} + +#ifdef CONFIG_IP_VS_IPV6 +void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int inout) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + unsigned int icmp_offset = sizeof(struct ipv6hdr); + struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) + + icmp_offset); + struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1); + + if (inout) { + iph->saddr = cp->vaddr.in6; + ciph->daddr = cp->vaddr.in6; + } else { + iph->daddr = cp->daddr.in6; + ciph->saddr = cp->daddr.in6; + } + + /* the TCP/UDP port */ + if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) { + __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr); + + if (inout) + ports[1] = cp->vport; + else + ports[0] = cp->dport; + } + + /* And finally the ICMP checksum */ + icmph->icmp6_cksum = 0; + /* TODO IPv6: is this correct for ICMPv6? */ + ip_vs_checksum_complete(skb, icmp_offset); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (inout) + IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMPv6"); + else + IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMPv6"); +} +#endif + +/* Handle relevant response ICMP messages - forward to the right + * destination host. Used for NAT and local client. + */ +static int handle_response_icmp(int af, struct sk_buff *skb, + union nf_inet_addr *snet, + __u8 protocol, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, + unsigned int offset, unsigned int ihl) +{ + unsigned int verdict = NF_DROP; + + if (IP_VS_FWD_METHOD(cp) != 0) { + IP_VS_ERR("shouldn't reach here, because the box is on the " + "half connection in the tun/dr module.\n"); + } + + /* Ensure the checksum is correct */ + if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { + /* Failed checksum! */ + IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", + IP_VS_DBG_ADDR(af, snet)); + goto out; + } + + if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol) + offset += 2 * sizeof(__u16); + if (!skb_make_writable(skb, offset)) + goto out; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_nat_icmp_v6(skb, pp, cp, 1); + else +#endif + ip_vs_nat_icmp(skb, pp, cp, 1); + + /* do the statistics and put it back */ + ip_vs_out_stats(cp, skb); + + skb->ipvs_property = 1; + verdict = NF_ACCEPT; + +out: + __ip_vs_conn_put(cp); + + return verdict; +} + +/* + * Handle ICMP messages in the inside-to-outside direction (outgoing). + * Find any that might be relevant, check against existing connections. + * Currently handles error types - unreachable, quench, ttl exceeded. + */ +static int ip_vs_out_icmp(struct sk_buff *skb, int *related) +{ + struct iphdr *iph; + struct icmphdr _icmph, *ic; + struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset, ihl; + union nf_inet_addr snet; + + *related = 1; + + /* reassemble IP fragments */ + if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { + if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) + return NF_STOLEN; + } + + iph = ip_hdr(skb); + offset = ihl = iph->ihl * 4; + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", + ic->type, ntohs(icmp_id(ic)), + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->type != ICMP_DEST_UNREACH) && + (ic->type != ICMP_SOURCE_QUENCH) && + (ic->type != ICMP_TIME_EXCEEDED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->protocol); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + if (unlikely(cih->frag_off & htons(IP_OFFSET) && + pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for"); + + offset += cih->ihl * 4; + + ip_vs_fill_iphdr(AF_INET, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); + if (!cp) + return NF_ACCEPT; + + snet.ip = iph->saddr; + return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, + pp, offset, ihl); +} + +#ifdef CONFIG_IP_VS_IPV6 +static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) +{ + struct ipv6hdr *iph; + struct icmp6hdr _icmph, *ic; + struct ipv6hdr _ciph, *cih; /* The ip header contained + within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset; + union nf_inet_addr snet; + + *related = 1; + + /* reassemble IP fragments */ + if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { + if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT)) + return NF_STOLEN; + } + + iph = ipv6_hdr(skb); + offset = sizeof(struct ipv6hdr); + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + NIP6(iph->saddr), NIP6(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && + (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && + (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->nexthdr); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + /* TODO: we don't support fragmentation at the moment anyways */ + if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for"); + + offset += sizeof(struct ipv6hdr); + + ip_vs_fill_iphdr(AF_INET6, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); + if (!cp) + return NF_ACCEPT; + + ipv6_addr_copy(&snet.in6, &iph->saddr); + return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp, + pp, offset, sizeof(struct ipv6hdr)); +} +#endif + +static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) +{ + struct tcphdr _tcph, *th; + + th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph); + if (th == NULL) + return 0; + return th->rst; +} + +/* Handle response packets: rewrite addresses and send away... + * Used for NAT and local client. + */ +static unsigned int +handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int ihl) +{ + IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); + + if (!skb_make_writable(skb, ihl)) + goto drop; + + /* mangle the packet */ + if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) + goto drop; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ipv6_hdr(skb)->saddr = cp->vaddr.in6; + else +#endif + { + ip_hdr(skb)->saddr = cp->vaddr.ip; + ip_send_check(ip_hdr(skb)); + } + + /* For policy routing, packets originating from this + * machine itself may be routed differently to packets + * passing through. We want this packet to be routed as + * if it came from this machine itself. So re-compute + * the routing information. + */ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (ip6_route_me_harder(skb) != 0) + goto drop; + } else +#endif + if (ip_route_me_harder(skb, RTN_LOCAL) != 0) + goto drop; + + IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); + + ip_vs_out_stats(cp, skb); + ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); + ip_vs_conn_put(cp); + + skb->ipvs_property = 1; + + LeaveFunction(11); + return NF_ACCEPT; + +drop: + ip_vs_conn_put(cp); + kfree_skb(skb); + return NF_STOLEN; +} + +/* + * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. + * Check if outgoing packet belongs to the established ip_vs_conn. + */ +static unsigned int +ip_vs_out(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct ip_vs_iphdr iph; + struct ip_vs_protocol *pp; + struct ip_vs_conn *cp; + int af; + + EnterFunction(11); + + af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; + + if (skb->ipvs_property) + return NF_ACCEPT; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { + int related, verdict = ip_vs_out_icmp_v6(skb, &related); + + if (related) + return verdict; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + } else +#endif + if (unlikely(iph.protocol == IPPROTO_ICMP)) { + int related, verdict = ip_vs_out_icmp(skb, &related); + + if (related) + return verdict; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + + pp = ip_vs_proto_get(iph.protocol); + if (unlikely(!pp)) + return NF_ACCEPT; + + /* reassemble IP fragments */ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { + int related, verdict = ip_vs_out_icmp_v6(skb, &related); + + if (related) + return verdict; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + } else +#endif + if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && + !pp->dont_defrag)) { + if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) + return NF_STOLEN; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + + /* + * Check if the packet belongs to an existing entry + */ + cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); + + if (unlikely(!cp)) { + if (sysctl_ip_vs_nat_icmp_send && + (pp->protocol == IPPROTO_TCP || + pp->protocol == IPPROTO_UDP)) { + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, iph.len, + sizeof(_ports), _ports); + if (pptr == NULL) + return NF_ACCEPT; /* Not for me */ + if (ip_vs_lookup_real_service(af, iph.protocol, + &iph.saddr, + pptr[0])) { + /* + * Notify the real server: there is no + * existing entry if it is not RST + * packet or not TCP packet. + */ + if (iph.protocol != IPPROTO_TCP + || !is_tcp_reset(skb, iph.len)) { +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + icmpv6_send(skb, + ICMPV6_DEST_UNREACH, + ICMPV6_PORT_UNREACH, + 0, skb->dev); + else +#endif + icmp_send(skb, + ICMP_DEST_UNREACH, + ICMP_PORT_UNREACH, 0); + return NF_DROP; + } + } + } + IP_VS_DBG_PKT(12, pp, skb, 0, + "packet continues traversal as normal"); + return NF_ACCEPT; + } + + return handle_response(af, skb, pp, cp, iph.len); +} + + +/* + * Handle ICMP messages in the outside-to-inside direction (incoming). + * Find any that might be relevant, check against existing connections, + * forward to the right destination host if relevant. + * Currently handles error types - unreachable, quench, ttl exceeded. + */ +static int +ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) +{ + struct iphdr *iph; + struct icmphdr _icmph, *ic; + struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset, ihl, verdict; + union nf_inet_addr snet; + + *related = 1; + + /* reassemble IP fragments */ + if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { + if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ? + IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD)) + return NF_STOLEN; + } + + iph = ip_hdr(skb); + offset = ihl = iph->ihl * 4; + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", + ic->type, ntohs(icmp_id(ic)), + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->type != ICMP_DEST_UNREACH) && + (ic->type != ICMP_SOURCE_QUENCH) && + (ic->type != ICMP_TIME_EXCEEDED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->protocol); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + if (unlikely(cih->frag_off & htons(IP_OFFSET) && + pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for"); + + offset += cih->ihl * 4; + + ip_vs_fill_iphdr(AF_INET, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1); + if (!cp) { + /* The packet could also belong to a local client */ + cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); + if (cp) { + snet.ip = iph->saddr; + return handle_response_icmp(AF_INET, skb, &snet, + cih->protocol, cp, pp, + offset, ihl); + } + return NF_ACCEPT; + } + + verdict = NF_DROP; + + /* Ensure the checksum is correct */ + if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { + /* Failed checksum! */ + IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n", + NIPQUAD(iph->saddr)); + goto out; + } + + /* do the statistics and put it back */ + ip_vs_in_stats(cp, skb); + if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) + offset += 2 * sizeof(__u16); + verdict = ip_vs_icmp_xmit(skb, cp, pp, offset); + /* do not touch skb anymore */ + + out: + __ip_vs_conn_put(cp); + + return verdict; +} + +#ifdef CONFIG_IP_VS_IPV6 +static int +ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) +{ + struct ipv6hdr *iph; + struct icmp6hdr _icmph, *ic; + struct ipv6hdr _ciph, *cih; /* The ip header contained + within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset, verdict; + union nf_inet_addr snet; + + *related = 1; + + /* reassemble IP fragments */ + if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { + if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ? + IP_DEFRAG_VS_IN : + IP_DEFRAG_VS_FWD)) + return NF_STOLEN; + } + + iph = ipv6_hdr(skb); + offset = sizeof(struct ipv6hdr); + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + NIP6(iph->saddr), NIP6(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && + (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && + (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->nexthdr); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + /* TODO: we don't support fragmentation at the moment anyways */ + if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for"); + + offset += sizeof(struct ipv6hdr); + + ip_vs_fill_iphdr(AF_INET6, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1); + if (!cp) { + /* The packet could also belong to a local client */ + cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); + if (cp) { + ipv6_addr_copy(&snet.in6, &iph->saddr); + return handle_response_icmp(AF_INET6, skb, &snet, + cih->nexthdr, + cp, pp, offset, + sizeof(struct ipv6hdr)); + } + return NF_ACCEPT; + } + + verdict = NF_DROP; + + /* do the statistics and put it back */ + ip_vs_in_stats(cp, skb); + if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr) + offset += 2 * sizeof(__u16); + verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset); + /* do not touch skb anymore */ + + __ip_vs_conn_put(cp); + + return verdict; +} +#endif + + +/* + * Check if it's for virtual services, look it up, + * and send it on its way... + */ +static unsigned int +ip_vs_in(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct ip_vs_iphdr iph; + struct ip_vs_protocol *pp; + struct ip_vs_conn *cp; + int ret, restart, af; + + af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + + /* + * Big tappo: only PACKET_HOST, including loopback for local client + * Don't handle local packets on IPv6 for now + */ + if (unlikely(skb->pkt_type != PACKET_HOST)) { + IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n", + skb->pkt_type, + iph.protocol, + IP_VS_DBG_ADDR(af, &iph.daddr)); + return NF_ACCEPT; + } + + if (unlikely(iph.protocol == IPPROTO_ICMP)) { + int related, verdict = ip_vs_in_icmp(skb, &related, hooknum); + + if (related) + return verdict; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + + /* Protocol supported? */ + pp = ip_vs_proto_get(iph.protocol); + if (unlikely(!pp)) + return NF_ACCEPT; + + /* + * Check if the packet belongs to an existing connection entry + */ + cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0); + + if (unlikely(!cp)) { + int v; + + /* For local client packets, it could be a response */ + cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); + if (cp) + return handle_response(af, skb, pp, cp, iph.len); + + if (!pp->conn_schedule(af, skb, pp, &v, &cp)) + return v; + } + + if (unlikely(!cp)) { + /* sorry, all this trouble for a no-hit :) */ + IP_VS_DBG_PKT(12, pp, skb, 0, + "packet continues traversal as normal"); + return NF_ACCEPT; + } + + IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet"); + + /* Check the server status */ + if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { + /* the destination server is not available */ + + if (sysctl_ip_vs_expire_nodest_conn) { + /* try to expire the connection immediately */ + ip_vs_conn_expire_now(cp); + } + /* don't restart its timer, and silently + drop the packet. */ + __ip_vs_conn_put(cp); + return NF_DROP; + } + + ip_vs_in_stats(cp, skb); + restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); + if (cp->packet_xmit) + ret = cp->packet_xmit(skb, cp, pp); + /* do not touch skb anymore */ + else { + IP_VS_DBG_RL("warning: packet_xmit is null"); + ret = NF_ACCEPT; + } + + /* Increase its packet counter and check if it is needed + * to be synchronized + * + * Sync connection if it is about to close to + * encorage the standby servers to update the connections timeout + */ + atomic_inc(&cp->in_pkts); + if (af == AF_INET && + (ip_vs_sync_state & IP_VS_STATE_MASTER) && + (((cp->protocol != IPPROTO_TCP || + cp->state == IP_VS_TCP_S_ESTABLISHED) && + (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] + == sysctl_ip_vs_sync_threshold[0])) || + ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && + ((cp->state == IP_VS_TCP_S_FIN_WAIT) || + (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || + (cp->state == IP_VS_TCP_S_TIME_WAIT))))) + ip_vs_sync_conn(cp); + cp->old_state = cp->state; + + ip_vs_conn_put(cp); + return ret; +} + + +/* + * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP + * related packets destined for 0.0.0.0/0. + * When fwmark-based virtual service is used, such as transparent + * cache cluster, TCP packets can be marked and routed to ip_vs_in, + * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and + * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain + * and send them to ip_vs_in_icmp. + */ +static unsigned int +ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + int r; + + if (ip_hdr(skb)->protocol != IPPROTO_ICMP) + return NF_ACCEPT; + + return ip_vs_in_icmp(skb, &r, hooknum); +} + +#ifdef CONFIG_IP_VS_IPV6 +static unsigned int +ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + int r; + + if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) + return NF_ACCEPT; + + return ip_vs_in_icmp_v6(skb, &r, hooknum); +} +#endif + + +static struct nf_hook_ops ip_vs_ops[] __read_mostly = { + /* After packet filtering, forward packet through VS/DR, VS/TUN, + * or VS/NAT(change destination), so that filtering rules can be + * applied to IPVS. */ + { + .hook = ip_vs_in, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_IN, + .priority = 100, + }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_out, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, + /* After packet filtering (but before ip_vs_out_icmp), catch icmp + * destined for 0.0.0.0/0, which is for incoming IPVS connections */ + { + .hook = ip_vs_forward_icmp, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_FORWARD, + .priority = 99, + }, + /* Before the netfilter connection tracking, exit from POST_ROUTING */ + { + .hook = ip_vs_post_routing, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_NAT_SRC-1, + }, +#ifdef CONFIG_IP_VS_IPV6 + /* After packet filtering, forward packet through VS/DR, VS/TUN, + * or VS/NAT(change destination), so that filtering rules can be + * applied to IPVS. */ + { + .hook = ip_vs_in, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_LOCAL_IN, + .priority = 100, + }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_out, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, + /* After packet filtering (but before ip_vs_out_icmp), catch icmp + * destined for 0.0.0.0/0, which is for incoming IPVS connections */ + { + .hook = ip_vs_forward_icmp_v6, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_FORWARD, + .priority = 99, + }, + /* Before the netfilter connection tracking, exit from POST_ROUTING */ + { + .hook = ip_vs_post_routing, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_NAT_SRC-1, + }, +#endif +}; + + +/* + * Initialize IP Virtual Server + */ +static int __init ip_vs_init(void) +{ + int ret; + + ip_vs_estimator_init(); + + ret = ip_vs_control_init(); + if (ret < 0) { + IP_VS_ERR("can't setup control.\n"); + goto cleanup_estimator; + } + + ip_vs_protocol_init(); + + ret = ip_vs_app_init(); + if (ret < 0) { + IP_VS_ERR("can't setup application helper.\n"); + goto cleanup_protocol; + } + + ret = ip_vs_conn_init(); + if (ret < 0) { + IP_VS_ERR("can't setup connection table.\n"); + goto cleanup_app; + } + + ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + if (ret < 0) { + IP_VS_ERR("can't register hooks.\n"); + goto cleanup_conn; + } + + IP_VS_INFO("ipvs loaded.\n"); + return ret; + + cleanup_conn: + ip_vs_conn_cleanup(); + cleanup_app: + ip_vs_app_cleanup(); + cleanup_protocol: + ip_vs_protocol_cleanup(); + ip_vs_control_cleanup(); + cleanup_estimator: + ip_vs_estimator_cleanup(); + return ret; +} + +static void __exit ip_vs_cleanup(void) +{ + nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + ip_vs_conn_cleanup(); + ip_vs_app_cleanup(); + ip_vs_protocol_cleanup(); + ip_vs_control_cleanup(); + ip_vs_estimator_cleanup(); + IP_VS_INFO("ipvs unloaded.\n"); +} + +module_init(ip_vs_init); +module_exit(ip_vs_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c new file mode 100644 index 0000000..0302cf3 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -0,0 +1,3443 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the NetFilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * Peter Kese + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#ifdef CONFIG_IP_VS_IPV6 +#include +#include +#endif +#include +#include +#include + +#include + +#include + +/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ +static DEFINE_MUTEX(__ip_vs_mutex); + +/* lock for service table */ +static DEFINE_RWLOCK(__ip_vs_svc_lock); + +/* lock for table with the real services */ +static DEFINE_RWLOCK(__ip_vs_rs_lock); + +/* lock for state and timeout tables */ +static DEFINE_RWLOCK(__ip_vs_securetcp_lock); + +/* lock for drop entry handling */ +static DEFINE_SPINLOCK(__ip_vs_dropentry_lock); + +/* lock for drop packet handling */ +static DEFINE_SPINLOCK(__ip_vs_droppacket_lock); + +/* 1/rate drop and drop-entry variables */ +int ip_vs_drop_rate = 0; +int ip_vs_drop_counter = 0; +static atomic_t ip_vs_dropentry = ATOMIC_INIT(0); + +/* number of virtual services */ +static int ip_vs_num_services = 0; + +/* sysctl variables */ +static int sysctl_ip_vs_drop_entry = 0; +static int sysctl_ip_vs_drop_packet = 0; +static int sysctl_ip_vs_secure_tcp = 0; +static int sysctl_ip_vs_amemthresh = 1024; +static int sysctl_ip_vs_am_droprate = 10; +int sysctl_ip_vs_cache_bypass = 0; +int sysctl_ip_vs_expire_nodest_conn = 0; +int sysctl_ip_vs_expire_quiescent_template = 0; +int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; +int sysctl_ip_vs_nat_icmp_send = 0; + + +#ifdef CONFIG_IP_VS_DEBUG +static int sysctl_ip_vs_debug_level = 0; + +int ip_vs_get_debug_level(void) +{ + return sysctl_ip_vs_debug_level; +} +#endif + +#ifdef CONFIG_IP_VS_IPV6 +/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ +static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr) +{ + struct rt6_info *rt; + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = *addr, + .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, + }; + + rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); + if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK)) + return 1; + + return 0; +} +#endif +/* + * update_defense_level is called from keventd and from sysctl, + * so it needs to protect itself from softirqs + */ +static void update_defense_level(void) +{ + struct sysinfo i; + static int old_secure_tcp = 0; + int availmem; + int nomem; + int to_change = -1; + + /* we only count free and buffered memory (in pages) */ + si_meminfo(&i); + availmem = i.freeram + i.bufferram; + /* however in linux 2.5 the i.bufferram is total page cache size, + we need adjust it */ + /* si_swapinfo(&i); */ + /* availmem = availmem - (i.totalswap - i.freeswap); */ + + nomem = (availmem < sysctl_ip_vs_amemthresh); + + local_bh_disable(); + + /* drop_entry */ + spin_lock(&__ip_vs_dropentry_lock); + switch (sysctl_ip_vs_drop_entry) { + case 0: + atomic_set(&ip_vs_dropentry, 0); + break; + case 1: + if (nomem) { + atomic_set(&ip_vs_dropentry, 1); + sysctl_ip_vs_drop_entry = 2; + } else { + atomic_set(&ip_vs_dropentry, 0); + } + break; + case 2: + if (nomem) { + atomic_set(&ip_vs_dropentry, 1); + } else { + atomic_set(&ip_vs_dropentry, 0); + sysctl_ip_vs_drop_entry = 1; + }; + break; + case 3: + atomic_set(&ip_vs_dropentry, 1); + break; + } + spin_unlock(&__ip_vs_dropentry_lock); + + /* drop_packet */ + spin_lock(&__ip_vs_droppacket_lock); + switch (sysctl_ip_vs_drop_packet) { + case 0: + ip_vs_drop_rate = 0; + break; + case 1: + if (nomem) { + ip_vs_drop_rate = ip_vs_drop_counter + = sysctl_ip_vs_amemthresh / + (sysctl_ip_vs_amemthresh-availmem); + sysctl_ip_vs_drop_packet = 2; + } else { + ip_vs_drop_rate = 0; + } + break; + case 2: + if (nomem) { + ip_vs_drop_rate = ip_vs_drop_counter + = sysctl_ip_vs_amemthresh / + (sysctl_ip_vs_amemthresh-availmem); + } else { + ip_vs_drop_rate = 0; + sysctl_ip_vs_drop_packet = 1; + } + break; + case 3: + ip_vs_drop_rate = sysctl_ip_vs_am_droprate; + break; + } + spin_unlock(&__ip_vs_droppacket_lock); + + /* secure_tcp */ + write_lock(&__ip_vs_securetcp_lock); + switch (sysctl_ip_vs_secure_tcp) { + case 0: + if (old_secure_tcp >= 2) + to_change = 0; + break; + case 1: + if (nomem) { + if (old_secure_tcp < 2) + to_change = 1; + sysctl_ip_vs_secure_tcp = 2; + } else { + if (old_secure_tcp >= 2) + to_change = 0; + } + break; + case 2: + if (nomem) { + if (old_secure_tcp < 2) + to_change = 1; + } else { + if (old_secure_tcp >= 2) + to_change = 0; + sysctl_ip_vs_secure_tcp = 1; + } + break; + case 3: + if (old_secure_tcp < 2) + to_change = 1; + break; + } + old_secure_tcp = sysctl_ip_vs_secure_tcp; + if (to_change >= 0) + ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); + write_unlock(&__ip_vs_securetcp_lock); + + local_bh_enable(); +} + + +/* + * Timer for checking the defense + */ +#define DEFENSE_TIMER_PERIOD 1*HZ +static void defense_work_handler(struct work_struct *work); +static DECLARE_DELAYED_WORK(defense_work, defense_work_handler); + +static void defense_work_handler(struct work_struct *work) +{ + update_defense_level(); + if (atomic_read(&ip_vs_dropentry)) + ip_vs_random_dropentry(); + + schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); +} + +int +ip_vs_use_count_inc(void) +{ + return try_module_get(THIS_MODULE); +} + +void +ip_vs_use_count_dec(void) +{ + module_put(THIS_MODULE); +} + + +/* + * Hash table: for virtual service lookups + */ +#define IP_VS_SVC_TAB_BITS 8 +#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) +#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) + +/* the service table hashed by */ +static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; +/* the service table hashed by fwmark */ +static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; + +/* + * Hash table: for real service lookups + */ +#define IP_VS_RTAB_BITS 4 +#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) +#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) + +static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE]; + +/* + * Trash for destinations + */ +static LIST_HEAD(ip_vs_dest_trash); + +/* + * FTP & NULL virtual service counters + */ +static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0); +static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); + + +/* + * Returns hash value for virtual service + */ +static __inline__ unsigned +ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr, + __be16 port) +{ + register unsigned porth = ntohs(port); + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + + return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth) + & IP_VS_SVC_TAB_MASK; +} + +/* + * Returns hash value of fwmark for virtual service lookup + */ +static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) +{ + return fwmark & IP_VS_SVC_TAB_MASK; +} + +/* + * Hashes a service in the ip_vs_svc_table by + * or in the ip_vs_svc_fwm_table by fwmark. + * Should be called with locked tables. + */ +static int ip_vs_svc_hash(struct ip_vs_service *svc) +{ + unsigned hash; + + if (svc->flags & IP_VS_SVC_F_HASHED) { + IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* + * Hash it by in ip_vs_svc_table + */ + hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr, + svc->port); + list_add(&svc->s_list, &ip_vs_svc_table[hash]); + } else { + /* + * Hash it by fwmark in ip_vs_svc_fwm_table + */ + hash = ip_vs_svc_fwm_hashkey(svc->fwmark); + list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); + } + + svc->flags |= IP_VS_SVC_F_HASHED; + /* increase its refcnt because it is referenced by the svc table */ + atomic_inc(&svc->refcnt); + return 1; +} + + +/* + * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table. + * Should be called with locked tables. + */ +static int ip_vs_svc_unhash(struct ip_vs_service *svc) +{ + if (!(svc->flags & IP_VS_SVC_F_HASHED)) { + IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* Remove it from the ip_vs_svc_table table */ + list_del(&svc->s_list); + } else { + /* Remove it from the ip_vs_svc_fwm_table table */ + list_del(&svc->f_list); + } + + svc->flags &= ~IP_VS_SVC_F_HASHED; + atomic_dec(&svc->refcnt); + return 1; +} + + +/* + * Get service by {proto,addr,port} in the service table. + */ +static inline struct ip_vs_service * +__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr, + __be16 vport) +{ + unsigned hash; + struct ip_vs_service *svc; + + /* Check for "full" addressed entries */ + hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport); + + list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ + if ((svc->af == af) + && ip_vs_addr_equal(af, &svc->addr, vaddr) + && (svc->port == vport) + && (svc->protocol == protocol)) { + /* HIT */ + atomic_inc(&svc->usecnt); + return svc; + } + } + + return NULL; +} + + +/* + * Get service by {fwmark} in the service table. + */ +static inline struct ip_vs_service * +__ip_vs_svc_fwm_get(int af, __u32 fwmark) +{ + unsigned hash; + struct ip_vs_service *svc; + + /* Check for fwmark addressed entries */ + hash = ip_vs_svc_fwm_hashkey(fwmark); + + list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { + if (svc->fwmark == fwmark && svc->af == af) { + /* HIT */ + atomic_inc(&svc->usecnt); + return svc; + } + } + + return NULL; +} + +struct ip_vs_service * +ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport) +{ + struct ip_vs_service *svc; + + read_lock(&__ip_vs_svc_lock); + + /* + * Check the table hashed by fwmark first + */ + if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark))) + goto out; + + /* + * Check the table hashed by + * for "full" addressed entries + */ + svc = __ip_vs_service_get(af, protocol, vaddr, vport); + + if (svc == NULL + && protocol == IPPROTO_TCP + && atomic_read(&ip_vs_ftpsvc_counter) + && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { + /* + * Check if ftp service entry exists, the packet + * might belong to FTP data connections. + */ + svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT); + } + + if (svc == NULL + && atomic_read(&ip_vs_nullsvc_counter)) { + /* + * Check if the catch-all port (port zero) exists + */ + svc = __ip_vs_service_get(af, protocol, vaddr, 0); + } + + out: + read_unlock(&__ip_vs_svc_lock); + + IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", + fwmark, ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), + svc ? "hit" : "not hit"); + + return svc; +} + + +static inline void +__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + atomic_inc(&svc->refcnt); + dest->svc = svc; +} + +static inline void +__ip_vs_unbind_svc(struct ip_vs_dest *dest) +{ + struct ip_vs_service *svc = dest->svc; + + dest->svc = NULL; + if (atomic_dec_and_test(&svc->refcnt)) + kfree(svc); +} + + +/* + * Returns hash value for real service + */ +static inline unsigned ip_vs_rs_hashkey(int af, + const union nf_inet_addr *addr, + __be16 port) +{ + register unsigned porth = ntohs(port); + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + + return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth) + & IP_VS_RTAB_MASK; +} + +/* + * Hashes ip_vs_dest in ip_vs_rtable by . + * should be called with locked tables. + */ +static int ip_vs_rs_hash(struct ip_vs_dest *dest) +{ + unsigned hash; + + if (!list_empty(&dest->d_list)) { + return 0; + } + + /* + * Hash by proto,addr,port, + * which are the parameters of the real service. + */ + hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); + + list_add(&dest->d_list, &ip_vs_rtable[hash]); + + return 1; +} + +/* + * UNhashes ip_vs_dest from ip_vs_rtable. + * should be called with locked tables. + */ +static int ip_vs_rs_unhash(struct ip_vs_dest *dest) +{ + /* + * Remove it from the ip_vs_rtable table. + */ + if (!list_empty(&dest->d_list)) { + list_del(&dest->d_list); + INIT_LIST_HEAD(&dest->d_list); + } + + return 1; +} + +/* + * Lookup real service by in the real service table. + */ +struct ip_vs_dest * +ip_vs_lookup_real_service(int af, __u16 protocol, + const union nf_inet_addr *daddr, + __be16 dport) +{ + unsigned hash; + struct ip_vs_dest *dest; + + /* + * Check for "full" addressed entries + * Return the first found entry + */ + hash = ip_vs_rs_hashkey(af, daddr, dport); + + read_lock(&__ip_vs_rs_lock); + list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { + if ((dest->af == af) + && ip_vs_addr_equal(af, &dest->addr, daddr) + && (dest->port == dport) + && ((dest->protocol == protocol) || + dest->vfwmark)) { + /* HIT */ + read_unlock(&__ip_vs_rs_lock); + return dest; + } + } + read_unlock(&__ip_vs_rs_lock); + + return NULL; +} + +/* + * Lookup destination by {addr,port} in the given service + */ +static struct ip_vs_dest * +ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, + __be16 dport) +{ + struct ip_vs_dest *dest; + + /* + * Find the destination for the given service + */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if ((dest->af == svc->af) + && ip_vs_addr_equal(svc->af, &dest->addr, daddr) + && (dest->port == dport)) { + /* HIT */ + return dest; + } + } + + return NULL; +} + +/* + * Find destination by {daddr,dport,vaddr,protocol} + * Cretaed to be used in ip_vs_process_message() in + * the backup synchronization daemon. It finds the + * destination to be bound to the received connection + * on the backup. + * + * ip_vs_lookup_real_service() looked promissing, but + * seems not working as expected. + */ +struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr, + __be16 dport, + const union nf_inet_addr *vaddr, + __be16 vport, __u16 protocol) +{ + struct ip_vs_dest *dest; + struct ip_vs_service *svc; + + svc = ip_vs_service_get(af, 0, protocol, vaddr, vport); + if (!svc) + return NULL; + dest = ip_vs_lookup_dest(svc, daddr, dport); + if (dest) + atomic_inc(&dest->refcnt); + ip_vs_service_put(svc); + return dest; +} + +/* + * Lookup dest by {svc,addr,port} in the destination trash. + * The destination trash is used to hold the destinations that are removed + * from the service table but are still referenced by some conn entries. + * The reason to add the destination trash is when the dest is temporary + * down (either by administrator or by monitor program), the dest can be + * picked back from the trash, the remaining connections to the dest can + * continue, and the counting information of the dest is also useful for + * scheduling. + */ +static struct ip_vs_dest * +ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, + __be16 dport) +{ + struct ip_vs_dest *dest, *nxt; + + /* + * Find the destination in trash + */ + list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { + IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " + "dest->refcnt=%d\n", + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); + if (dest->af == svc->af && + ip_vs_addr_equal(svc->af, &dest->addr, daddr) && + dest->port == dport && + dest->vfwmark == svc->fwmark && + dest->protocol == svc->protocol && + (svc->fwmark || + (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && + dest->vport == svc->port))) { + /* HIT */ + return dest; + } + + /* + * Try to purge the destination from trash if not referenced + */ + if (atomic_read(&dest->refcnt) == 1) { + IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u " + "from trash\n", + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port)); + list_del(&dest->n_list); + ip_vs_dst_reset(dest); + __ip_vs_unbind_svc(dest); + kfree(dest); + } + } + + return NULL; +} + + +/* + * Clean up all the destinations in the trash + * Called by the ip_vs_control_cleanup() + * + * When the ip_vs_control_clearup is activated by ipvs module exit, + * the service tables must have been flushed and all the connections + * are expired, and the refcnt of each destination in the trash must + * be 1, so we simply release them here. + */ +static void ip_vs_trash_cleanup(void) +{ + struct ip_vs_dest *dest, *nxt; + + list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { + list_del(&dest->n_list); + ip_vs_dst_reset(dest); + __ip_vs_unbind_svc(dest); + kfree(dest); + } +} + + +static void +ip_vs_zero_stats(struct ip_vs_stats *stats) +{ + spin_lock_bh(&stats->lock); + + memset(&stats->ustats, 0, sizeof(stats->ustats)); + ip_vs_zero_estimator(stats); + + spin_unlock_bh(&stats->lock); +} + +/* + * Update a destination in the given service + */ +static void +__ip_vs_update_dest(struct ip_vs_service *svc, + struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest) +{ + int conn_flags; + + /* set the weight and the flags */ + atomic_set(&dest->weight, udest->weight); + conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; + + /* check if local node and update the flags */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) { + if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) { + conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) + | IP_VS_CONN_F_LOCALNODE; + } + } else +#endif + if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) { + conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) + | IP_VS_CONN_F_LOCALNODE; + } + + /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ + if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) { + conn_flags |= IP_VS_CONN_F_NOOUTPUT; + } else { + /* + * Put the real service in ip_vs_rtable if not present. + * For now only for NAT! + */ + write_lock_bh(&__ip_vs_rs_lock); + ip_vs_rs_hash(dest); + write_unlock_bh(&__ip_vs_rs_lock); + } + atomic_set(&dest->conn_flags, conn_flags); + + /* bind the service */ + if (!dest->svc) { + __ip_vs_bind_svc(dest, svc); + } else { + if (dest->svc != svc) { + __ip_vs_unbind_svc(dest); + ip_vs_zero_stats(&dest->stats); + __ip_vs_bind_svc(dest, svc); + } + } + + /* set the dest status flags */ + dest->flags |= IP_VS_DEST_F_AVAILABLE; + + if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + dest->u_threshold = udest->u_threshold; + dest->l_threshold = udest->l_threshold; +} + + +/* + * Create a destination for the given service + */ +static int +ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, + struct ip_vs_dest **dest_p) +{ + struct ip_vs_dest *dest; + unsigned atype; + + EnterFunction(2); + +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) { + atype = ipv6_addr_type(&udest->addr.in6); + if ((!(atype & IPV6_ADDR_UNICAST) || + atype & IPV6_ADDR_LINKLOCAL) && + !__ip_vs_addr_is_local_v6(&udest->addr.in6)) + return -EINVAL; + } else +#endif + { + atype = inet_addr_type(&init_net, udest->addr.ip); + if (atype != RTN_LOCAL && atype != RTN_UNICAST) + return -EINVAL; + } + + dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); + if (dest == NULL) { + IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n"); + return -ENOMEM; + } + + dest->af = svc->af; + dest->protocol = svc->protocol; + dest->vaddr = svc->addr; + dest->vport = svc->port; + dest->vfwmark = svc->fwmark; + ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr); + dest->port = udest->port; + + atomic_set(&dest->activeconns, 0); + atomic_set(&dest->inactconns, 0); + atomic_set(&dest->persistconns, 0); + atomic_set(&dest->refcnt, 0); + + INIT_LIST_HEAD(&dest->d_list); + spin_lock_init(&dest->dst_lock); + spin_lock_init(&dest->stats.lock); + __ip_vs_update_dest(svc, dest, udest); + ip_vs_new_estimator(&dest->stats); + + *dest_p = dest; + + LeaveFunction(2); + return 0; +} + + +/* + * Add a destination into an existing service + */ +static int +ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + union nf_inet_addr daddr; + __be16 dport = udest->port; + int ret; + + EnterFunction(2); + + if (udest->weight < 0) { + IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n"); + return -ERANGE; + } + + if (udest->l_threshold > udest->u_threshold) { + IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than " + "upper threshold\n"); + return -ERANGE; + } + + ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + + /* + * Check if the dest already exists in the list + */ + dest = ip_vs_lookup_dest(svc, &daddr, dport); + + if (dest != NULL) { + IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); + return -EEXIST; + } + + /* + * Check if the dest already exists in the trash and + * is from the same service + */ + dest = ip_vs_trash_get_dest(svc, &daddr, dport); + + if (dest != NULL) { + IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " + "dest->refcnt=%d, service %u/%s:%u\n", + IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport), + atomic_read(&dest->refcnt), + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->vaddr), + ntohs(dest->vport)); + + __ip_vs_update_dest(svc, dest, udest); + + /* + * Get the destination from the trash + */ + list_del(&dest->n_list); + + ip_vs_new_estimator(&dest->stats); + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + list_add(&dest->n_list, &svc->destinations); + svc->num_dests++; + + /* call the update_service function of its scheduler */ + if (svc->scheduler->update_service) + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + return 0; + } + + /* + * Allocate and initialize the dest structure + */ + ret = ip_vs_new_dest(svc, udest, &dest); + if (ret) { + return ret; + } + + /* + * Add the dest entry into the list + */ + atomic_inc(&dest->refcnt); + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + list_add(&dest->n_list, &svc->destinations); + svc->num_dests++; + + /* call the update_service function of its scheduler */ + if (svc->scheduler->update_service) + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + + LeaveFunction(2); + + return 0; +} + + +/* + * Edit a destination in the given service + */ +static int +ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + union nf_inet_addr daddr; + __be16 dport = udest->port; + + EnterFunction(2); + + if (udest->weight < 0) { + IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n"); + return -ERANGE; + } + + if (udest->l_threshold > udest->u_threshold) { + IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than " + "upper threshold\n"); + return -ERANGE; + } + + ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + + /* + * Lookup the destination list + */ + dest = ip_vs_lookup_dest(svc, &daddr, dport); + + if (dest == NULL) { + IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); + return -ENOENT; + } + + __ip_vs_update_dest(svc, dest, udest); + + write_lock_bh(&__ip_vs_svc_lock); + + /* Wait until all other svc users go away */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + /* call the update_service, because server weight may be changed */ + if (svc->scheduler->update_service) + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + + LeaveFunction(2); + + return 0; +} + + +/* + * Delete a destination (must be already unlinked from the service) + */ +static void __ip_vs_del_dest(struct ip_vs_dest *dest) +{ + ip_vs_kill_estimator(&dest->stats); + + /* + * Remove it from the d-linked list with the real services. + */ + write_lock_bh(&__ip_vs_rs_lock); + ip_vs_rs_unhash(dest); + write_unlock_bh(&__ip_vs_rs_lock); + + /* + * Decrease the refcnt of the dest, and free the dest + * if nobody refers to it (refcnt=0). Otherwise, throw + * the destination into the trash. + */ + if (atomic_dec_and_test(&dest->refcnt)) { + ip_vs_dst_reset(dest); + /* simply decrease svc->refcnt here, let the caller check + and release the service if nobody refers to it. + Only user context can release destination and service, + and only one user context can update virtual service at a + time, so the operation here is OK */ + atomic_dec(&dest->svc->refcnt); + kfree(dest); + } else { + IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, " + "dest->refcnt=%d\n", + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); + list_add(&dest->n_list, &ip_vs_dest_trash); + atomic_inc(&dest->refcnt); + } +} + + +/* + * Unlink a destination from the given service + */ +static void __ip_vs_unlink_dest(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + int svcupd) +{ + dest->flags &= ~IP_VS_DEST_F_AVAILABLE; + + /* + * Remove it from the d-linked destination list. + */ + list_del(&dest->n_list); + svc->num_dests--; + + /* + * Call the update_service function of its scheduler + */ + if (svcupd && svc->scheduler->update_service) + svc->scheduler->update_service(svc); +} + + +/* + * Delete a destination server in the given service + */ +static int +ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + __be16 dport = udest->port; + + EnterFunction(2); + + dest = ip_vs_lookup_dest(svc, &udest->addr, dport); + + if (dest == NULL) { + IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); + return -ENOENT; + } + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + /* + * Unlink dest from the service + */ + __ip_vs_unlink_dest(svc, dest, 1); + + write_unlock_bh(&__ip_vs_svc_lock); + + /* + * Delete the destination + */ + __ip_vs_del_dest(dest); + + LeaveFunction(2); + + return 0; +} + + +/* + * Add a service into the service hash table + */ +static int +ip_vs_add_service(struct ip_vs_service_user_kern *u, + struct ip_vs_service **svc_p) +{ + int ret = 0; + struct ip_vs_scheduler *sched = NULL; + struct ip_vs_service *svc = NULL; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + /* Lookup the scheduler by 'u->sched_name' */ + sched = ip_vs_scheduler_get(u->sched_name); + if (sched == NULL) { + IP_VS_INFO("Scheduler module ip_vs_%s not found\n", + u->sched_name); + ret = -ENOENT; + goto out_mod_dec; + } + +#ifdef CONFIG_IP_VS_IPV6 + if (u->af == AF_INET6) { + if (!sched->supports_ipv6) { + ret = -EAFNOSUPPORT; + goto out_err; + } + if ((u->netmask < 1) || (u->netmask > 128)) { + ret = -EINVAL; + goto out_err; + } + } +#endif + + svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); + if (svc == NULL) { + IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n"); + ret = -ENOMEM; + goto out_err; + } + + /* I'm the first user of the service */ + atomic_set(&svc->usecnt, 1); + atomic_set(&svc->refcnt, 0); + + svc->af = u->af; + svc->protocol = u->protocol; + ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); + svc->port = u->port; + svc->fwmark = u->fwmark; + svc->flags = u->flags; + svc->timeout = u->timeout * HZ; + svc->netmask = u->netmask; + + INIT_LIST_HEAD(&svc->destinations); + rwlock_init(&svc->sched_lock); + spin_lock_init(&svc->stats.lock); + + /* Bind the scheduler */ + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) + goto out_err; + sched = NULL; + + /* Update the virtual service counters */ + if (svc->port == FTPPORT) + atomic_inc(&ip_vs_ftpsvc_counter); + else if (svc->port == 0) + atomic_inc(&ip_vs_nullsvc_counter); + + ip_vs_new_estimator(&svc->stats); + + /* Count only IPv4 services for old get/setsockopt interface */ + if (svc->af == AF_INET) + ip_vs_num_services++; + + /* Hash the service into the service table */ + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_hash(svc); + write_unlock_bh(&__ip_vs_svc_lock); + + *svc_p = svc; + return 0; + + out_err: + if (svc != NULL) { + if (svc->scheduler) + ip_vs_unbind_scheduler(svc); + if (svc->inc) { + local_bh_disable(); + ip_vs_app_inc_put(svc->inc); + local_bh_enable(); + } + kfree(svc); + } + ip_vs_scheduler_put(sched); + + out_mod_dec: + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return ret; +} + + +/* + * Edit a service and bind it with a new scheduler + */ +static int +ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) +{ + struct ip_vs_scheduler *sched, *old_sched; + int ret = 0; + + /* + * Lookup the scheduler, by 'u->sched_name' + */ + sched = ip_vs_scheduler_get(u->sched_name); + if (sched == NULL) { + IP_VS_INFO("Scheduler module ip_vs_%s not found\n", + u->sched_name); + return -ENOENT; + } + old_sched = sched; + +#ifdef CONFIG_IP_VS_IPV6 + if (u->af == AF_INET6) { + if (!sched->supports_ipv6) { + ret = -EAFNOSUPPORT; + goto out; + } + if ((u->netmask < 1) || (u->netmask > 128)) { + ret = -EINVAL; + goto out; + } + } +#endif + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + /* + * Set the flags and timeout value + */ + svc->flags = u->flags | IP_VS_SVC_F_HASHED; + svc->timeout = u->timeout * HZ; + svc->netmask = u->netmask; + + old_sched = svc->scheduler; + if (sched != old_sched) { + /* + * Unbind the old scheduler + */ + if ((ret = ip_vs_unbind_scheduler(svc))) { + old_sched = sched; + goto out_unlock; + } + + /* + * Bind the new scheduler + */ + if ((ret = ip_vs_bind_scheduler(svc, sched))) { + /* + * If ip_vs_bind_scheduler fails, restore the old + * scheduler. + * The main reason of failure is out of memory. + * + * The question is if the old scheduler can be + * restored all the time. TODO: if it cannot be + * restored some time, we must delete the service, + * otherwise the system may crash. + */ + ip_vs_bind_scheduler(svc, old_sched); + old_sched = sched; + goto out_unlock; + } + } + + out_unlock: + write_unlock_bh(&__ip_vs_svc_lock); +#ifdef CONFIG_IP_VS_IPV6 + out: +#endif + + if (old_sched) + ip_vs_scheduler_put(old_sched); + + return ret; +} + + +/* + * Delete a service from the service list + * - The service must be unlinked, unlocked and not referenced! + * - We are called under _bh lock + */ +static void __ip_vs_del_service(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest, *nxt; + struct ip_vs_scheduler *old_sched; + + /* Count only IPv4 services for old get/setsockopt interface */ + if (svc->af == AF_INET) + ip_vs_num_services--; + + ip_vs_kill_estimator(&svc->stats); + + /* Unbind scheduler */ + old_sched = svc->scheduler; + ip_vs_unbind_scheduler(svc); + if (old_sched) + ip_vs_scheduler_put(old_sched); + + /* Unbind app inc */ + if (svc->inc) { + ip_vs_app_inc_put(svc->inc); + svc->inc = NULL; + } + + /* + * Unlink the whole destination list + */ + list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { + __ip_vs_unlink_dest(svc, dest, 0); + __ip_vs_del_dest(dest); + } + + /* + * Update the virtual service counters + */ + if (svc->port == FTPPORT) + atomic_dec(&ip_vs_ftpsvc_counter); + else if (svc->port == 0) + atomic_dec(&ip_vs_nullsvc_counter); + + /* + * Free the service if nobody refers to it + */ + if (atomic_read(&svc->refcnt) == 0) + kfree(svc); + + /* decrease the module use count */ + ip_vs_use_count_dec(); +} + +/* + * Delete a service from the service list + */ +static int ip_vs_del_service(struct ip_vs_service *svc) +{ + if (svc == NULL) + return -EEXIST; + + /* + * Unhash it from the service table + */ + write_lock_bh(&__ip_vs_svc_lock); + + ip_vs_svc_unhash(svc); + + /* + * Wait until all the svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + __ip_vs_del_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + + return 0; +} + + +/* + * Flush all the virtual services + */ +static int ip_vs_flush(void) +{ + int idx; + struct ip_vs_service *svc, *nxt; + + /* + * Flush the service table hashed by + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) { + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_unhash(svc); + /* + * Wait until all the svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + __ip_vs_del_service(svc); + write_unlock_bh(&__ip_vs_svc_lock); + } + } + + /* + * Flush the service table hashed by fwmark + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry_safe(svc, nxt, + &ip_vs_svc_fwm_table[idx], f_list) { + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_unhash(svc); + /* + * Wait until all the svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + __ip_vs_del_service(svc); + write_unlock_bh(&__ip_vs_svc_lock); + } + } + + return 0; +} + + +/* + * Zero counters in a service or all services + */ +static int ip_vs_zero_service(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + + write_lock_bh(&__ip_vs_svc_lock); + list_for_each_entry(dest, &svc->destinations, n_list) { + ip_vs_zero_stats(&dest->stats); + } + ip_vs_zero_stats(&svc->stats); + write_unlock_bh(&__ip_vs_svc_lock); + return 0; +} + +static int ip_vs_zero_all(void) +{ + int idx; + struct ip_vs_service *svc; + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + ip_vs_zero_service(svc); + } + } + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + ip_vs_zero_service(svc); + } + } + + ip_vs_zero_stats(&ip_vs_stats); + return 0; +} + + +static int +proc_do_defense_mode(ctl_table *table, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val = *valp; + int rc; + + rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); + if (write && (*valp != val)) { + if ((*valp < 0) || (*valp > 3)) { + /* Restore the correct value */ + *valp = val; + } else { + update_defense_level(); + } + } + return rc; +} + + +static int +proc_do_sync_threshold(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val[2]; + int rc; + + /* backup the value first */ + memcpy(val, valp, sizeof(val)); + + rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); + if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { + /* Restore the correct value */ + memcpy(valp, val, sizeof(val)); + } + return rc; +} + + +/* + * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) + */ + +static struct ctl_table vs_vars[] = { + { + .procname = "amemthresh", + .data = &sysctl_ip_vs_amemthresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_IP_VS_DEBUG + { + .procname = "debug_level", + .data = &sysctl_ip_vs_debug_level, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .procname = "am_droprate", + .data = &sysctl_ip_vs_am_droprate, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "drop_entry", + .data = &sysctl_ip_vs_drop_entry, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_defense_mode, + }, + { + .procname = "drop_packet", + .data = &sysctl_ip_vs_drop_packet, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_defense_mode, + }, + { + .procname = "secure_tcp", + .data = &sysctl_ip_vs_secure_tcp, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_defense_mode, + }, +#if 0 + { + .procname = "timeout_established", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_synsent", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_synrecv", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_finwait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_timewait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_close", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_closewait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_lastack", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_listen", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_synack", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_udp", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_icmp", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, +#endif + { + .procname = "cache_bypass", + .data = &sysctl_ip_vs_cache_bypass, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "expire_nodest_conn", + .data = &sysctl_ip_vs_expire_nodest_conn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "expire_quiescent_template", + .data = &sysctl_ip_vs_expire_quiescent_template, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "sync_threshold", + .data = &sysctl_ip_vs_sync_threshold, + .maxlen = sizeof(sysctl_ip_vs_sync_threshold), + .mode = 0644, + .proc_handler = &proc_do_sync_threshold, + }, + { + .procname = "nat_icmp_send", + .data = &sysctl_ip_vs_nat_icmp_send, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +const struct ctl_path net_vs_ctl_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { .procname = "vs", }, + { } +}; +EXPORT_SYMBOL_GPL(net_vs_ctl_path); + +static struct ctl_table_header * sysctl_header; + +#ifdef CONFIG_PROC_FS + +struct ip_vs_iter { + struct list_head *table; + int bucket; +}; + +/* + * Write the contents of the VS rule table to a PROCfs file. + * (It is kept just for backward compatibility) + */ +static inline const char *ip_vs_fwd_name(unsigned flags) +{ + switch (flags & IP_VS_CONN_F_FWD_MASK) { + case IP_VS_CONN_F_LOCALNODE: + return "Local"; + case IP_VS_CONN_F_TUNNEL: + return "Tunnel"; + case IP_VS_CONN_F_DROUTE: + return "Route"; + default: + return "Masq"; + } +} + + +/* Get the Nth entry in the two lists */ +static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) +{ + struct ip_vs_iter *iter = seq->private; + int idx; + struct ip_vs_service *svc; + + /* look in hash by protocol */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (pos-- == 0){ + iter->table = ip_vs_svc_table; + iter->bucket = idx; + return svc; + } + } + } + + /* keep looking in fwmark */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (pos-- == 0) { + iter->table = ip_vs_svc_fwm_table; + iter->bucket = idx; + return svc; + } + } + } + + return NULL; +} + +static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) +__acquires(__ip_vs_svc_lock) +{ + + read_lock_bh(&__ip_vs_svc_lock); + return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; +} + + +static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct list_head *e; + struct ip_vs_iter *iter; + struct ip_vs_service *svc; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_info_array(seq,0); + + svc = v; + iter = seq->private; + + if (iter->table == ip_vs_svc_table) { + /* next service in table hashed by protocol */ + if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket]) + return list_entry(e, struct ip_vs_service, s_list); + + + while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { + list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket], + s_list) { + return svc; + } + } + + iter->table = ip_vs_svc_fwm_table; + iter->bucket = -1; + goto scan_fwmark; + } + + /* next service in hashed by fwmark */ + if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket]) + return list_entry(e, struct ip_vs_service, f_list); + + scan_fwmark: + while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket], + f_list) + return svc; + } + + return NULL; +} + +static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) +__releases(__ip_vs_svc_lock) +{ + read_unlock_bh(&__ip_vs_svc_lock); +} + + +static int ip_vs_info_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_printf(seq, + "IP Virtual Server version %d.%d.%d (size=%d)\n", + NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + seq_puts(seq, + "Prot LocalAddress:Port Scheduler Flags\n"); + seq_puts(seq, + " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); + } else { + const struct ip_vs_service *svc = v; + const struct ip_vs_iter *iter = seq->private; + const struct ip_vs_dest *dest; + + if (iter->table == ip_vs_svc_table) { +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + seq_printf(seq, "%s [" NIP6_FMT "]:%04X %s ", + ip_vs_proto_name(svc->protocol), + NIP6(svc->addr.in6), + ntohs(svc->port), + svc->scheduler->name); + else +#endif + seq_printf(seq, "%s %08X:%04X %s ", + ip_vs_proto_name(svc->protocol), + ntohl(svc->addr.ip), + ntohs(svc->port), + svc->scheduler->name); + } else { + seq_printf(seq, "FWM %08X %s ", + svc->fwmark, svc->scheduler->name); + } + + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + seq_printf(seq, "persistent %d %08X\n", + svc->timeout, + ntohl(svc->netmask)); + else + seq_putc(seq, '\n'); + + list_for_each_entry(dest, &svc->destinations, n_list) { +#ifdef CONFIG_IP_VS_IPV6 + if (dest->af == AF_INET6) + seq_printf(seq, + " -> [" NIP6_FMT "]:%04X" + " %-7s %-6d %-10d %-10d\n", + NIP6(dest->addr.in6), + ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + else +#endif + seq_printf(seq, + " -> %08X:%04X " + "%-7s %-6d %-10d %-10d\n", + ntohl(dest->addr.ip), + ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + + } + } + return 0; +} + +static const struct seq_operations ip_vs_info_seq_ops = { + .start = ip_vs_info_seq_start, + .next = ip_vs_info_seq_next, + .stop = ip_vs_info_seq_stop, + .show = ip_vs_info_seq_show, +}; + +static int ip_vs_info_open(struct inode *inode, struct file *file) +{ + return seq_open_private(file, &ip_vs_info_seq_ops, + sizeof(struct ip_vs_iter)); +} + +static const struct file_operations ip_vs_info_fops = { + .owner = THIS_MODULE, + .open = ip_vs_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif + +struct ip_vs_stats ip_vs_stats = { + .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock), +}; + +#ifdef CONFIG_PROC_FS +static int ip_vs_stats_show(struct seq_file *seq, void *v) +{ + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Total Incoming Outgoing Incoming Outgoing\n"); + seq_printf(seq, + " Conns Packets Packets Bytes Bytes\n"); + + spin_lock_bh(&ip_vs_stats.lock); + seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns, + ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts, + (unsigned long long) ip_vs_stats.ustats.inbytes, + (unsigned long long) ip_vs_stats.ustats.outbytes); + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq,"%8X %8X %8X %16X %16X\n", + ip_vs_stats.ustats.cps, + ip_vs_stats.ustats.inpps, + ip_vs_stats.ustats.outpps, + ip_vs_stats.ustats.inbps, + ip_vs_stats.ustats.outbps); + spin_unlock_bh(&ip_vs_stats.lock); + + return 0; +} + +static int ip_vs_stats_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, ip_vs_stats_show, NULL); +} + +static const struct file_operations ip_vs_stats_fops = { + .owner = THIS_MODULE, + .open = ip_vs_stats_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif + +/* + * Set timeout values for tcp tcpfin udp in the timeout_table. + */ +static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) +{ + IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", + u->tcp_timeout, + u->tcp_fin_timeout, + u->udp_timeout); + +#ifdef CONFIG_IP_VS_PROTO_TCP + if (u->tcp_timeout) { + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] + = u->tcp_timeout * HZ; + } + + if (u->tcp_fin_timeout) { + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] + = u->tcp_fin_timeout * HZ; + } +#endif + +#ifdef CONFIG_IP_VS_PROTO_UDP + if (u->udp_timeout) { + ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] + = u->udp_timeout * HZ; + } +#endif + return 0; +} + + +#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) +#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user)) +#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \ + sizeof(struct ip_vs_dest_user)) +#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) +#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user)) +#define MAX_ARG_LEN SVCDEST_ARG_LEN + +static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { + [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0, + [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN, +}; + +static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, + struct ip_vs_service_user *usvc_compat) +{ + usvc->af = AF_INET; + usvc->protocol = usvc_compat->protocol; + usvc->addr.ip = usvc_compat->addr; + usvc->port = usvc_compat->port; + usvc->fwmark = usvc_compat->fwmark; + + /* Deep copy of sched_name is not needed here */ + usvc->sched_name = usvc_compat->sched_name; + + usvc->flags = usvc_compat->flags; + usvc->timeout = usvc_compat->timeout; + usvc->netmask = usvc_compat->netmask; +} + +static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, + struct ip_vs_dest_user *udest_compat) +{ + udest->addr.ip = udest_compat->addr; + udest->port = udest_compat->port; + udest->conn_flags = udest_compat->conn_flags; + udest->weight = udest_compat->weight; + udest->u_threshold = udest_compat->u_threshold; + udest->l_threshold = udest_compat->l_threshold; +} + +static int +do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + int ret; + unsigned char arg[MAX_ARG_LEN]; + struct ip_vs_service_user *usvc_compat; + struct ip_vs_service_user_kern usvc; + struct ip_vs_service *svc; + struct ip_vs_dest_user *udest_compat; + struct ip_vs_dest_user_kern udest; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (len != set_arglen[SET_CMDID(cmd)]) { + IP_VS_ERR("set_ctl: len %u != %u\n", + len, set_arglen[SET_CMDID(cmd)]); + return -EINVAL; + } + + if (copy_from_user(arg, user, len) != 0) + return -EFAULT; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + if (mutex_lock_interruptible(&__ip_vs_mutex)) { + ret = -ERESTARTSYS; + goto out_dec; + } + + if (cmd == IP_VS_SO_SET_FLUSH) { + /* Flush the virtual service */ + ret = ip_vs_flush(); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_TIMEOUT) { + /* Set timeout values for (tcp tcpfin udp) */ + ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_STARTDAEMON) { + struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; + ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_STOPDAEMON) { + struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; + ret = stop_sync_thread(dm->state); + goto out_unlock; + } + + usvc_compat = (struct ip_vs_service_user *)arg; + udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1); + + /* We only use the new structs internally, so copy userspace compat + * structs to extended internal versions */ + ip_vs_copy_usvc_compat(&usvc, usvc_compat); + ip_vs_copy_udest_compat(&udest, udest_compat); + + if (cmd == IP_VS_SO_SET_ZERO) { + /* if no service address is set, zero counters in all */ + if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { + ret = ip_vs_zero_all(); + goto out_unlock; + } + } + + /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */ + if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP) { + IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n", + usvc.protocol, NIPQUAD(usvc.addr.ip), + ntohs(usvc.port), usvc.sched_name); + ret = -EFAULT; + goto out_unlock; + } + + /* Lookup the exact service by or fwmark */ + if (usvc.fwmark == 0) + svc = __ip_vs_service_get(usvc.af, usvc.protocol, + &usvc.addr, usvc.port); + else + svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); + + if (cmd != IP_VS_SO_SET_ADD + && (svc == NULL || svc->protocol != usvc.protocol)) { + ret = -ESRCH; + goto out_unlock; + } + + switch (cmd) { + case IP_VS_SO_SET_ADD: + if (svc != NULL) + ret = -EEXIST; + else + ret = ip_vs_add_service(&usvc, &svc); + break; + case IP_VS_SO_SET_EDIT: + ret = ip_vs_edit_service(svc, &usvc); + break; + case IP_VS_SO_SET_DEL: + ret = ip_vs_del_service(svc); + if (!ret) + goto out_unlock; + break; + case IP_VS_SO_SET_ZERO: + ret = ip_vs_zero_service(svc); + break; + case IP_VS_SO_SET_ADDDEST: + ret = ip_vs_add_dest(svc, &udest); + break; + case IP_VS_SO_SET_EDITDEST: + ret = ip_vs_edit_dest(svc, &udest); + break; + case IP_VS_SO_SET_DELDEST: + ret = ip_vs_del_dest(svc, &udest); + break; + default: + ret = -EINVAL; + } + + if (svc) + ip_vs_service_put(svc); + + out_unlock: + mutex_unlock(&__ip_vs_mutex); + out_dec: + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return ret; +} + + +static void +ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) +{ + spin_lock_bh(&src->lock); + memcpy(dst, &src->ustats, sizeof(*dst)); + spin_unlock_bh(&src->lock); +} + +static void +ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) +{ + dst->protocol = src->protocol; + dst->addr = src->addr.ip; + dst->port = src->port; + dst->fwmark = src->fwmark; + strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); + dst->flags = src->flags; + dst->timeout = src->timeout / HZ; + dst->netmask = src->netmask; + dst->num_dests = src->num_dests; + ip_vs_copy_stats(&dst->stats, &src->stats); +} + +static inline int +__ip_vs_get_service_entries(const struct ip_vs_get_services *get, + struct ip_vs_get_services __user *uptr) +{ + int idx, count=0; + struct ip_vs_service *svc; + struct ip_vs_service_entry entry; + int ret = 0; + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + /* Only expose IPv4 entries to old interface */ + if (svc->af != AF_INET) + continue; + + if (count >= get->num_services) + goto out; + memset(&entry, 0, sizeof(entry)); + ip_vs_copy_service(&entry, svc); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + /* Only expose IPv4 entries to old interface */ + if (svc->af != AF_INET) + continue; + + if (count >= get->num_services) + goto out; + memset(&entry, 0, sizeof(entry)); + ip_vs_copy_service(&entry, svc); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } + out: + return ret; +} + +static inline int +__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, + struct ip_vs_get_dests __user *uptr) +{ + struct ip_vs_service *svc; + union nf_inet_addr addr = { .ip = get->addr }; + int ret = 0; + + if (get->fwmark) + svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark); + else + svc = __ip_vs_service_get(AF_INET, get->protocol, &addr, + get->port); + + if (svc) { + int count = 0; + struct ip_vs_dest *dest; + struct ip_vs_dest_entry entry; + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (count >= get->num_dests) + break; + + entry.addr = dest->addr.ip; + entry.port = dest->port; + entry.conn_flags = atomic_read(&dest->conn_flags); + entry.weight = atomic_read(&dest->weight); + entry.u_threshold = dest->u_threshold; + entry.l_threshold = dest->l_threshold; + entry.activeconns = atomic_read(&dest->activeconns); + entry.inactconns = atomic_read(&dest->inactconns); + entry.persistconns = atomic_read(&dest->persistconns); + ip_vs_copy_stats(&entry.stats, &dest->stats); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + break; + } + count++; + } + ip_vs_service_put(svc); + } else + ret = -ESRCH; + return ret; +} + +static inline void +__ip_vs_get_timeouts(struct ip_vs_timeout_user *u) +{ +#ifdef CONFIG_IP_VS_PROTO_TCP + u->tcp_timeout = + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; + u->tcp_fin_timeout = + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + u->udp_timeout = + ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ; +#endif +} + + +#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) +#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo)) +#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services)) +#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry)) +#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests)) +#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) +#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2) + +static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { + [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64, + [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN, +}; + +static int +do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + unsigned char arg[128]; + int ret = 0; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (*len < get_arglen[GET_CMDID(cmd)]) { + IP_VS_ERR("get_ctl: len %u < %u\n", + *len, get_arglen[GET_CMDID(cmd)]); + return -EINVAL; + } + + if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0) + return -EFAULT; + + if (mutex_lock_interruptible(&__ip_vs_mutex)) + return -ERESTARTSYS; + + switch (cmd) { + case IP_VS_SO_GET_VERSION: + { + char buf[64]; + + sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", + NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + if (copy_to_user(user, buf, strlen(buf)+1) != 0) { + ret = -EFAULT; + goto out; + } + *len = strlen(buf)+1; + } + break; + + case IP_VS_SO_GET_INFO: + { + struct ip_vs_getinfo info; + info.version = IP_VS_VERSION_CODE; + info.size = IP_VS_CONN_TAB_SIZE; + info.num_services = ip_vs_num_services; + if (copy_to_user(user, &info, sizeof(info)) != 0) + ret = -EFAULT; + } + break; + + case IP_VS_SO_GET_SERVICES: + { + struct ip_vs_get_services *get; + int size; + + get = (struct ip_vs_get_services *)arg; + size = sizeof(*get) + + sizeof(struct ip_vs_service_entry) * get->num_services; + if (*len != size) { + IP_VS_ERR("length: %u != %u\n", *len, size); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_service_entries(get, user); + } + break; + + case IP_VS_SO_GET_SERVICE: + { + struct ip_vs_service_entry *entry; + struct ip_vs_service *svc; + union nf_inet_addr addr; + + entry = (struct ip_vs_service_entry *)arg; + addr.ip = entry->addr; + if (entry->fwmark) + svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark); + else + svc = __ip_vs_service_get(AF_INET, entry->protocol, + &addr, entry->port); + if (svc) { + ip_vs_copy_service(entry, svc); + if (copy_to_user(user, entry, sizeof(*entry)) != 0) + ret = -EFAULT; + ip_vs_service_put(svc); + } else + ret = -ESRCH; + } + break; + + case IP_VS_SO_GET_DESTS: + { + struct ip_vs_get_dests *get; + int size; + + get = (struct ip_vs_get_dests *)arg; + size = sizeof(*get) + + sizeof(struct ip_vs_dest_entry) * get->num_dests; + if (*len != size) { + IP_VS_ERR("length: %u != %u\n", *len, size); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_dest_entries(get, user); + } + break; + + case IP_VS_SO_GET_TIMEOUT: + { + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(&t); + if (copy_to_user(user, &t, sizeof(t)) != 0) + ret = -EFAULT; + } + break; + + case IP_VS_SO_GET_DAEMON: + { + struct ip_vs_daemon_user d[2]; + + memset(&d, 0, sizeof(d)); + if (ip_vs_sync_state & IP_VS_STATE_MASTER) { + d[0].state = IP_VS_STATE_MASTER; + strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn)); + d[0].syncid = ip_vs_master_syncid; + } + if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { + d[1].state = IP_VS_STATE_BACKUP; + strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn)); + d[1].syncid = ip_vs_backup_syncid; + } + if (copy_to_user(user, &d, sizeof(d)) != 0) + ret = -EFAULT; + } + break; + + default: + ret = -EINVAL; + } + + out: + mutex_unlock(&__ip_vs_mutex); + return ret; +} + + +static struct nf_sockopt_ops ip_vs_sockopts = { + .pf = PF_INET, + .set_optmin = IP_VS_BASE_CTL, + .set_optmax = IP_VS_SO_SET_MAX+1, + .set = do_ip_vs_set_ctl, + .get_optmin = IP_VS_BASE_CTL, + .get_optmax = IP_VS_SO_GET_MAX+1, + .get = do_ip_vs_get_ctl, + .owner = THIS_MODULE, +}; + +/* + * Generic Netlink interface + */ + +/* IPVS genetlink family */ +static struct genl_family ip_vs_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = IPVS_GENL_NAME, + .version = IPVS_GENL_VERSION, + .maxattr = IPVS_CMD_MAX, +}; + +/* Policy used for first-level command attributes */ +static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { + [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 }, + [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 }, + [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */ +static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { + [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 }, + [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, + .len = IP_VS_IFNAME_MAXLEN }, + [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ +static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { + [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY, + .len = sizeof(union nf_inet_addr) }, + [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, + .len = IP_VS_SCHEDNAME_MAXLEN }, + [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, + .len = sizeof(struct ip_vs_flags) }, + [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */ +static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { + [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY, + .len = sizeof(union nf_inet_addr) }, + [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 }, + [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, +}; + +static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, + struct ip_vs_stats *stats) +{ + struct nlattr *nl_stats = nla_nest_start(skb, container_type); + if (!nl_stats) + return -EMSGSIZE; + + spin_lock_bh(&stats->lock); + + NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps); + + spin_unlock_bh(&stats->lock); + + nla_nest_end(skb, nl_stats); + + return 0; + +nla_put_failure: + spin_unlock_bh(&stats->lock); + nla_nest_cancel(skb, nl_stats); + return -EMSGSIZE; +} + +static int ip_vs_genl_fill_service(struct sk_buff *skb, + struct ip_vs_service *svc) +{ + struct nlattr *nl_service; + struct ip_vs_flags flags = { .flags = svc->flags, + .mask = ~0 }; + + nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE); + if (!nl_service) + return -EMSGSIZE; + + NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af); + + if (svc->fwmark) { + NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark); + } else { + NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol); + NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr); + NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port); + } + + NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name); + NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags); + NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ); + NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask); + + if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats)) + goto nla_put_failure; + + nla_nest_end(skb, nl_service); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_service); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_service(struct sk_buff *skb, + struct ip_vs_service *svc, + struct netlink_callback *cb) +{ + void *hdr; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_SERVICE); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_service(skb, svc) < 0) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_services(struct sk_buff *skb, + struct netlink_callback *cb) +{ + int idx = 0, i; + int start = cb->args[0]; + struct ip_vs_service *svc; + + mutex_lock(&__ip_vs_mutex); + for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { + list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { + if (++idx <= start) + continue; + if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + } + + for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { + if (++idx <= start) + continue; + if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + } + +nla_put_failure: + mutex_unlock(&__ip_vs_mutex); + cb->args[0] = idx; + + return skb->len; +} + +static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, + struct nlattr *nla, int full_entry) +{ + struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; + struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; + + /* Parse mandatory identifying service fields first */ + if (nla == NULL || + nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy)) + return -EINVAL; + + nla_af = attrs[IPVS_SVC_ATTR_AF]; + nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL]; + nla_addr = attrs[IPVS_SVC_ATTR_ADDR]; + nla_port = attrs[IPVS_SVC_ATTR_PORT]; + nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK]; + + if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) + return -EINVAL; + + usvc->af = nla_get_u16(nla_af); +#ifdef CONFIG_IP_VS_IPV6 + if (usvc->af != AF_INET && usvc->af != AF_INET6) +#else + if (usvc->af != AF_INET) +#endif + return -EAFNOSUPPORT; + + if (nla_fwmark) { + usvc->protocol = IPPROTO_TCP; + usvc->fwmark = nla_get_u32(nla_fwmark); + } else { + usvc->protocol = nla_get_u16(nla_protocol); + nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); + usvc->port = nla_get_u16(nla_port); + usvc->fwmark = 0; + } + + /* If a full entry was requested, check for the additional fields */ + if (full_entry) { + struct nlattr *nla_sched, *nla_flags, *nla_timeout, + *nla_netmask; + struct ip_vs_flags flags; + struct ip_vs_service *svc; + + nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; + nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; + nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; + nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; + + if (!(nla_sched && nla_flags && nla_timeout && nla_netmask)) + return -EINVAL; + + nla_memcpy(&flags, nla_flags, sizeof(flags)); + + /* prefill flags from service if it already exists */ + if (usvc->fwmark) + svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark); + else + svc = __ip_vs_service_get(usvc->af, usvc->protocol, + &usvc->addr, usvc->port); + if (svc) { + usvc->flags = svc->flags; + ip_vs_service_put(svc); + } else + usvc->flags = 0; + + /* set new flags from userland */ + usvc->flags = (usvc->flags & ~flags.mask) | + (flags.flags & flags.mask); + usvc->sched_name = nla_data(nla_sched); + usvc->timeout = nla_get_u32(nla_timeout); + usvc->netmask = nla_get_u32(nla_netmask); + } + + return 0; +} + +static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) +{ + struct ip_vs_service_user_kern usvc; + int ret; + + ret = ip_vs_genl_parse_service(&usvc, nla, 0); + if (ret) + return ERR_PTR(ret); + + if (usvc.fwmark) + return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); + else + return __ip_vs_service_get(usvc.af, usvc.protocol, + &usvc.addr, usvc.port); +} + +static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) +{ + struct nlattr *nl_dest; + + nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST); + if (!nl_dest) + return -EMSGSIZE; + + NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr); + NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port); + + NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD, + atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, + atomic_read(&dest->activeconns)); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS, + atomic_read(&dest->inactconns)); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, + atomic_read(&dest->persistconns)); + + if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) + goto nla_put_failure; + + nla_nest_end(skb, nl_dest); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_dest); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, + struct netlink_callback *cb) +{ + void *hdr; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_DEST); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_dest(skb, dest) < 0) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_dests(struct sk_buff *skb, + struct netlink_callback *cb) +{ + int idx = 0; + int start = cb->args[0]; + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; + + mutex_lock(&__ip_vs_mutex); + + /* Try to find the service for which to dump destinations */ + if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, + IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy)) + goto out_err; + + svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]); + if (IS_ERR(svc) || svc == NULL) + goto out_err; + + /* Dump the destinations */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (++idx <= start) + continue; + if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + +nla_put_failure: + cb->args[0] = idx; + ip_vs_service_put(svc); + +out_err: + mutex_unlock(&__ip_vs_mutex); + + return skb->len; +} + +static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, + struct nlattr *nla, int full_entry) +{ + struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; + struct nlattr *nla_addr, *nla_port; + + /* Parse mandatory identifying destination fields first */ + if (nla == NULL || + nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy)) + return -EINVAL; + + nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; + nla_port = attrs[IPVS_DEST_ATTR_PORT]; + + if (!(nla_addr && nla_port)) + return -EINVAL; + + nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); + udest->port = nla_get_u16(nla_port); + + /* If a full entry was requested, check for the additional fields */ + if (full_entry) { + struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, + *nla_l_thresh; + + nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; + nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; + nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; + nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; + + if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) + return -EINVAL; + + udest->conn_flags = nla_get_u32(nla_fwd) + & IP_VS_CONN_F_FWD_MASK; + udest->weight = nla_get_u32(nla_weight); + udest->u_threshold = nla_get_u32(nla_u_thresh); + udest->l_threshold = nla_get_u32(nla_l_thresh); + } + + return 0; +} + +static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state, + const char *mcast_ifn, __be32 syncid) +{ + struct nlattr *nl_daemon; + + nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON); + if (!nl_daemon) + return -EMSGSIZE; + + NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state); + NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn); + NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid); + + nla_nest_end(skb, nl_daemon); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_daemon); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state, + const char *mcast_ifn, __be32 syncid, + struct netlink_callback *cb) +{ + void *hdr; + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_DAEMON); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid)) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_daemons(struct sk_buff *skb, + struct netlink_callback *cb) +{ + mutex_lock(&__ip_vs_mutex); + if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { + if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, + ip_vs_master_mcast_ifn, + ip_vs_master_syncid, cb) < 0) + goto nla_put_failure; + + cb->args[0] = 1; + } + + if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { + if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, + ip_vs_backup_mcast_ifn, + ip_vs_backup_syncid, cb) < 0) + goto nla_put_failure; + + cb->args[1] = 1; + } + +nla_put_failure: + mutex_unlock(&__ip_vs_mutex); + + return skb->len; +} + +static int ip_vs_genl_new_daemon(struct nlattr **attrs) +{ + if (!(attrs[IPVS_DAEMON_ATTR_STATE] && + attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && + attrs[IPVS_DAEMON_ATTR_SYNC_ID])) + return -EINVAL; + + return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), + nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), + nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); +} + +static int ip_vs_genl_del_daemon(struct nlattr **attrs) +{ + if (!attrs[IPVS_DAEMON_ATTR_STATE]) + return -EINVAL; + + return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); +} + +static int ip_vs_genl_set_config(struct nlattr **attrs) +{ + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(&t); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) + t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]) + t.tcp_fin_timeout = + nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) + t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); + + return ip_vs_set_timeout(&t); +} + +static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct ip_vs_service *svc = NULL; + struct ip_vs_service_user_kern usvc; + struct ip_vs_dest_user_kern udest; + int ret = 0, cmd; + int need_full_svc = 0, need_full_dest = 0; + + cmd = info->genlhdr->cmd; + + mutex_lock(&__ip_vs_mutex); + + if (cmd == IPVS_CMD_FLUSH) { + ret = ip_vs_flush(); + goto out; + } else if (cmd == IPVS_CMD_SET_CONFIG) { + ret = ip_vs_genl_set_config(info->attrs); + goto out; + } else if (cmd == IPVS_CMD_NEW_DAEMON || + cmd == IPVS_CMD_DEL_DAEMON) { + + struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; + + if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || + nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX, + info->attrs[IPVS_CMD_ATTR_DAEMON], + ip_vs_daemon_policy)) { + ret = -EINVAL; + goto out; + } + + if (cmd == IPVS_CMD_NEW_DAEMON) + ret = ip_vs_genl_new_daemon(daemon_attrs); + else + ret = ip_vs_genl_del_daemon(daemon_attrs); + goto out; + } else if (cmd == IPVS_CMD_ZERO && + !info->attrs[IPVS_CMD_ATTR_SERVICE]) { + ret = ip_vs_zero_all(); + goto out; + } + + /* All following commands require a service argument, so check if we + * received a valid one. We need a full service specification when + * adding / editing a service. Only identifying members otherwise. */ + if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) + need_full_svc = 1; + + ret = ip_vs_genl_parse_service(&usvc, + info->attrs[IPVS_CMD_ATTR_SERVICE], + need_full_svc); + if (ret) + goto out; + + /* Lookup the exact service by or fwmark */ + if (usvc.fwmark == 0) + svc = __ip_vs_service_get(usvc.af, usvc.protocol, + &usvc.addr, usvc.port); + else + svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); + + /* Unless we're adding a new service, the service must already exist */ + if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { + ret = -ESRCH; + goto out; + } + + /* Destination commands require a valid destination argument. For + * adding / editing a destination, we need a full destination + * specification. */ + if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST || + cmd == IPVS_CMD_DEL_DEST) { + if (cmd != IPVS_CMD_DEL_DEST) + need_full_dest = 1; + + ret = ip_vs_genl_parse_dest(&udest, + info->attrs[IPVS_CMD_ATTR_DEST], + need_full_dest); + if (ret) + goto out; + } + + switch (cmd) { + case IPVS_CMD_NEW_SERVICE: + if (svc == NULL) + ret = ip_vs_add_service(&usvc, &svc); + else + ret = -EEXIST; + break; + case IPVS_CMD_SET_SERVICE: + ret = ip_vs_edit_service(svc, &usvc); + break; + case IPVS_CMD_DEL_SERVICE: + ret = ip_vs_del_service(svc); + break; + case IPVS_CMD_NEW_DEST: + ret = ip_vs_add_dest(svc, &udest); + break; + case IPVS_CMD_SET_DEST: + ret = ip_vs_edit_dest(svc, &udest); + break; + case IPVS_CMD_DEL_DEST: + ret = ip_vs_del_dest(svc, &udest); + break; + case IPVS_CMD_ZERO: + ret = ip_vs_zero_service(svc); + break; + default: + ret = -EINVAL; + } + +out: + if (svc) + ip_vs_service_put(svc); + mutex_unlock(&__ip_vs_mutex); + + return ret; +} + +static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + void *reply; + int ret, cmd, reply_cmd; + + cmd = info->genlhdr->cmd; + + if (cmd == IPVS_CMD_GET_SERVICE) + reply_cmd = IPVS_CMD_NEW_SERVICE; + else if (cmd == IPVS_CMD_GET_INFO) + reply_cmd = IPVS_CMD_SET_INFO; + else if (cmd == IPVS_CMD_GET_CONFIG) + reply_cmd = IPVS_CMD_SET_CONFIG; + else { + IP_VS_ERR("unknown Generic Netlink command\n"); + return -EINVAL; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + mutex_lock(&__ip_vs_mutex); + + reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); + if (reply == NULL) + goto nla_put_failure; + + switch (cmd) { + case IPVS_CMD_GET_SERVICE: + { + struct ip_vs_service *svc; + + svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]); + if (IS_ERR(svc)) { + ret = PTR_ERR(svc); + goto out_err; + } else if (svc) { + ret = ip_vs_genl_fill_service(msg, svc); + ip_vs_service_put(svc); + if (ret) + goto nla_put_failure; + } else { + ret = -ESRCH; + goto out_err; + } + + break; + } + + case IPVS_CMD_GET_CONFIG: + { + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(&t); +#ifdef CONFIG_IP_VS_PROTO_TCP + NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout); + NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, + t.tcp_fin_timeout); +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout); +#endif + + break; + } + + case IPVS_CMD_GET_INFO: + NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE); + NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, + IP_VS_CONN_TAB_SIZE); + break; + } + + genlmsg_end(msg, reply); + ret = genlmsg_unicast(msg, info->snd_pid); + goto out; + +nla_put_failure: + IP_VS_ERR("not enough space in Netlink message\n"); + ret = -EMSGSIZE; + +out_err: + nlmsg_free(msg); +out: + mutex_unlock(&__ip_vs_mutex); + + return ret; +} + + +static struct genl_ops ip_vs_genl_ops[] __read_mostly = { + { + .cmd = IPVS_CMD_NEW_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_SET_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_SERVICE, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + .dumpit = ip_vs_genl_dump_services, + .policy = ip_vs_cmd_policy, + }, + { + .cmd = IPVS_CMD_NEW_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_SET_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .dumpit = ip_vs_genl_dump_dests, + }, + { + .cmd = IPVS_CMD_NEW_DAEMON, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_DAEMON, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_DAEMON, + .flags = GENL_ADMIN_PERM, + .dumpit = ip_vs_genl_dump_daemons, + }, + { + .cmd = IPVS_CMD_SET_CONFIG, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_CONFIG, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + }, + { + .cmd = IPVS_CMD_GET_INFO, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + }, + { + .cmd = IPVS_CMD_ZERO, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_FLUSH, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_set_cmd, + }, +}; + +static int __init ip_vs_genl_register(void) +{ + int ret, i; + + ret = genl_register_family(&ip_vs_genl_family); + if (ret) + return ret; + + for (i = 0; i < ARRAY_SIZE(ip_vs_genl_ops); i++) { + ret = genl_register_ops(&ip_vs_genl_family, &ip_vs_genl_ops[i]); + if (ret) + goto err_out; + } + return 0; + +err_out: + genl_unregister_family(&ip_vs_genl_family); + return ret; +} + +static void ip_vs_genl_unregister(void) +{ + genl_unregister_family(&ip_vs_genl_family); +} + +/* End of Generic Netlink interface definitions */ + + +int __init ip_vs_control_init(void) +{ + int ret; + int idx; + + EnterFunction(2); + + ret = nf_register_sockopt(&ip_vs_sockopts); + if (ret) { + IP_VS_ERR("cannot register sockopt.\n"); + return ret; + } + + ret = ip_vs_genl_register(); + if (ret) { + IP_VS_ERR("cannot register Generic Netlink interface.\n"); + nf_unregister_sockopt(&ip_vs_sockopts); + return ret; + } + + proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); + proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops); + + sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars); + + /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_svc_table[idx]); + INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); + } + for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_rtable[idx]); + } + + ip_vs_new_estimator(&ip_vs_stats); + + /* Hook the defense timer */ + schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); + + LeaveFunction(2); + return 0; +} + + +void ip_vs_control_cleanup(void) +{ + EnterFunction(2); + ip_vs_trash_cleanup(); + cancel_rearming_delayed_work(&defense_work); + cancel_work_sync(&defense_work.work); + ip_vs_kill_estimator(&ip_vs_stats); + unregister_sysctl_table(sysctl_header); + proc_net_remove(&init_net, "ip_vs_stats"); + proc_net_remove(&init_net, "ip_vs"); + ip_vs_genl_unregister(); + nf_unregister_sockopt(&ip_vs_sockopts); + LeaveFunction(2); +} diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c new file mode 100644 index 0000000..a16943f --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -0,0 +1,261 @@ +/* + * IPVS: Destination Hashing scheduling module + * + * Authors: Wensong Zhang + * + * Inspired by the consistent hashing scheduler patch from + * Thomas Proell + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The dh algorithm is to select server by the hash key of destination IP + * address. The pseudo code is as follows: + * + * n <- servernode[dest_ip]; + * if (n is dead) OR + * (n is overloaded) OR (n.weight <= 0) then + * return NULL; + * + * return n; + * + * Notes that servernode is a 256-bucket hash table that maps the hash + * index derived from packet destination IP address to the current server + * array. If the dh scheduler is used in cache cluster, it is good to + * combine it with cache_bypass feature. When the statically assigned + * server is dead or overloaded, the load balancer can bypass the cache + * server and send requests to the original server directly. + * + */ + +#include +#include +#include +#include + +#include + + +/* + * IPVS DH bucket + */ +struct ip_vs_dh_bucket { + struct ip_vs_dest *dest; /* real server (cache) */ +}; + +/* + * for IPVS DH entry hash table + */ +#ifndef CONFIG_IP_VS_DH_TAB_BITS +#define CONFIG_IP_VS_DH_TAB_BITS 8 +#endif +#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS +#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) +#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) + + +/* + * Returns hash value for IPVS DH entry + */ +static inline unsigned ip_vs_dh_hashkey(__be32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK; +} + + +/* + * Get ip_vs_dest associated with supplied parameters. + */ +static inline struct ip_vs_dest * +ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __be32 addr) +{ + return (tbl[ip_vs_dh_hashkey(addr)]).dest; +} + + +/* + * Assign all the hash buckets of the specified table with the service. + */ +static int +ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) +{ + int i; + struct ip_vs_dh_bucket *b; + struct list_head *p; + struct ip_vs_dest *dest; + + b = tbl; + p = &svc->destinations; + for (i=0; idest = NULL; + } else { + if (p == &svc->destinations) + p = p->next; + + dest = list_entry(p, struct ip_vs_dest, n_list); + atomic_inc(&dest->refcnt); + b->dest = dest; + + p = p->next; + } + b++; + } + return 0; +} + + +/* + * Flush all the hash buckets of the specified table. + */ +static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) +{ + int i; + struct ip_vs_dh_bucket *b; + + b = tbl; + for (i=0; idest) { + atomic_dec(&b->dest->refcnt); + b->dest = NULL; + } + b++; + } +} + + +static int ip_vs_dh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl; + + /* allocate the DH table for this service */ + tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, + GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); + + /* assign the hash buckets with the updated service */ + ip_vs_dh_assign(tbl, svc); + + return 0; +} + + +static int ip_vs_dh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_dh_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", + sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); + + return 0; +} + + +static int ip_vs_dh_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_dh_flush(tbl); + + /* assign the hash buckets with the updated service */ + ip_vs_dh_assign(tbl, svc); + + return 0; +} + + +/* + * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, + * consider that the server is overloaded here. + */ +static inline int is_overloaded(struct ip_vs_dest *dest) +{ + return dest->flags & IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Destination hashing scheduling + */ +static struct ip_vs_dest * +ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_dh_bucket *tbl; + struct iphdr *iph = ip_hdr(skb); + + IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_dh_bucket *)svc->sched_data; + dest = ip_vs_dh_get(tbl, iph->daddr); + if (!dest + || !(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest)) { + return NULL; + } + + IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(iph->daddr), + NIPQUAD(dest->addr.ip), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS DH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_dh_scheduler = +{ + .name = "dh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 0, +#endif + .init_service = ip_vs_dh_init_svc, + .done_service = ip_vs_dh_done_svc, + .update_service = ip_vs_dh_update_svc, + .schedule = ip_vs_dh_schedule, +}; + + +static int __init ip_vs_dh_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_dh_scheduler); +} + + +static void __exit ip_vs_dh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); +} + + +module_init(ip_vs_dh_init); +module_exit(ip_vs_dh_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c new file mode 100644 index 0000000..2eb2860 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -0,0 +1,166 @@ +/* + * ip_vs_est.c: simple rate estimator for IPVS + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + This code is to estimate rate in a shorter interval (such as 8 + seconds) for virtual services and real servers. For measure rate in a + long interval, it is easy to implement a user level daemon which + periodically reads those statistical counters and measure rate. + + Currently, the measurement is activated by slow timer handler. Hope + this measurement will not introduce too much load. + + We measure rate during the last 8 seconds every 2 seconds: + + avgrate = avgrate*(1-W) + rate*W + + where W = 2^(-2) + + NOTES. + + * The stored value for average bps is scaled by 2^5, so that maximal + rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10. + + * A lot code is taken from net/sched/estimator.c + */ + + +static void estimation_timer(unsigned long arg); + +static LIST_HEAD(est_list); +static DEFINE_SPINLOCK(est_lock); +static DEFINE_TIMER(est_timer, estimation_timer, 0, 0); + +static void estimation_timer(unsigned long arg) +{ + struct ip_vs_estimator *e; + struct ip_vs_stats *s; + u32 n_conns; + u32 n_inpkts, n_outpkts; + u64 n_inbytes, n_outbytes; + u32 rate; + + spin_lock(&est_lock); + list_for_each_entry(e, &est_list, list) { + s = container_of(e, struct ip_vs_stats, est); + + spin_lock(&s->lock); + n_conns = s->ustats.conns; + n_inpkts = s->ustats.inpkts; + n_outpkts = s->ustats.outpkts; + n_inbytes = s->ustats.inbytes; + n_outbytes = s->ustats.outbytes; + + /* scaled by 2^10, but divided 2 seconds */ + rate = (n_conns - e->last_conns)<<9; + e->last_conns = n_conns; + e->cps += ((long)rate - (long)e->cps)>>2; + s->ustats.cps = (e->cps+0x1FF)>>10; + + rate = (n_inpkts - e->last_inpkts)<<9; + e->last_inpkts = n_inpkts; + e->inpps += ((long)rate - (long)e->inpps)>>2; + s->ustats.inpps = (e->inpps+0x1FF)>>10; + + rate = (n_outpkts - e->last_outpkts)<<9; + e->last_outpkts = n_outpkts; + e->outpps += ((long)rate - (long)e->outpps)>>2; + s->ustats.outpps = (e->outpps+0x1FF)>>10; + + rate = (n_inbytes - e->last_inbytes)<<4; + e->last_inbytes = n_inbytes; + e->inbps += ((long)rate - (long)e->inbps)>>2; + s->ustats.inbps = (e->inbps+0xF)>>5; + + rate = (n_outbytes - e->last_outbytes)<<4; + e->last_outbytes = n_outbytes; + e->outbps += ((long)rate - (long)e->outbps)>>2; + s->ustats.outbps = (e->outbps+0xF)>>5; + spin_unlock(&s->lock); + } + spin_unlock(&est_lock); + mod_timer(&est_timer, jiffies + 2*HZ); +} + +void ip_vs_new_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est = &stats->est; + + INIT_LIST_HEAD(&est->list); + + est->last_conns = stats->ustats.conns; + est->cps = stats->ustats.cps<<10; + + est->last_inpkts = stats->ustats.inpkts; + est->inpps = stats->ustats.inpps<<10; + + est->last_outpkts = stats->ustats.outpkts; + est->outpps = stats->ustats.outpps<<10; + + est->last_inbytes = stats->ustats.inbytes; + est->inbps = stats->ustats.inbps<<5; + + est->last_outbytes = stats->ustats.outbytes; + est->outbps = stats->ustats.outbps<<5; + + spin_lock_bh(&est_lock); + list_add(&est->list, &est_list); + spin_unlock_bh(&est_lock); +} + +void ip_vs_kill_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est = &stats->est; + + spin_lock_bh(&est_lock); + list_del(&est->list); + spin_unlock_bh(&est_lock); +} + +void ip_vs_zero_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est = &stats->est; + + /* set counters zero, caller must hold the stats->lock lock */ + est->last_inbytes = 0; + est->last_outbytes = 0; + est->last_conns = 0; + est->last_inpkts = 0; + est->last_outpkts = 0; + est->cps = 0; + est->inpps = 0; + est->outpps = 0; + est->inbps = 0; + est->outbps = 0; +} + +int __init ip_vs_estimator_init(void) +{ + mod_timer(&est_timer, jiffies + 2 * HZ); + return 0; +} + +void ip_vs_estimator_cleanup(void) +{ + del_timer_sync(&est_timer); +} diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c new file mode 100644 index 0000000..2e7dbd8 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -0,0 +1,410 @@ +/* + * ip_vs_ftp.c: IPVS ftp application module + * + * Authors: Wensong Zhang + * + * Changes: + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference + * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp. + * + * IP_MASQ_FTP ftp masquerading module + * + * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 + * + * Author: Wouter Gadeyne + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +#define SERVER_STRING "227 Entering Passive Mode (" +#define CLIENT_STRING "PORT " + + +/* + * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper + * First port is set to the default port. + */ +static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0}; +module_param_array(ports, ushort, NULL, 0); +MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); + + +/* Dummy variable */ +static int ip_vs_ftp_pasv; + + +static int +ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) +{ + return 0; +} + + +static int +ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) +{ + return 0; +} + + +/* + * Get from the string "xxx.xxx.xxx.xxx,ppp,ppp", started + * with the "pattern" and terminated with the "term" character. + * is in network order. + */ +static int ip_vs_ftp_get_addrport(char *data, char *data_limit, + const char *pattern, size_t plen, char term, + __be32 *addr, __be16 *port, + char **start, char **end) +{ + unsigned char p[6]; + int i = 0; + + if (data_limit - data < plen) { + /* check if there is partial match */ + if (strnicmp(data, pattern, data_limit - data) == 0) + return -1; + else + return 0; + } + + if (strnicmp(data, pattern, plen) != 0) { + return 0; + } + *start = data + plen; + + for (data = *start; *data != term; data++) { + if (data == data_limit) + return -1; + } + *end = data; + + memset(p, 0, sizeof(p)); + for (data = *start; data != *end; data++) { + if (*data >= '0' && *data <= '9') { + p[i] = p[i]*10 + *data - '0'; + } else if (*data == ',' && i < 5) { + i++; + } else { + /* unexpected character */ + return -1; + } + } + + if (i != 5) + return -1; + + *addr = get_unaligned((__be32 *)p); + *port = get_unaligned((__be16 *)(p + 4)); + return 1; +} + + +/* + * Look at outgoing ftp packets to catch the response to a PASV command + * from the server (inside-to-outside). + * When we see one, we build a connection entry with the client address, + * client port 0 (unknown at the moment), the server address and the + * server port. Mark the current connection entry as a control channel + * of the new entry. All this work is just to make the data connection + * can be scheduled to the right server later. + * + * The outgoing packet should be something like + * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". + * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. + */ +static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, + struct sk_buff *skb, int *diff) +{ + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_limit; + char *start, *end; + union nf_inet_addr from; + __be16 port; + struct ip_vs_conn *n_cp; + char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ + unsigned buf_len; + int ret; + +#ifdef CONFIG_IP_VS_IPV6 + /* This application helper doesn't work with IPv6 yet, + * so turn this into a no-op for IPv6 packets + */ + if (cp->af == AF_INET6) + return 1; +#endif + + *diff = 0; + + /* Only useful for established sessions */ + if (cp->state != IP_VS_TCP_S_ESTABLISHED) + return 1; + + /* Linear packets are much easier to deal with. */ + if (!skb_make_writable(skb, skb->len)) + return 0; + + if (cp->app_data == &ip_vs_ftp_pasv) { + iph = ip_hdr(skb); + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + data = (char *)th + (th->doff << 2); + data_limit = skb_tail_pointer(skb); + + if (ip_vs_ftp_get_addrport(data, data_limit, + SERVER_STRING, + sizeof(SERVER_STRING)-1, ')', + &from.ip, &port, + &start, &end) != 1) + return 1; + + IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> " + "%u.%u.%u.%u:%d detected\n", + NIPQUAD(from.ip), ntohs(port), + NIPQUAD(cp->caddr.ip), 0); + + /* + * Now update or create an connection entry for it + */ + n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port, + &cp->caddr, 0); + if (!n_cp) { + n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, + &cp->caddr, 0, + &cp->vaddr, port, + &from, port, + IP_VS_CONN_F_NO_CPORT, + cp->dest); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + + /* + * Replace the old passive address with the new one + */ + from.ip = n_cp->vaddr.ip; + port = n_cp->vport; + sprintf(buf, "%d,%d,%d,%d,%d,%d", NIPQUAD(from.ip), + (ntohs(port)>>8)&255, ntohs(port)&255); + buf_len = strlen(buf); + + /* + * Calculate required delta-offset to keep TCP happy + */ + *diff = buf_len - (end-start); + + if (*diff == 0) { + /* simply replace it with new passive address */ + memcpy(start, buf, buf_len); + ret = 1; + } else { + ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start, + end-start, buf, buf_len); + } + + cp->app_data = NULL; + ip_vs_tcp_conn_listen(n_cp); + ip_vs_conn_put(n_cp); + return ret; + } + return 1; +} + + +/* + * Look at incoming ftp packets to catch the PASV/PORT command + * (outside-to-inside). + * + * The incoming packet having the PORT command should be something like + * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n". + * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number. + * In this case, we create a connection entry using the client address and + * port, so that the active ftp data connection from the server can reach + * the client. + */ +static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, + struct sk_buff *skb, int *diff) +{ + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_start, *data_limit; + char *start, *end; + union nf_inet_addr to; + __be16 port; + struct ip_vs_conn *n_cp; + +#ifdef CONFIG_IP_VS_IPV6 + /* This application helper doesn't work with IPv6 yet, + * so turn this into a no-op for IPv6 packets + */ + if (cp->af == AF_INET6) + return 1; +#endif + + /* no diff required for incoming packets */ + *diff = 0; + + /* Only useful for established sessions */ + if (cp->state != IP_VS_TCP_S_ESTABLISHED) + return 1; + + /* Linear packets are much easier to deal with. */ + if (!skb_make_writable(skb, skb->len)) + return 0; + + /* + * Detecting whether it is passive + */ + iph = ip_hdr(skb); + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + + /* Since there may be OPTIONS in the TCP packet and the HLEN is + the length of the header in 32-bit multiples, it is accurate + to calculate data address by th+HLEN*4 */ + data = data_start = (char *)th + (th->doff << 2); + data_limit = skb_tail_pointer(skb); + + while (data <= data_limit - 6) { + if (strnicmp(data, "PASV\r\n", 6) == 0) { + /* Passive mode on */ + IP_VS_DBG(7, "got PASV at %td of %td\n", + data - data_start, + data_limit - data_start); + cp->app_data = &ip_vs_ftp_pasv; + return 1; + } + data++; + } + + /* + * To support virtual FTP server, the scenerio is as follows: + * FTP client ----> Load Balancer ----> FTP server + * First detect the port number in the application data, + * then create a new connection entry for the coming data + * connection. + */ + if (ip_vs_ftp_get_addrport(data_start, data_limit, + CLIENT_STRING, sizeof(CLIENT_STRING)-1, + '\r', &to.ip, &port, + &start, &end) != 1) + return 1; + + IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n", + NIPQUAD(to.ip), ntohs(port)); + + /* Passive mode off */ + cp->app_data = NULL; + + /* + * Now update or create a connection entry for it + */ + IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n", + ip_vs_proto_name(iph->protocol), + NIPQUAD(to.ip), ntohs(port), NIPQUAD(cp->vaddr.ip), 0); + + n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol, + &to, port, + &cp->vaddr, htons(ntohs(cp->vport)-1)); + if (!n_cp) { + n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, + &to, port, + &cp->vaddr, htons(ntohs(cp->vport)-1), + &cp->daddr, htons(ntohs(cp->dport)-1), + 0, + cp->dest); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + + /* + * Move tunnel to listen state + */ + ip_vs_tcp_conn_listen(n_cp); + ip_vs_conn_put(n_cp); + + return 1; +} + + +static struct ip_vs_app ip_vs_ftp = { + .name = "ftp", + .type = IP_VS_APP_TYPE_FTP, + .protocol = IPPROTO_TCP, + .module = THIS_MODULE, + .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list), + .init_conn = ip_vs_ftp_init_conn, + .done_conn = ip_vs_ftp_done_conn, + .bind_conn = NULL, + .unbind_conn = NULL, + .pkt_out = ip_vs_ftp_out, + .pkt_in = ip_vs_ftp_in, +}; + + +/* + * ip_vs_ftp initialization + */ +static int __init ip_vs_ftp_init(void) +{ + int i, ret; + struct ip_vs_app *app = &ip_vs_ftp; + + ret = register_ip_vs_app(app); + if (ret) + return ret; + + for (i=0; iprotocol, ports[i]); + if (ret) + break; + IP_VS_INFO("%s: loaded support on port[%d] = %d\n", + app->name, i, ports[i]); + } + + if (ret) + unregister_ip_vs_app(app); + + return ret; +} + + +/* + * ip_vs_ftp finish. + */ +static void __exit ip_vs_ftp_exit(void) +{ + unregister_ip_vs_app(&ip_vs_ftp); +} + + +module_init(ip_vs_ftp_init); +module_exit(ip_vs_ftp_exit); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c new file mode 100644 index 0000000..6ecef35 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -0,0 +1,555 @@ +/* + * IPVS: Locality-Based Least-Connection scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Martin Hamilton : fixed the terrible locking bugs + * *lock(tbl->lock) ==> *lock(&tbl->lock) + * Wensong Zhang : fixed the uninitilized tbl->lock bug + * Wensong Zhang : added doing full expiration check to + * collect stale entries of 24+ hours when + * no partial expire check in a half hour + * Julian Anastasov : replaced del_timer call with del_timer_sync + * to avoid the possible race between timer + * handler and del_timer thread in SMP + * + */ + +/* + * The lblc algorithm is as follows (pseudo code): + * + * if cachenode[dest_ip] is null then + * n, cachenode[dest_ip] <- {weighted least-conn node}; + * else + * n <- cachenode[dest_ip]; + * if (n is dead) OR + * (n.conns>n.weight AND + * there is a node m with m.conns +#include +#include +#include +#include + +/* for sysctl */ +#include +#include + +#include + + +/* + * It is for garbage collection of stale IPVS lblc entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. + */ +#define COUNT_FOR_FULL_EXPIRATION 30 +static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; + + +/* + * for IPVS lblc entry hash table + */ +#ifndef CONFIG_IP_VS_LBLC_TAB_BITS +#define CONFIG_IP_VS_LBLC_TAB_BITS 10 +#endif +#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS +#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) +#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) + + +/* + * IPVS lblc entry represents an association between destination + * IP address and its destination server + */ +struct ip_vs_lblc_entry { + struct list_head list; + __be32 addr; /* destination IP address */ + struct ip_vs_dest *dest; /* real server (cache) */ + unsigned long lastuse; /* last used time */ +}; + + +/* + * IPVS lblc hash table + */ +struct ip_vs_lblc_table { + struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ +}; + + +/* + * IPVS LBLC sysctl table + */ + +static ctl_table vs_vars_table[] = { + { + .procname = "lblc_expiration", + .data = &sysctl_ip_vs_lblc_expiration, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header * sysctl_header; + +static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) +{ + list_del(&en->list); + /* + * We don't kfree dest because it is refered either by its service + * or the trash dest list. + */ + atomic_dec(&en->dest->refcnt); + kfree(en); +} + + +/* + * Returns hash value for IPVS LBLC entry + */ +static inline unsigned ip_vs_lblc_hashkey(__be32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblc_table. + * returns bool success. + */ +static void +ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) +{ + unsigned hash = ip_vs_lblc_hashkey(en->addr); + + list_add(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); +} + + +/* + * Get ip_vs_lblc_entry associated with supplied parameters. Called under read + * lock + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) +{ + unsigned hash = ip_vs_lblc_hashkey(addr); + struct ip_vs_lblc_entry *en; + + list_for_each_entry(en, &tbl->bucket[hash], list) + if (en->addr == addr) + return en; + + return NULL; +} + + +/* + * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP + * address to a server. Called under write lock. + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr, + struct ip_vs_dest *dest) +{ + struct ip_vs_lblc_entry *en; + + en = ip_vs_lblc_get(tbl, daddr); + if (!en) { + en = kmalloc(sizeof(*en), GFP_ATOMIC); + if (!en) { + IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); + return NULL; + } + + en->addr = daddr; + en->lastuse = jiffies; + + atomic_inc(&dest->refcnt); + en->dest = dest; + + ip_vs_lblc_hash(tbl, en); + } else if (en->dest != dest) { + atomic_dec(&en->dest->refcnt); + atomic_inc(&dest->refcnt); + en->dest = dest; + } + + return en; +} + + +/* + * Flush all the entries of the specified table. + */ +static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) +{ + struct ip_vs_lblc_entry *en, *nxt; + int i; + + for (i=0; ibucket[i], list) { + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + } + } +} + + +static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_lblc_entry *en, *nxt; + unsigned long now = jiffies; + int i, j; + + for (i=0, j=tbl->rover; isched_lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_before(now, + en->lastuse + sysctl_ip_vs_lblc_expiration)) + continue; + + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&svc->sched_lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblc table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. + */ +static void ip_vs_lblc_check_expire(unsigned long data) +{ + struct ip_vs_service *svc = (struct ip_vs_service *) data; + struct ip_vs_lblc_table *tbl = svc->sched_data; + unsigned long now = jiffies; + int goal; + int i, j; + struct ip_vs_lblc_entry *en, *nxt; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblc_full_check(svc); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i=0, j=tbl->rover; isched_lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) + continue; + + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + goal--; + } + write_unlock(&svc->sched_lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + + +static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblc_table *tbl; + + /* + * Allocate the ip_vs_lblc_table for this service + */ + tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " + "current service\n", sizeof(*tbl)); + + /* + * Initialize the hash buckets + */ + for (i=0; ibucket[i]); + } + tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + + /* + * Hook periodic timer for garbage collection + */ + setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, + (unsigned long)svc); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); + + return 0; +} + + +static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblc_flush(tbl); + + /* release the table itself */ + kfree(tbl); + IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", + sizeof(*tbl)); + + return 0; +} + + +static inline struct ip_vs_dest * +__ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We think the overhead of processing active connections is fifty + * times higher than that of inactive connections in average. (This + * fifty times might not be accurate, we will change it later.) We + * use the following formula to estimate the overhead: + * dest->activeconns*50 + dest->inactconns + * and the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. + */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + if (atomic_read(&dest->weight) > 0) { + least = dest; + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr.ip), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. + */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + struct ip_vs_dest *d; + + list_for_each_entry(d, &svc->destinations, n_list) { + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct iphdr *iph = ip_hdr(skb); + struct ip_vs_dest *dest = NULL; + struct ip_vs_lblc_entry *en; + + IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); + + /* First look in our cache */ + read_lock(&svc->sched_lock); + en = ip_vs_lblc_get(tbl, iph->daddr); + if (en) { + /* We only hold a read lock, but this is atomic */ + en->lastuse = jiffies; + + /* + * If the destination is not available, i.e. it's in the trash, + * we must ignore it, as it may be removed from under our feet, + * if someone drops our reference count. Our caller only makes + * sure that destinations, that are not in the trash, are not + * moved to the trash, while we are scheduling. But anyone can + * free up entries from the trash at any time. + */ + + if (en->dest->flags & IP_VS_DEST_F_AVAILABLE) + dest = en->dest; + } + read_unlock(&svc->sched_lock); + + /* If the destination has a weight and is not overloaded, use it */ + if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) + goto out; + + /* No cache entry or it is invalid, time to schedule */ + dest = __ip_vs_lblc_schedule(svc, iph); + if (!dest) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + + /* If we fail to create a cache entry, we'll just use the valid dest */ + write_lock(&svc->sched_lock); + ip_vs_lblc_new(tbl, iph->daddr, dest); + write_unlock(&svc->sched_lock); + +out: + IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(iph->daddr), + NIPQUAD(dest->addr.ip), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLC Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblc_scheduler = +{ + .name = "lblc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 0, +#endif + .init_service = ip_vs_lblc_init_svc, + .done_service = ip_vs_lblc_done_svc, + .schedule = ip_vs_lblc_schedule, +}; + + +static int __init ip_vs_lblc_init(void) +{ + int ret; + + sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); + ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); + if (ret) + unregister_sysctl_table(sysctl_header); + return ret; +} + + +static void __exit ip_vs_lblc_cleanup(void) +{ + unregister_sysctl_table(sysctl_header); + unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); +} + + +module_init(ip_vs_lblc_init); +module_exit(ip_vs_lblc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c new file mode 100644 index 0000000..1f75ea8 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -0,0 +1,755 @@ +/* + * IPVS: Locality-Based Least-Connection with Replication scheduler + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Julian Anastasov : Added the missing (dest->weight>0) + * condition in the ip_vs_dest_set_max. + * + */ + +/* + * The lblc/r algorithm is as follows (pseudo code): + * + * if serverSet[dest_ip] is null then + * n, serverSet[dest_ip] <- {weighted least-conn node}; + * else + * n <- {least-conn (alive) node in serverSet[dest_ip]}; + * if (n is null) OR + * (n.conns>n.weight AND + * there is a node m with m.conns 1 AND + * now - serverSet[dest_ip].lastMod > T then + * m <- {most conn node in serverSet[dest_ip]}; + * remove m from serverSet[dest_ip]; + * if serverSet[dest_ip] changed then + * serverSet[dest_ip].lastMod <- now; + * + * return n; + * + */ + +#include +#include +#include +#include +#include + +/* for sysctl */ +#include +#include +#include + +#include + + +/* + * It is for garbage collection of stale IPVS lblcr entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. + */ +#define COUNT_FOR_FULL_EXPIRATION 30 +static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; + + +/* + * for IPVS lblcr entry hash table + */ +#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS +#define CONFIG_IP_VS_LBLCR_TAB_BITS 10 +#endif +#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS +#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) +#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) + + +/* + * IPVS destination set structure and operations + */ +struct ip_vs_dest_list { + struct ip_vs_dest_list *next; /* list link */ + struct ip_vs_dest *dest; /* destination server */ +}; + +struct ip_vs_dest_set { + atomic_t size; /* set size */ + unsigned long lastmod; /* last modified time */ + struct ip_vs_dest_list *list; /* destination list */ + rwlock_t lock; /* lock for this list */ +}; + + +static struct ip_vs_dest_list * +ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_list *e; + + for (e=set->list; e!=NULL; e=e->next) { + if (e->dest == dest) + /* already existed */ + return NULL; + } + + e = kmalloc(sizeof(*e), GFP_ATOMIC); + if (e == NULL) { + IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n"); + return NULL; + } + + atomic_inc(&dest->refcnt); + e->dest = dest; + + /* link it to the list */ + e->next = set->list; + set->list = e; + atomic_inc(&set->size); + + set->lastmod = jiffies; + return e; +} + +static void +ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_list *e, **ep; + + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { + if (e->dest == dest) { + /* HIT */ + *ep = e->next; + atomic_dec(&set->size); + set->lastmod = jiffies; + atomic_dec(&e->dest->refcnt); + kfree(e); + break; + } + ep = &e->next; + } +} + +static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) +{ + struct ip_vs_dest_list *e, **ep; + + write_lock(&set->lock); + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { + *ep = e->next; + /* + * We don't kfree dest because it is refered either + * by its service or by the trash dest list. + */ + atomic_dec(&e->dest->refcnt); + kfree(e); + } + write_unlock(&set->lock); +} + +/* get weighted least-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_list *e; + struct ip_vs_dest *dest, *least; + int loh, doh; + + if (set == NULL) + return NULL; + + /* select the first destination server, whose weight > 0 */ + for (e=set->list; e!=NULL; e=e->next) { + least = e->dest; + if (least->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + if ((atomic_read(&least->weight) > 0) + && (least->flags & IP_VS_DEST_F_AVAILABLE)) { + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + return NULL; + + /* find the destination with the weighted least load */ + nextstage: + for (e=e->next; e!=NULL; e=e->next) { + dest = e->dest; + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if ((loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) + && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr.ip), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + return least; +} + + +/* get weighted most-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_list *e; + struct ip_vs_dest *dest, *most; + int moh, doh; + + if (set == NULL) + return NULL; + + /* select the first destination server, whose weight > 0 */ + for (e=set->list; e!=NULL; e=e->next) { + most = e->dest; + if (atomic_read(&most->weight) > 0) { + moh = atomic_read(&most->activeconns) * 50 + + atomic_read(&most->inactconns); + goto nextstage; + } + } + return NULL; + + /* find the destination with the weighted most load */ + nextstage: + for (e=e->next; e!=NULL; e=e->next) { + dest = e->dest; + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ + if ((moh * atomic_read(&dest->weight) < + doh * atomic_read(&most->weight)) + && (atomic_read(&dest->weight) > 0)) { + most = dest; + moh = doh; + } + } + + IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(most->addr.ip), ntohs(most->port), + atomic_read(&most->activeconns), + atomic_read(&most->refcnt), + atomic_read(&most->weight), moh); + return most; +} + + +/* + * IPVS lblcr entry represents an association between destination + * IP address and its destination server set + */ +struct ip_vs_lblcr_entry { + struct list_head list; + __be32 addr; /* destination IP address */ + struct ip_vs_dest_set set; /* destination server set */ + unsigned long lastuse; /* last used time */ +}; + + +/* + * IPVS lblcr hash table + */ +struct ip_vs_lblcr_table { + struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ +}; + + +/* + * IPVS LBLCR sysctl table + */ + +static ctl_table vs_vars_table[] = { + { + .procname = "lblcr_expiration", + .data = &sysctl_ip_vs_lblcr_expiration, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header * sysctl_header; + +static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) +{ + list_del(&en->list); + ip_vs_dest_set_eraseall(&en->set); + kfree(en); +} + + +/* + * Returns hash value for IPVS LBLCR entry + */ +static inline unsigned ip_vs_lblcr_hashkey(__be32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblcr_table. + * returns bool success. + */ +static void +ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) +{ + unsigned hash = ip_vs_lblcr_hashkey(en->addr); + + list_add(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); +} + + +/* + * Get ip_vs_lblcr_entry associated with supplied parameters. Called under + * read lock. + */ +static inline struct ip_vs_lblcr_entry * +ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr) +{ + unsigned hash = ip_vs_lblcr_hashkey(addr); + struct ip_vs_lblcr_entry *en; + + list_for_each_entry(en, &tbl->bucket[hash], list) + if (en->addr == addr) + return en; + + return NULL; +} + + +/* + * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination + * IP address to a server. Called under write lock. + */ +static inline struct ip_vs_lblcr_entry * +ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, __be32 daddr, + struct ip_vs_dest *dest) +{ + struct ip_vs_lblcr_entry *en; + + en = ip_vs_lblcr_get(tbl, daddr); + if (!en) { + en = kmalloc(sizeof(*en), GFP_ATOMIC); + if (!en) { + IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); + return NULL; + } + + en->addr = daddr; + en->lastuse = jiffies; + + /* initilize its dest set */ + atomic_set(&(en->set.size), 0); + en->set.list = NULL; + rwlock_init(&en->set.lock); + + ip_vs_lblcr_hash(tbl, en); + } + + write_lock(&en->set.lock); + ip_vs_dest_set_insert(&en->set, dest); + write_unlock(&en->set.lock); + + return en; +} + + +/* + * Flush all the entries of the specified table. + */ +static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) +{ + int i; + struct ip_vs_lblcr_entry *en, *nxt; + + /* No locking required, only called during cleanup. */ + for (i=0; ibucket[i], list) { + ip_vs_lblcr_free(en); + } + } +} + + +static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + unsigned long now = jiffies; + int i, j; + struct ip_vs_lblcr_entry *en, *nxt; + + for (i=0, j=tbl->rover; isched_lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration, + now)) + continue; + + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&svc->sched_lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblcr table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. + */ +static void ip_vs_lblcr_check_expire(unsigned long data) +{ + struct ip_vs_service *svc = (struct ip_vs_service *) data; + struct ip_vs_lblcr_table *tbl = svc->sched_data; + unsigned long now = jiffies; + int goal; + int i, j; + struct ip_vs_lblcr_entry *en, *nxt; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblcr_full_check(svc); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i=0, j=tbl->rover; isched_lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) + continue; + + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + goal--; + } + write_unlock(&svc->sched_lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + +static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblcr_table *tbl; + + /* + * Allocate the ip_vs_lblcr_table for this service + */ + tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " + "current service\n", sizeof(*tbl)); + + /* + * Initialize the hash buckets + */ + for (i=0; ibucket[i]); + } + tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + + /* + * Hook periodic timer for garbage collection + */ + setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire, + (unsigned long)svc); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); + + return 0; +} + + +static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblcr_flush(tbl); + + /* release the table itself */ + kfree(tbl); + IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", + sizeof(*tbl)); + + return 0; +} + + +static inline struct ip_vs_dest * +__ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We think the overhead of processing active connections is fifty + * times higher than that of inactive connections in average. (This + * fifty times might not be accurate, we will change it later.) We + * use the following formula to estimate the overhead: + * dest->activeconns*50 + dest->inactconns + * and the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. + */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + if (atomic_read(&dest->weight) > 0) { + least = dest; + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr.ip), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. + */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + struct ip_vs_dest *d; + + list_for_each_entry(d, &svc->destinations, n_list) { + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + struct iphdr *iph = ip_hdr(skb); + struct ip_vs_dest *dest = NULL; + struct ip_vs_lblcr_entry *en; + + IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); + + /* First look in our cache */ + read_lock(&svc->sched_lock); + en = ip_vs_lblcr_get(tbl, iph->daddr); + if (en) { + /* We only hold a read lock, but this is atomic */ + en->lastuse = jiffies; + + /* Get the least loaded destination */ + read_lock(&en->set.lock); + dest = ip_vs_dest_set_min(&en->set); + read_unlock(&en->set.lock); + + /* More than one destination + enough time passed by, cleanup */ + if (atomic_read(&en->set.size) > 1 && + time_after(jiffies, en->set.lastmod + + sysctl_ip_vs_lblcr_expiration)) { + struct ip_vs_dest *m; + + write_lock(&en->set.lock); + m = ip_vs_dest_set_max(&en->set); + if (m) + ip_vs_dest_set_erase(&en->set, m); + write_unlock(&en->set.lock); + } + + /* If the destination is not overloaded, use it */ + if (dest && !is_overloaded(dest, svc)) { + read_unlock(&svc->sched_lock); + goto out; + } + + /* The cache entry is invalid, time to schedule */ + dest = __ip_vs_lblcr_schedule(svc, iph); + if (!dest) { + IP_VS_DBG(1, "no destination available\n"); + read_unlock(&svc->sched_lock); + return NULL; + } + + /* Update our cache entry */ + write_lock(&en->set.lock); + ip_vs_dest_set_insert(&en->set, dest); + write_unlock(&en->set.lock); + } + read_unlock(&svc->sched_lock); + + if (dest) + goto out; + + /* No cache entry, time to schedule */ + dest = __ip_vs_lblcr_schedule(svc, iph); + if (!dest) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + + /* If we fail to create a cache entry, we'll just use the valid dest */ + write_lock(&svc->sched_lock); + ip_vs_lblcr_new(tbl, iph->daddr, dest); + write_unlock(&svc->sched_lock); + +out: + IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(iph->daddr), + NIPQUAD(dest->addr.ip), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLCR Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblcr_scheduler = +{ + .name = "lblcr", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 0, +#endif + .init_service = ip_vs_lblcr_init_svc, + .done_service = ip_vs_lblcr_done_svc, + .schedule = ip_vs_lblcr_schedule, +}; + + +static int __init ip_vs_lblcr_init(void) +{ + int ret; + + sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); + ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); + if (ret) + unregister_sysctl_table(sysctl_header); + return ret; +} + + +static void __exit ip_vs_lblcr_cleanup(void) +{ + unregister_sysctl_table(sysctl_header); + unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); +} + + +module_init(ip_vs_lblcr_init); +module_exit(ip_vs_lblcr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c new file mode 100644 index 0000000..b69f808 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -0,0 +1,103 @@ +/* + * IPVS: Least-Connection Scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : added the ip_vs_lc_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#include +#include + +#include + + +static inline unsigned int +ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We think the overhead of processing active connections is 256 + * times higher than that of inactive connections in average. (This + * 256 times might not be accurate, we will change it later) We + * use the following formula to estimate the overhead now: + * dest->activeconns*256 + dest->inactconns + */ + return (atomic_read(&dest->activeconns) << 8) + + atomic_read(&dest->inactconns); +} + + +/* + * Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least = NULL; + unsigned int loh = 0, doh; + + IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n"); + + /* + * Simply select the server with the least number of + * (activeconns<<5) + inactconns + * Except whose weight is equal to zero. + * If the weight is equal to zero, it means that the server is + * quiesced, the existing connections to the server still get + * served, but no new connection is assigned to the server. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || + atomic_read(&dest->weight) == 0) + continue; + doh = ip_vs_lc_dest_overhead(dest); + if (!least || doh < loh) { + least = dest; + loh = doh; + } + } + + if (least) + IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d inactconns %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->inactconns)); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_lc_scheduler = { + .name = "lc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif + .schedule = ip_vs_lc_schedule, +}; + + +static int __init ip_vs_lc_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ; +} + +static void __exit ip_vs_lc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); +} + +module_init(ip_vs_lc_init); +module_exit(ip_vs_lc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c new file mode 100644 index 0000000..9a2d803 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -0,0 +1,138 @@ +/* + * IPVS: Never Queue scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The NQ algorithm adopts a two-speed model. When there is an idle server + * available, the job will be sent to the idle server, instead of waiting + * for a fast one. When there is no idle server available, the job will be + * sent to the server that minimize its expected delay (The Shortest + * Expected Delay scheduling algorithm). + * + * See the following paper for more information: + * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing + * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, + * pages 986-994, 1988. + * + * Thanks must go to Marko Buuri for talking NQ to me. + * + * The difference between NQ and SED is that NQ can improve overall + * system utilization. + * + */ + +#include +#include + +#include + + +static inline unsigned int +ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We only use the active connection number in the cost + * calculation here. + */ + return atomic_read(&dest->activeconns) + 1; +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least = NULL; + unsigned int loh = 0, doh; + + IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (server expected overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + + if (dest->flags & IP_VS_DEST_F_OVERLOAD || + !atomic_read(&dest->weight)) + continue; + + doh = ip_vs_nq_dest_overhead(dest); + + /* return the server directly if it is idle */ + if (atomic_read(&dest->activeconns) == 0) { + least = dest; + loh = doh; + goto out; + } + + if (!least || + (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight))) { + least = dest; + loh = doh; + } + } + + if (!least) + return NULL; + + out: + IP_VS_DBG_BUF(6, "NQ: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_nq_scheduler = +{ + .name = "nq", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif + .schedule = ip_vs_nq_schedule, +}; + + +static int __init ip_vs_nq_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_nq_scheduler); +} + +static void __exit ip_vs_nq_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); +} + +module_init(ip_vs_nq_init); +module_exit(ip_vs_nq_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c new file mode 100644 index 0000000..0791f9e --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -0,0 +1,288 @@ +/* + * ip_vs_proto.c: transport protocol load balancing support for IPVS + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +/* + * IPVS protocols can only be registered/unregistered when the ipvs + * module is loaded/unloaded, so no lock is needed in accessing the + * ipvs protocol table. + */ + +#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ +#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1)) + +static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; + + +/* + * register an ipvs protocol + */ +static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) +{ + unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + + pp->next = ip_vs_proto_table[hash]; + ip_vs_proto_table[hash] = pp; + + if (pp->init != NULL) + pp->init(pp); + + return 0; +} + + +/* + * unregister an ipvs protocol + */ +static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) +{ + struct ip_vs_protocol **pp_p; + unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + + pp_p = &ip_vs_proto_table[hash]; + for (; *pp_p; pp_p = &(*pp_p)->next) { + if (*pp_p == pp) { + *pp_p = pp->next; + if (pp->exit != NULL) + pp->exit(pp); + return 0; + } + } + + return -ESRCH; +} + + +/* + * get ip_vs_protocol object by its proto. + */ +struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) +{ + struct ip_vs_protocol *pp; + unsigned hash = IP_VS_PROTO_HASH(proto); + + for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { + if (pp->protocol == proto) + return pp; + } + + return NULL; +} + + +/* + * Propagate event for state change to all protocols + */ +void ip_vs_protocol_timeout_change(int flags) +{ + struct ip_vs_protocol *pp; + int i; + + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) { + if (pp->timeout_change) + pp->timeout_change(pp, flags); + } + } +} + + +int * +ip_vs_create_timeout_table(int *table, int size) +{ + return kmemdup(table, size, GFP_ATOMIC); +} + + +/* + * Set timeout value for state specified by name + */ +int +ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to) +{ + int i; + + if (!table || !name || !to) + return -EINVAL; + + for (i = 0; i < num; i++) { + if (strcmp(names[i], name)) + continue; + table[i] = to * HZ; + return 0; + } + return -ENOENT; +} + + +const char * ip_vs_state_name(__u16 proto, int state) +{ + struct ip_vs_protocol *pp = ip_vs_proto_get(proto); + + if (pp == NULL || pp->state_name == NULL) + return (IPPROTO_IP == proto) ? "NONE" : "ERR!"; + return pp->state_name(state); +} + + +static void +ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ + char buf[128]; + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else if (ih->frag_off & htons(IP_OFFSET)) + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + else { + __be16 _ports[2], *pptr +; + pptr = skb_header_pointer(skb, offset + ih->ihl*4, + sizeof(_ports), _ports); + if (pptr == NULL) + sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u", + pp->name, + NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + else + sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u", + pp->name, + NIPQUAD(ih->saddr), + ntohs(pptr[0]), + NIPQUAD(ih->daddr), + ntohs(pptr[1])); + } + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} + +#ifdef CONFIG_IP_VS_IPV6 +static void +ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ + char buf[192]; + struct ipv6hdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else if (ih->nexthdr == IPPROTO_FRAGMENT) + sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT " frag", + pp->name, NIP6(ih->saddr), + NIP6(ih->daddr)); + else { + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr), + sizeof(_ports), _ports); + if (pptr == NULL) + sprintf(buf, "%s TRUNCATED " NIP6_FMT "->" NIP6_FMT, + pp->name, + NIP6(ih->saddr), + NIP6(ih->daddr)); + else + sprintf(buf, "%s " NIP6_FMT ":%u->" NIP6_FMT ":%u", + pp->name, + NIP6(ih->saddr), + ntohs(pptr[0]), + NIP6(ih->daddr), + ntohs(pptr[1])); + } + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} +#endif + + +void +ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (skb->protocol == htons(ETH_P_IPV6)) + ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); + else +#endif + ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg); +} + + +int __init ip_vs_protocol_init(void) +{ + char protocols[64]; +#define REGISTER_PROTOCOL(p) \ + do { \ + register_ip_vs_protocol(p); \ + strcat(protocols, ", "); \ + strcat(protocols, (p)->name); \ + } while (0) + + protocols[0] = '\0'; + protocols[2] = '\0'; +#ifdef CONFIG_IP_VS_PROTO_TCP + REGISTER_PROTOCOL(&ip_vs_protocol_tcp); +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + REGISTER_PROTOCOL(&ip_vs_protocol_udp); +#endif +#ifdef CONFIG_IP_VS_PROTO_AH + REGISTER_PROTOCOL(&ip_vs_protocol_ah); +#endif +#ifdef CONFIG_IP_VS_PROTO_ESP + REGISTER_PROTOCOL(&ip_vs_protocol_esp); +#endif + IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]); + + return 0; +} + + +void ip_vs_protocol_cleanup(void) +{ + struct ip_vs_protocol *pp; + int i; + + /* unregister all the ipvs protocols */ + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + while ((pp = ip_vs_proto_table[i]) != NULL) + unregister_ip_vs_protocol(pp); + } +} diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c new file mode 100644 index 0000000..80ab0c8 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -0,0 +1,235 @@ +/* + * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS + * + * Authors: Julian Anastasov , February 2002 + * Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation; + * + */ + +#include +#include +#include +#include +#include +#include + +#include + + +/* TODO: + +struct isakmp_hdr { + __u8 icookie[8]; + __u8 rcookie[8]; + __u8 np; + __u8 version; + __u8 xchgtype; + __u8 flags; + __u32 msgid; + __u32 length; +}; + +*/ + +#define PORT_ISAKMP 500 + + +static struct ip_vs_conn * +ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_in_get(af, IPPROTO_UDP, + &iph->saddr, + htons(PORT_ISAKMP), + &iph->daddr, + htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_in_get(af, IPPROTO_UDP, + &iph->daddr, + htons(PORT_ISAKMP), + &iph->saddr, + htons(PORT_ISAKMP)); + } + + if (!cp) { + /* + * We are not sure if the packet is from our + * service, so our conn_schedule hook should return NF_ACCEPT + */ + IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " + "%s%s %s->%s\n", + inverse ? "ICMP+" : "", + pp->name, + IP_VS_DBG_ADDR(af, &iph->saddr), + IP_VS_DBG_ADDR(af, &iph->daddr)); + } + + return cp; +} + + +static struct ip_vs_conn * +ah_esp_conn_out_get(int af, const struct sk_buff *skb, + struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, + unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_out_get(af, IPPROTO_UDP, + &iph->saddr, + htons(PORT_ISAKMP), + &iph->daddr, + htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_out_get(af, IPPROTO_UDP, + &iph->daddr, + htons(PORT_ISAKMP), + &iph->saddr, + htons(PORT_ISAKMP)); + } + + if (!cp) { + IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " + "%s%s %s->%s\n", + inverse ? "ICMP+" : "", + pp->name, + IP_VS_DBG_ADDR(af, &iph->saddr), + IP_VS_DBG_ADDR(af, &iph->daddr)); + } + + return cp; +} + + +static int +ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) +{ + /* + * AH/ESP is only related traffic. Pass the packet to IP stack. + */ + *verdict = NF_ACCEPT; + return 0; +} + + +static void +ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) +{ + char buf[256]; + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} + +#ifdef CONFIG_IP_VS_IPV6 +static void +ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) +{ + char buf[256]; + struct ipv6hdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else + sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT, + pp->name, NIP6(ih->saddr), + NIP6(ih->daddr)); + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} +#endif + +static void +ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (skb->protocol == htons(ETH_P_IPV6)) + ah_esp_debug_packet_v6(pp, skb, offset, msg); + else +#endif + ah_esp_debug_packet_v4(pp, skb, offset, msg); +} + + +static void ah_esp_init(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +static void ah_esp_exit(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +#ifdef CONFIG_IP_VS_PROTO_AH +struct ip_vs_protocol ip_vs_protocol_ah = { + .name = "AH", + .protocol = IPPROTO_AH, + .num_states = 1, + .dont_defrag = 1, + .init = ah_esp_init, + .exit = ah_esp_exit, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ah_esp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ + .set_state_timeout = NULL, +}; +#endif + +#ifdef CONFIG_IP_VS_PROTO_ESP +struct ip_vs_protocol ip_vs_protocol_esp = { + .name = "ESP", + .protocol = IPPROTO_ESP, + .num_states = 1, + .dont_defrag = 1, + .init = ah_esp_init, + .exit = ah_esp_exit, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ah_esp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ +}; +#endif diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c new file mode 100644 index 0000000..dd4566e --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -0,0 +1,732 @@ +/* + * ip_vs_proto_tcp.c: TCP load balancing support for IPVS + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include +#include +#include /* for tcphdr */ +#include +#include /* for csum_tcpudp_magic */ +#include +#include +#include + +#include + + +static struct ip_vs_conn * +tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) +{ + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) { + return ip_vs_conn_in_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); + } else { + return ip_vs_conn_in_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); + } +} + +static struct ip_vs_conn * +tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) +{ + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) { + return ip_vs_conn_out_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); + } else { + return ip_vs_conn_out_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); + } +} + + +static int +tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) +{ + struct ip_vs_service *svc; + struct tcphdr _tcph, *th; + struct ip_vs_iphdr iph; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + + th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph); + if (th == NULL) { + *verdict = NF_DROP; + return 0; + } + + if (th->syn && + (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, + th->dest))) { + if (ip_vs_todrop()) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + ip_vs_service_put(svc); + *verdict = NF_DROP; + return 0; + } + + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. + */ + *cpp = ip_vs_schedule(svc, skb); + if (!*cpp) { + *verdict = ip_vs_leave(svc, skb, pp); + return 0; + } + ip_vs_service_put(svc); + } + return 1; +} + + +static inline void +tcp_fast_csum_update(int af, struct tcphdr *tcph, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldport, __be16 newport) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcph->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(tcph->check)))); + else +#endif + tcph->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(tcph->check)))); +} + + +static inline void +tcp_partial_csum_update(int af, struct tcphdr *tcph, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldlen, __be16 newlen) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcph->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldlen, newlen, + ~csum_unfold(tcph->check)))); + else +#endif + tcph->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldlen, newlen, + ~csum_unfold(tcph->check)))); +} + + +static int +tcp_snat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct tcphdr *tcph; + unsigned int tcphoff; + int oldlen; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); + oldlen = skb->len - tcphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* Call application helper if needed */ + if (!ip_vs_app_pkt_out(cp, skb)) + return 0; + } + + tcph = (void *)skb_network_header(skb) + tcphoff; + tcph->source = cp->vport; + + /* Adjust TCP checksums */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + htonl(oldlen), + htonl(skb->len - tcphoff)); + } else if (!cp->app) { + /* Only port and addr are changed, do fast csum update */ + tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + cp->dport, cp->vport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + tcph->check = 0; + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcph->check = csum_ipv6_magic(&cp->vaddr.in6, + &cp->caddr.in6, + skb->len - tcphoff, + cp->protocol, skb->csum); + else +#endif + tcph->check = csum_tcpudp_magic(cp->vaddr.ip, + cp->caddr.ip, + skb->len - tcphoff, + cp->protocol, + skb->csum); + + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", + pp->name, tcph->check, + (char*)&(tcph->check) - (char*)tcph); + } + return 1; +} + + +static int +tcp_dnat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct tcphdr *tcph; + unsigned int tcphoff; + int oldlen; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); + oldlen = skb->len - tcphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* + * Attempt ip_vs_app call. + * It will fix ip_vs_conn and iph ack_seq stuff + */ + if (!ip_vs_app_pkt_in(cp, skb)) + return 0; + } + + tcph = (void *)skb_network_header(skb) + tcphoff; + tcph->dest = cp->dport; + + /* + * Adjust TCP checksums + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + htonl(oldlen), + htonl(skb->len - tcphoff)); + } else if (!cp->app) { + /* Only port and addr are changed, do fast csum update */ + tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, + cp->vport, cp->dport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + tcph->check = 0; + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcph->check = csum_ipv6_magic(&cp->caddr.in6, + &cp->daddr.in6, + skb->len - tcphoff, + cp->protocol, skb->csum); + else +#endif + tcph->check = csum_tcpudp_magic(cp->caddr.ip, + cp->daddr.ip, + skb->len - tcphoff, + cp->protocol, + skb->csum); + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return 1; +} + + +static int +tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + unsigned int tcphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); + + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); + case CHECKSUM_COMPLETE: +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - tcphoff, + ipv6_hdr(skb)->nexthdr, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } + } else +#endif + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - tcphoff, + ip_hdr(skb)->protocol, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } + break; + default: + /* No need to checksum. */ + break; + } + + return 1; +} + + +#define TCP_DIR_INPUT 0 +#define TCP_DIR_OUTPUT 4 +#define TCP_DIR_INPUT_ONLY 8 + +static const int tcp_state_off[IP_VS_DIR_LAST] = { + [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, + [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, + [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, +}; + +/* + * Timeout table[state] + */ +static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = 2*HZ, + [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, + [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, + [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, + [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, + [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, + [IP_VS_TCP_S_CLOSE] = 10*HZ, + [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, + [IP_VS_TCP_S_LAST_ACK] = 30*HZ, + [IP_VS_TCP_S_LISTEN] = 2*60*HZ, + [IP_VS_TCP_S_SYNACK] = 120*HZ, + [IP_VS_TCP_S_LAST] = 2*HZ, +}; + +static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = "NONE", + [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", + [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", + [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", + [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", + [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", + [IP_VS_TCP_S_CLOSE] = "CLOSE", + [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", + [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", + [IP_VS_TCP_S_LISTEN] = "LISTEN", + [IP_VS_TCP_S_SYNACK] = "SYNACK", + [IP_VS_TCP_S_LAST] = "BUG!", +}; + +#define sNO IP_VS_TCP_S_NONE +#define sES IP_VS_TCP_S_ESTABLISHED +#define sSS IP_VS_TCP_S_SYN_SENT +#define sSR IP_VS_TCP_S_SYN_RECV +#define sFW IP_VS_TCP_S_FIN_WAIT +#define sTW IP_VS_TCP_S_TIME_WAIT +#define sCL IP_VS_TCP_S_CLOSE +#define sCW IP_VS_TCP_S_CLOSE_WAIT +#define sLA IP_VS_TCP_S_LAST_ACK +#define sLI IP_VS_TCP_S_LISTEN +#define sSA IP_VS_TCP_S_SYNACK + +struct tcp_states_t { + int next_state[IP_VS_TCP_S_LAST]; +}; + +static const char * tcp_state_name(int state) +{ + if (state >= IP_VS_TCP_S_LAST) + return "ERR!"; + return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; +} + +static struct tcp_states_t tcp_states [] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static struct tcp_states_t tcp_states_dos [] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, +/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static struct tcp_states_t *tcp_state_table = tcp_states; + + +static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) +{ + int on = (flags & 1); /* secure_tcp */ + + /* + ** FIXME: change secure_tcp to independent sysctl var + ** or make it per-service or per-app because it is valid + ** for most if not for all of the applications. Something + ** like "capabilities" (flags) for each object. + */ + tcp_state_table = (on? tcp_states_dos : tcp_states); +} + +static int +tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) +{ + return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST, + tcp_state_name_table, sname, to); +} + +static inline int tcp_state_idx(struct tcphdr *th) +{ + if (th->rst) + return 3; + if (th->syn) + return 0; + if (th->fin) + return 1; + if (th->ack) + return 2; + return -1; +} + +static inline void +set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, + int direction, struct tcphdr *th) +{ + int state_idx; + int new_state = IP_VS_TCP_S_CLOSE; + int state_off = tcp_state_off[direction]; + + /* + * Update state offset to INPUT_ONLY if necessary + * or delete NO_OUTPUT flag if output packet detected + */ + if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { + if (state_off == TCP_DIR_OUTPUT) + cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; + else + state_off = TCP_DIR_INPUT_ONLY; + } + + if ((state_idx = tcp_state_idx(th)) < 0) { + IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); + goto tcp_state_out; + } + + new_state = tcp_state_table[state_off+state_idx].next_state[cp->state]; + + tcp_state_out: + if (new_state != cp->state) { + struct ip_vs_dest *dest = cp->dest; + + IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->" + "%s:%d state: %s->%s conn->refcnt:%d\n", + pp->name, + ((state_off == TCP_DIR_OUTPUT) ? + "output " : "input "), + th->syn ? 'S' : '.', + th->fin ? 'F' : '.', + th->ack ? 'A' : '.', + th->rst ? 'R' : '.', + IP_VS_DBG_ADDR(cp->af, &cp->daddr), + ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + tcp_state_name(cp->state), + tcp_state_name(new_state), + atomic_read(&cp->refcnt)); + + if (dest) { + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (new_state != IP_VS_TCP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (new_state == IP_VS_TCP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + } + + cp->timeout = pp->timeout_table[cp->state = new_state]; +} + + +/* + * Handle state transitions + */ +static int +tcp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + struct tcphdr _tcph, *th; + +#ifdef CONFIG_IP_VS_IPV6 + int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); +#else + int ihl = ip_hdrlen(skb); +#endif + + th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph); + if (th == NULL) + return 0; + + spin_lock(&cp->lock); + set_tcp_state(pp, cp, direction, th); + spin_unlock(&cp->lock); + + return 1; +} + + +/* + * Hash table for TCP application incarnations + */ +#define TCP_APP_TAB_BITS 4 +#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) +#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) + +static struct list_head tcp_apps[TCP_APP_TAB_SIZE]; +static DEFINE_SPINLOCK(tcp_app_lock); + +static inline __u16 tcp_app_hashkey(__be16 port) +{ + return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) + & TCP_APP_TAB_MASK; +} + + +static int tcp_register_app(struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash; + __be16 port = inc->port; + int ret = 0; + + hash = tcp_app_hashkey(port); + + spin_lock_bh(&tcp_app_lock); + list_for_each_entry(i, &tcp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add(&inc->p_list, &tcp_apps[hash]); + atomic_inc(&ip_vs_protocol_tcp.appcnt); + + out: + spin_unlock_bh(&tcp_app_lock); + return ret; +} + + +static void +tcp_unregister_app(struct ip_vs_app *inc) +{ + spin_lock_bh(&tcp_app_lock); + atomic_dec(&ip_vs_protocol_tcp.appcnt); + list_del(&inc->p_list); + spin_unlock_bh(&tcp_app_lock); +} + + +static int +tcp_app_conn_bind(struct ip_vs_conn *cp) +{ + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + + /* Lookup application incarnations and bind the right one */ + hash = tcp_app_hashkey(cp->vport); + + spin_lock(&tcp_app_lock); + list_for_each_entry(inc, &tcp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + spin_unlock(&tcp_app_lock); + + IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" + "%s:%u to app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + goto out; + } + } + spin_unlock(&tcp_app_lock); + + out: + return result; +} + + +/* + * Set LISTEN timeout. (ip_vs_conn_put will setup timer) + */ +void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) +{ + spin_lock(&cp->lock); + cp->state = IP_VS_TCP_S_LISTEN; + cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN]; + spin_unlock(&cp->lock); +} + + +static void ip_vs_tcp_init(struct ip_vs_protocol *pp) +{ + IP_VS_INIT_HASH_TABLE(tcp_apps); + pp->timeout_table = tcp_timeouts; +} + + +static void ip_vs_tcp_exit(struct ip_vs_protocol *pp) +{ +} + + +struct ip_vs_protocol ip_vs_protocol_tcp = { + .name = "TCP", + .protocol = IPPROTO_TCP, + .num_states = IP_VS_TCP_S_LAST, + .dont_defrag = 0, + .appcnt = ATOMIC_INIT(0), + .init = ip_vs_tcp_init, + .exit = ip_vs_tcp_exit, + .register_app = tcp_register_app, + .unregister_app = tcp_unregister_app, + .conn_schedule = tcp_conn_schedule, + .conn_in_get = tcp_conn_in_get, + .conn_out_get = tcp_conn_out_get, + .snat_handler = tcp_snat_handler, + .dnat_handler = tcp_dnat_handler, + .csum_check = tcp_csum_check, + .state_name = tcp_state_name, + .state_transition = tcp_state_transition, + .app_conn_bind = tcp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = tcp_timeout_change, + .set_state_timeout = tcp_set_state_timeout, +}; diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c new file mode 100644 index 0000000..6eb6039 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -0,0 +1,533 @@ +/* + * ip_vs_proto_udp.c: UDP load balancing support for IPVS + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static struct ip_vs_conn * +udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) { + cp = ip_vs_conn_in_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); + } else { + cp = ip_vs_conn_in_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); + } + + return cp; +} + + +static struct ip_vs_conn * +udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) { + cp = ip_vs_conn_out_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); + } else { + cp = ip_vs_conn_out_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); + } + + return cp; +} + + +static int +udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) +{ + struct ip_vs_service *svc; + struct udphdr _udph, *uh; + struct ip_vs_iphdr iph; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + + uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph); + if (uh == NULL) { + *verdict = NF_DROP; + return 0; + } + + svc = ip_vs_service_get(af, skb->mark, iph.protocol, + &iph.daddr, uh->dest); + if (svc) { + if (ip_vs_todrop()) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + ip_vs_service_put(svc); + *verdict = NF_DROP; + return 0; + } + + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. + */ + *cpp = ip_vs_schedule(svc, skb); + if (!*cpp) { + *verdict = ip_vs_leave(svc, skb, pp); + return 0; + } + ip_vs_service_put(svc); + } + return 1; +} + + +static inline void +udp_fast_csum_update(int af, struct udphdr *uhdr, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldport, __be16 newport) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + uhdr->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(uhdr->check)))); + else +#endif + uhdr->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(uhdr->check)))); + if (!uhdr->check) + uhdr->check = CSUM_MANGLED_0; +} + +static inline void +udp_partial_csum_update(int af, struct udphdr *uhdr, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldlen, __be16 newlen) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + uhdr->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldlen, newlen, + ~csum_unfold(uhdr->check)))); + else +#endif + uhdr->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldlen, newlen, + ~csum_unfold(uhdr->check)))); +} + + +static int +udp_snat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct udphdr *udph; + unsigned int udphoff; + int oldlen; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); + oldlen = skb->len - udphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, udphoff+sizeof(*udph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* + * Call application helper if needed + */ + if (!ip_vs_app_pkt_out(cp, skb)) + return 0; + } + + udph = (void *)skb_network_header(skb) + udphoff; + udph->source = cp->vport; + + /* + * Adjust UDP checksums + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + htonl(oldlen), + htonl(skb->len - udphoff)); + } else if (!cp->app && (udph->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + cp->dport, cp->vport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + udph->check = 0; + skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udph->check = csum_ipv6_magic(&cp->vaddr.in6, + &cp->caddr.in6, + skb->len - udphoff, + cp->protocol, skb->csum); + else +#endif + udph->check = csum_tcpudp_magic(cp->vaddr.ip, + cp->caddr.ip, + skb->len - udphoff, + cp->protocol, + skb->csum); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", + pp->name, udph->check, + (char*)&(udph->check) - (char*)udph); + } + return 1; +} + + +static int +udp_dnat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct udphdr *udph; + unsigned int udphoff; + int oldlen; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); + oldlen = skb->len - udphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, udphoff+sizeof(*udph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* + * Attempt ip_vs_app call. + * It will fix ip_vs_conn + */ + if (!ip_vs_app_pkt_in(cp, skb)) + return 0; + } + + udph = (void *)skb_network_header(skb) + udphoff; + udph->dest = cp->dport; + + /* + * Adjust UDP checksums + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + htonl(oldlen), + htonl(skb->len - udphoff)); + } else if (!cp->app && (udph->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, + cp->vport, cp->dport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + udph->check = 0; + skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udph->check = csum_ipv6_magic(&cp->caddr.in6, + &cp->daddr.in6, + skb->len - udphoff, + cp->protocol, skb->csum); + else +#endif + udph->check = csum_tcpudp_magic(cp->caddr.ip, + cp->daddr.ip, + skb->len - udphoff, + cp->protocol, + skb->csum); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return 1; +} + + +static int +udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + struct udphdr _udph, *uh; + unsigned int udphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); + + uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); + if (uh == NULL) + return 0; + + if (uh->check != 0) { + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = skb_checksum(skb, udphoff, + skb->len - udphoff, 0); + case CHECKSUM_COMPLETE: +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - udphoff, + ipv6_hdr(skb)->nexthdr, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } + } else +#endif + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - udphoff, + ip_hdr(skb)->protocol, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } + break; + default: + /* No need to checksum. */ + break; + } + } + return 1; +} + + +/* + * Note: the caller guarantees that only one of register_app, + * unregister_app or app_conn_bind is called each time. + */ + +#define UDP_APP_TAB_BITS 4 +#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) +#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) + +static struct list_head udp_apps[UDP_APP_TAB_SIZE]; +static DEFINE_SPINLOCK(udp_app_lock); + +static inline __u16 udp_app_hashkey(__be16 port) +{ + return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) + & UDP_APP_TAB_MASK; +} + + +static int udp_register_app(struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash; + __be16 port = inc->port; + int ret = 0; + + hash = udp_app_hashkey(port); + + + spin_lock_bh(&udp_app_lock); + list_for_each_entry(i, &udp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add(&inc->p_list, &udp_apps[hash]); + atomic_inc(&ip_vs_protocol_udp.appcnt); + + out: + spin_unlock_bh(&udp_app_lock); + return ret; +} + + +static void +udp_unregister_app(struct ip_vs_app *inc) +{ + spin_lock_bh(&udp_app_lock); + atomic_dec(&ip_vs_protocol_udp.appcnt); + list_del(&inc->p_list); + spin_unlock_bh(&udp_app_lock); +} + + +static int udp_app_conn_bind(struct ip_vs_conn *cp) +{ + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + + /* Lookup application incarnations and bind the right one */ + hash = udp_app_hashkey(cp->vport); + + spin_lock(&udp_app_lock); + list_for_each_entry(inc, &udp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + spin_unlock(&udp_app_lock); + + IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" + "%s:%u to app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + goto out; + } + } + spin_unlock(&udp_app_lock); + + out: + return result; +} + + +static int udp_timeouts[IP_VS_UDP_S_LAST+1] = { + [IP_VS_UDP_S_NORMAL] = 5*60*HZ, + [IP_VS_UDP_S_LAST] = 2*HZ, +}; + +static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = { + [IP_VS_UDP_S_NORMAL] = "UDP", + [IP_VS_UDP_S_LAST] = "BUG!", +}; + + +static int +udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) +{ + return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST, + udp_state_name_table, sname, to); +} + +static const char * udp_state_name(int state) +{ + if (state >= IP_VS_UDP_S_LAST) + return "ERR!"; + return udp_state_name_table[state] ? udp_state_name_table[state] : "?"; +} + +static int +udp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL]; + return 1; +} + +static void udp_init(struct ip_vs_protocol *pp) +{ + IP_VS_INIT_HASH_TABLE(udp_apps); + pp->timeout_table = udp_timeouts; +} + +static void udp_exit(struct ip_vs_protocol *pp) +{ +} + + +struct ip_vs_protocol ip_vs_protocol_udp = { + .name = "UDP", + .protocol = IPPROTO_UDP, + .num_states = IP_VS_UDP_S_LAST, + .dont_defrag = 0, + .init = udp_init, + .exit = udp_exit, + .conn_schedule = udp_conn_schedule, + .conn_in_get = udp_conn_in_get, + .conn_out_get = udp_conn_out_get, + .snat_handler = udp_snat_handler, + .dnat_handler = udp_dnat_handler, + .csum_check = udp_csum_check, + .state_transition = udp_state_transition, + .state_name = udp_state_name, + .register_app = udp_register_app, + .unregister_app = udp_unregister_app, + .app_conn_bind = udp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, + .set_state_timeout = udp_set_state_timeout, +}; diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c new file mode 100644 index 0000000..a22195f --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -0,0 +1,112 @@ +/* + * IPVS: Round-Robin Scheduling module + * + * Authors: Wensong Zhang + * Peter Kese + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes/Changes: + * Wensong Zhang : changed the ip_vs_rr_schedule to return dest + * Julian Anastasov : fixed the NULL pointer access bug in debugging + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_rr_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#include +#include + +#include + + +static int ip_vs_rr_init_svc(struct ip_vs_service *svc) +{ + svc->sched_data = &svc->destinations; + return 0; +} + + +static int ip_vs_rr_update_svc(struct ip_vs_service *svc) +{ + svc->sched_data = &svc->destinations; + return 0; +} + + +/* + * Round-Robin Scheduling + */ +static struct ip_vs_dest * +ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct list_head *p, *q; + struct ip_vs_dest *dest; + + IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n"); + + write_lock(&svc->sched_lock); + p = (struct list_head *)svc->sched_data; + p = p->next; + q = p; + do { + /* skip list head */ + if (q == &svc->destinations) { + q = q->next; + continue; + } + + dest = list_entry(q, struct ip_vs_dest, n_list); + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) + /* HIT */ + goto out; + q = q->next; + } while (q != p); + write_unlock(&svc->sched_lock); + return NULL; + + out: + svc->sched_data = q; + write_unlock(&svc->sched_lock); + IP_VS_DBG_BUF(6, "RR: server %s:%u " + "activeconns %d refcnt %d weight %d\n", + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), atomic_read(&dest->weight)); + + return dest; +} + + +static struct ip_vs_scheduler ip_vs_rr_scheduler = { + .name = "rr", /* name */ + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif + .init_service = ip_vs_rr_init_svc, + .update_service = ip_vs_rr_update_svc, + .schedule = ip_vs_rr_schedule, +}; + +static int __init ip_vs_rr_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_rr_scheduler); +} + +static void __exit ip_vs_rr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); +} + +module_init(ip_vs_rr_init); +module_exit(ip_vs_rr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c new file mode 100644 index 0000000..a46ad9e --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -0,0 +1,251 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * Peter Kese + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include +#include +#include +#include +#include +#include + +#include + +/* + * IPVS scheduler list + */ +static LIST_HEAD(ip_vs_schedulers); + +/* lock for service table */ +static DEFINE_RWLOCK(__ip_vs_sched_lock); + + +/* + * Bind a service with a scheduler + */ +int ip_vs_bind_scheduler(struct ip_vs_service *svc, + struct ip_vs_scheduler *scheduler) +{ + int ret; + + if (svc == NULL) { + IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n"); + return -EINVAL; + } + if (scheduler == NULL) { + IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n"); + return -EINVAL; + } + + svc->scheduler = scheduler; + + if (scheduler->init_service) { + ret = scheduler->init_service(svc); + if (ret) { + IP_VS_ERR("ip_vs_bind_scheduler(): init error\n"); + return ret; + } + } + + return 0; +} + + +/* + * Unbind a service with its scheduler + */ +int ip_vs_unbind_scheduler(struct ip_vs_service *svc) +{ + struct ip_vs_scheduler *sched; + + if (svc == NULL) { + IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n"); + return -EINVAL; + } + + sched = svc->scheduler; + if (sched == NULL) { + IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n"); + return -EINVAL; + } + + if (sched->done_service) { + if (sched->done_service(svc) != 0) { + IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n"); + return -EINVAL; + } + } + + svc->scheduler = NULL; + return 0; +} + + +/* + * Get scheduler in the scheduler list by name + */ +static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + + IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n", + sched_name); + + read_lock_bh(&__ip_vs_sched_lock); + + list_for_each_entry(sched, &ip_vs_schedulers, n_list) { + /* + * Test and get the modules atomically + */ + if (sched->module && !try_module_get(sched->module)) { + /* + * This scheduler is just deleted + */ + continue; + } + if (strcmp(sched_name, sched->name)==0) { + /* HIT */ + read_unlock_bh(&__ip_vs_sched_lock); + return sched; + } + if (sched->module) + module_put(sched->module); + } + + read_unlock_bh(&__ip_vs_sched_lock); + return NULL; +} + + +/* + * Lookup scheduler and try to load it if it doesn't exist + */ +struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + + /* + * Search for the scheduler by sched_name + */ + sched = ip_vs_sched_getbyname(sched_name); + + /* + * If scheduler not found, load the module and search again + */ + if (sched == NULL) { + request_module("ip_vs_%s", sched_name); + sched = ip_vs_sched_getbyname(sched_name); + } + + return sched; +} + +void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) +{ + if (scheduler->module) + module_put(scheduler->module); +} + + +/* + * Register a scheduler in the scheduler list + */ +int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + struct ip_vs_scheduler *sched; + + if (!scheduler) { + IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n"); + return -EINVAL; + } + + if (!scheduler->name) { + IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n"); + return -EINVAL; + } + + /* increase the module use count */ + ip_vs_use_count_inc(); + + write_lock_bh(&__ip_vs_sched_lock); + + if (!list_empty(&scheduler->n_list)) { + write_unlock_bh(&__ip_vs_sched_lock); + ip_vs_use_count_dec(); + IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " + "already linked\n", scheduler->name); + return -EINVAL; + } + + /* + * Make sure that the scheduler with this name doesn't exist + * in the scheduler list. + */ + list_for_each_entry(sched, &ip_vs_schedulers, n_list) { + if (strcmp(scheduler->name, sched->name) == 0) { + write_unlock_bh(&__ip_vs_sched_lock); + ip_vs_use_count_dec(); + IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " + "already existed in the system\n", + scheduler->name); + return -EINVAL; + } + } + /* + * Add it into the d-linked scheduler list + */ + list_add(&scheduler->n_list, &ip_vs_schedulers); + write_unlock_bh(&__ip_vs_sched_lock); + + IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name); + + return 0; +} + + +/* + * Unregister a scheduler from the scheduler list + */ +int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + if (!scheduler) { + IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n"); + return -EINVAL; + } + + write_lock_bh(&__ip_vs_sched_lock); + if (list_empty(&scheduler->n_list)) { + write_unlock_bh(&__ip_vs_sched_lock); + IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler " + "is not in the list. failed\n", scheduler->name); + return -EINVAL; + } + + /* + * Remove it from the d-linked scheduler list + */ + list_del(&scheduler->n_list); + write_unlock_bh(&__ip_vs_sched_lock); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name); + + return 0; +} diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c new file mode 100644 index 0000000..7d2f22f --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -0,0 +1,140 @@ +/* + * IPVS: Shortest Expected Delay scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The SED algorithm attempts to minimize each job's expected delay until + * completion. The expected delay that the job will experience is + * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of + * jobs on the ith server and Ui is the fixed service rate (weight) of + * the ith server. The SED algorithm adopts a greedy policy that each does + * what is in its own best interest, i.e. to join the queue which would + * minimize its expected delay of completion. + * + * See the following paper for more information: + * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing + * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, + * pages 986-994, 1988. + * + * Thanks must go to Marko Buuri for talking SED to me. + * + * The difference between SED and WLC is that SED includes the incoming + * job in the cost function (the increment of 1). SED may outperform + * WLC, while scheduling big jobs under larger heterogeneous systems + * (the server weight varies a lot). + * + */ + +#include +#include + +#include + + +static inline unsigned int +ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We only use the active connection number in the cost + * calculation here. + */ + return atomic_read(&dest->activeconns) + 1; +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least; + unsigned int loh, doh; + + IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (server expected overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_sed_dest_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + doh = ip_vs_sed_dest_overhead(dest); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "SED: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_sed_scheduler = +{ + .name = "sed", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif + .schedule = ip_vs_sed_schedule, +}; + + +static int __init ip_vs_sed_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_sed_scheduler); +} + +static void __exit ip_vs_sed_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); +} + +module_init(ip_vs_sed_init); +module_exit(ip_vs_sed_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c new file mode 100644 index 0000000..1d96de2 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -0,0 +1,258 @@ +/* + * IPVS: Source Hashing scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The sh algorithm is to select server by the hash key of source IP + * address. The pseudo code is as follows: + * + * n <- servernode[src_ip]; + * if (n is dead) OR + * (n is overloaded) or (n.weight <= 0) then + * return NULL; + * + * return n; + * + * Notes that servernode is a 256-bucket hash table that maps the hash + * index derived from packet source IP address to the current server + * array. If the sh scheduler is used in cache cluster, it is good to + * combine it with cache_bypass feature. When the statically assigned + * server is dead or overloaded, the load balancer can bypass the cache + * server and send requests to the original server directly. + * + */ + +#include +#include +#include +#include + +#include + + +/* + * IPVS SH bucket + */ +struct ip_vs_sh_bucket { + struct ip_vs_dest *dest; /* real server (cache) */ +}; + +/* + * for IPVS SH entry hash table + */ +#ifndef CONFIG_IP_VS_SH_TAB_BITS +#define CONFIG_IP_VS_SH_TAB_BITS 8 +#endif +#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS +#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) +#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) + + +/* + * Returns hash value for IPVS SH entry + */ +static inline unsigned ip_vs_sh_hashkey(__be32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK; +} + + +/* + * Get ip_vs_dest associated with supplied parameters. + */ +static inline struct ip_vs_dest * +ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __be32 addr) +{ + return (tbl[ip_vs_sh_hashkey(addr)]).dest; +} + + +/* + * Assign all the hash buckets of the specified table with the service. + */ +static int +ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) +{ + int i; + struct ip_vs_sh_bucket *b; + struct list_head *p; + struct ip_vs_dest *dest; + + b = tbl; + p = &svc->destinations; + for (i=0; idest = NULL; + } else { + if (p == &svc->destinations) + p = p->next; + + dest = list_entry(p, struct ip_vs_dest, n_list); + atomic_inc(&dest->refcnt); + b->dest = dest; + + p = p->next; + } + b++; + } + return 0; +} + + +/* + * Flush all the hash buckets of the specified table. + */ +static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) +{ + int i; + struct ip_vs_sh_bucket *b; + + b = tbl; + for (i=0; idest) { + atomic_dec(&b->dest->refcnt); + b->dest = NULL; + } + b++; + } +} + + +static int ip_vs_sh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl; + + /* allocate the SH table for this service */ + tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, + GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); + + /* assign the hash buckets with the updated service */ + ip_vs_sh_assign(tbl, svc); + + return 0; +} + + +static int ip_vs_sh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_sh_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); + + return 0; +} + + +static int ip_vs_sh_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_sh_flush(tbl); + + /* assign the hash buckets with the updated service */ + ip_vs_sh_assign(tbl, svc); + + return 0; +} + + +/* + * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, + * consider that the server is overloaded here. + */ +static inline int is_overloaded(struct ip_vs_dest *dest) +{ + return dest->flags & IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Source Hashing scheduling + */ +static struct ip_vs_dest * +ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_sh_bucket *tbl; + struct iphdr *iph = ip_hdr(skb); + + IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_sh_bucket *)svc->sched_data; + dest = ip_vs_sh_get(tbl, iph->saddr); + if (!dest + || !(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest)) { + return NULL; + } + + IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(iph->saddr), + NIPQUAD(dest->addr.ip), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS SH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_sh_scheduler = +{ + .name = "sh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 0, +#endif + .init_service = ip_vs_sh_init_svc, + .done_service = ip_vs_sh_done_svc, + .update_service = ip_vs_sh_update_svc, + .schedule = ip_vs_sh_schedule, +}; + + +static int __init ip_vs_sh_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_sh_scheduler); +} + + +static void __exit ip_vs_sh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); +} + + +module_init(ip_vs_sh_init); +module_exit(ip_vs_sh_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c new file mode 100644 index 0000000..de5e7e1 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -0,0 +1,942 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the NetFilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * + * ip_vs_sync: sync connection info from master load balancer to backups + * through multicast + * + * Changes: + * Alexandre Cassen : Added master & backup support at a time. + * Alexandre Cassen : Added SyncID support for incoming sync + * messages filtering. + * Justin Ossevoort : Fix endian problem on sync message size. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for ip_mc_join_group */ +#include +#include +#include +#include +#include + +#include +#include + +#include + +#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ +#define IP_VS_SYNC_PORT 8848 /* multicast port */ + + +/* + * IPVS sync connection entry + */ +struct ip_vs_sync_conn { + __u8 reserved; + + /* Protocol, addresses and port numbers */ + __u8 protocol; /* Which protocol (TCP/UDP) */ + __be16 cport; + __be16 vport; + __be16 dport; + __be32 caddr; /* client address */ + __be32 vaddr; /* virtual address */ + __be32 daddr; /* destination address */ + + /* Flags and state transition */ + __be16 flags; /* status flags */ + __be16 state; /* state info */ + + /* The sequence options start here */ +}; + +struct ip_vs_sync_conn_options { + struct ip_vs_seq in_seq; /* incoming seq. struct */ + struct ip_vs_seq out_seq; /* outgoing seq. struct */ +}; + +struct ip_vs_sync_thread_data { + struct socket *sock; + char *buf; +}; + +#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) +#define FULL_CONN_SIZE \ +(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) + + +/* + The master mulitcasts messages to the backup load balancers in the + following format. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Count Conns | SyncID | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | IPVS Sync Connection (1) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | . | + | . | + | . | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | IPVS Sync Connection (n) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +*/ + +#define SYNC_MESG_HEADER_LEN 4 +#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ + +struct ip_vs_sync_mesg { + __u8 nr_conns; + __u8 syncid; + __u16 size; + + /* ip_vs_sync_conn entries start here */ +}; + +/* the maximum length of sync (sending/receiving) message */ +static int sync_send_mesg_maxlen; +static int sync_recv_mesg_maxlen; + +struct ip_vs_sync_buff { + struct list_head list; + unsigned long firstuse; + + /* pointers for the message data */ + struct ip_vs_sync_mesg *mesg; + unsigned char *head; + unsigned char *end; +}; + + +/* the sync_buff list head and the lock */ +static LIST_HEAD(ip_vs_sync_queue); +static DEFINE_SPINLOCK(ip_vs_sync_lock); + +/* current sync_buff for accepting new conn entries */ +static struct ip_vs_sync_buff *curr_sb = NULL; +static DEFINE_SPINLOCK(curr_sb_lock); + +/* ipvs sync daemon state */ +volatile int ip_vs_sync_state = IP_VS_STATE_NONE; +volatile int ip_vs_master_syncid = 0; +volatile int ip_vs_backup_syncid = 0; + +/* multicast interface name */ +char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; +char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; + +/* sync daemon tasks */ +static struct task_struct *sync_master_thread; +static struct task_struct *sync_backup_thread; + +/* multicast addr */ +static struct sockaddr_in mcast_addr = { + .sin_family = AF_INET, + .sin_port = __constant_htons(IP_VS_SYNC_PORT), + .sin_addr.s_addr = __constant_htonl(IP_VS_SYNC_GROUP), +}; + + +static inline struct ip_vs_sync_buff *sb_dequeue(void) +{ + struct ip_vs_sync_buff *sb; + + spin_lock_bh(&ip_vs_sync_lock); + if (list_empty(&ip_vs_sync_queue)) { + sb = NULL; + } else { + sb = list_entry(ip_vs_sync_queue.next, + struct ip_vs_sync_buff, + list); + list_del(&sb->list); + } + spin_unlock_bh(&ip_vs_sync_lock); + + return sb; +} + +static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) +{ + struct ip_vs_sync_buff *sb; + + if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) + return NULL; + + if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { + kfree(sb); + return NULL; + } + sb->mesg->nr_conns = 0; + sb->mesg->syncid = ip_vs_master_syncid; + sb->mesg->size = 4; + sb->head = (unsigned char *)sb->mesg + 4; + sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; + sb->firstuse = jiffies; + return sb; +} + +static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) +{ + kfree(sb->mesg); + kfree(sb); +} + +static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) +{ + spin_lock(&ip_vs_sync_lock); + if (ip_vs_sync_state & IP_VS_STATE_MASTER) + list_add_tail(&sb->list, &ip_vs_sync_queue); + else + ip_vs_sync_buff_release(sb); + spin_unlock(&ip_vs_sync_lock); +} + +/* + * Get the current sync buffer if it has been created for more + * than the specified time or the specified time is zero. + */ +static inline struct ip_vs_sync_buff * +get_curr_sync_buff(unsigned long time) +{ + struct ip_vs_sync_buff *sb; + + spin_lock_bh(&curr_sb_lock); + if (curr_sb && (time == 0 || + time_before(jiffies - curr_sb->firstuse, time))) { + sb = curr_sb; + curr_sb = NULL; + } else + sb = NULL; + spin_unlock_bh(&curr_sb_lock); + return sb; +} + + +/* + * Add an ip_vs_conn information into the current sync_buff. + * Called by ip_vs_in. + */ +void ip_vs_sync_conn(struct ip_vs_conn *cp) +{ + struct ip_vs_sync_mesg *m; + struct ip_vs_sync_conn *s; + int len; + + spin_lock(&curr_sb_lock); + if (!curr_sb) { + if (!(curr_sb=ip_vs_sync_buff_create())) { + spin_unlock(&curr_sb_lock); + IP_VS_ERR("ip_vs_sync_buff_create failed.\n"); + return; + } + } + + len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : + SIMPLE_CONN_SIZE; + m = curr_sb->mesg; + s = (struct ip_vs_sync_conn *)curr_sb->head; + + /* copy members */ + s->protocol = cp->protocol; + s->cport = cp->cport; + s->vport = cp->vport; + s->dport = cp->dport; + s->caddr = cp->caddr.ip; + s->vaddr = cp->vaddr.ip; + s->daddr = cp->daddr.ip; + s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); + s->state = htons(cp->state); + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { + struct ip_vs_sync_conn_options *opt = + (struct ip_vs_sync_conn_options *)&s[1]; + memcpy(opt, &cp->in_seq, sizeof(*opt)); + } + + m->nr_conns++; + m->size += len; + curr_sb->head += len; + + /* check if there is a space for next one */ + if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { + sb_queue_tail(curr_sb); + curr_sb = NULL; + } + spin_unlock(&curr_sb_lock); + + /* synchronize its controller if it has */ + if (cp->control) + ip_vs_sync_conn(cp->control); +} + + +/* + * Process received multicast message and create the corresponding + * ip_vs_conn entries. + */ +static void ip_vs_process_message(const char *buffer, const size_t buflen) +{ + struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; + struct ip_vs_sync_conn *s; + struct ip_vs_sync_conn_options *opt; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + struct ip_vs_dest *dest; + char *p; + int i; + + if (buflen < sizeof(struct ip_vs_sync_mesg)) { + IP_VS_ERR_RL("sync message header too short\n"); + return; + } + + /* Convert size back to host byte order */ + m->size = ntohs(m->size); + + if (buflen != m->size) { + IP_VS_ERR_RL("bogus sync message size\n"); + return; + } + + /* SyncID sanity check */ + if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) { + IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", + m->syncid); + return; + } + + p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); + for (i=0; inr_conns; i++) { + unsigned flags, state; + + if (p + SIMPLE_CONN_SIZE > buffer+buflen) { + IP_VS_ERR_RL("bogus conn in sync message\n"); + return; + } + s = (struct ip_vs_sync_conn *) p; + flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; + flags &= ~IP_VS_CONN_F_HASHED; + if (flags & IP_VS_CONN_F_SEQ_MASK) { + opt = (struct ip_vs_sync_conn_options *)&s[1]; + p += FULL_CONN_SIZE; + if (p > buffer+buflen) { + IP_VS_ERR_RL("bogus conn options in sync message\n"); + return; + } + } else { + opt = NULL; + p += SIMPLE_CONN_SIZE; + } + + state = ntohs(s->state); + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + pp = ip_vs_proto_get(s->protocol); + if (!pp) { + IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n", + s->protocol); + continue; + } + if (state >= pp->num_states) { + IP_VS_DBG(2, "Invalid %s state %u in sync msg\n", + pp->name, state); + continue; + } + } else { + /* protocol in templates is not used for state/timeout */ + pp = NULL; + if (state > 0) { + IP_VS_DBG(2, "Invalid template state %u in sync msg\n", + state); + state = 0; + } + } + + if (!(flags & IP_VS_CONN_F_TEMPLATE)) + cp = ip_vs_conn_in_get(AF_INET, s->protocol, + (union nf_inet_addr *)&s->caddr, + s->cport, + (union nf_inet_addr *)&s->vaddr, + s->vport); + else + cp = ip_vs_ct_in_get(AF_INET, s->protocol, + (union nf_inet_addr *)&s->caddr, + s->cport, + (union nf_inet_addr *)&s->vaddr, + s->vport); + if (!cp) { + /* + * Find the appropriate destination for the connection. + * If it is not found the connection will remain unbound + * but still handled. + */ + dest = ip_vs_find_dest(AF_INET, + (union nf_inet_addr *)&s->daddr, + s->dport, + (union nf_inet_addr *)&s->vaddr, + s->vport, + s->protocol); + /* Set the approprite ativity flag */ + if (s->protocol == IPPROTO_TCP) { + if (state != IP_VS_TCP_S_ESTABLISHED) + flags |= IP_VS_CONN_F_INACTIVE; + else + flags &= ~IP_VS_CONN_F_INACTIVE; + } + cp = ip_vs_conn_new(AF_INET, s->protocol, + (union nf_inet_addr *)&s->caddr, + s->cport, + (union nf_inet_addr *)&s->vaddr, + s->vport, + (union nf_inet_addr *)&s->daddr, + s->dport, + flags, dest); + if (dest) + atomic_dec(&dest->refcnt); + if (!cp) { + IP_VS_ERR("ip_vs_conn_new failed\n"); + return; + } + } else if (!cp->dest) { + dest = ip_vs_try_bind_dest(cp); + if (dest) + atomic_dec(&dest->refcnt); + } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && + (cp->state != state)) { + /* update active/inactive flag for the connection */ + dest = cp->dest; + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (state != IP_VS_TCP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (state == IP_VS_TCP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + + if (opt) + memcpy(&cp->in_seq, opt, sizeof(*opt)); + atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); + cp->state = state; + cp->old_state = cp->state; + /* + * We can not recover the right timeout for templates + * in all cases, we can not find the right fwmark + * virtual service. If needed, we can do it for + * non-fwmark persistent services. + */ + if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table) + cp->timeout = pp->timeout_table[state]; + else + cp->timeout = (3*60*HZ); + ip_vs_conn_put(cp); + } +} + + +/* + * Setup loopback of outgoing multicasts on a sending socket + */ +static void set_mcast_loop(struct sock *sk, u_char loop) +{ + struct inet_sock *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ + lock_sock(sk); + inet->mc_loop = loop ? 1 : 0; + release_sock(sk); +} + +/* + * Specify TTL for outgoing multicasts on a sending socket + */ +static void set_mcast_ttl(struct sock *sk, u_char ttl) +{ + struct inet_sock *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ + lock_sock(sk); + inet->mc_ttl = ttl; + release_sock(sk); +} + +/* + * Specifiy default interface for outgoing multicasts + */ +static int set_mcast_if(struct sock *sk, char *ifname) +{ + struct net_device *dev; + struct inet_sock *inet = inet_sk(sk); + + if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + return -ENODEV; + + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + lock_sock(sk); + inet->mc_index = dev->ifindex; + /* inet->mc_addr = 0; */ + release_sock(sk); + + return 0; +} + + +/* + * Set the maximum length of sync message according to the + * specified interface's MTU. + */ +static int set_sync_mesg_maxlen(int sync_state) +{ + struct net_device *dev; + int num; + + if (sync_state == IP_VS_STATE_MASTER) { + if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) + return -ENODEV; + + num = (dev->mtu - sizeof(struct iphdr) - + sizeof(struct udphdr) - + SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; + sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN + + SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); + IP_VS_DBG(7, "setting the maximum length of sync sending " + "message %d.\n", sync_send_mesg_maxlen); + } else if (sync_state == IP_VS_STATE_BACKUP) { + if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) + return -ENODEV; + + sync_recv_mesg_maxlen = dev->mtu - + sizeof(struct iphdr) - sizeof(struct udphdr); + IP_VS_DBG(7, "setting the maximum length of sync receiving " + "message %d.\n", sync_recv_mesg_maxlen); + } + + return 0; +} + + +/* + * Join a multicast group. + * the group is specified by a class D multicast address 224.0.0.0/8 + * in the in_addr structure passed in as a parameter. + */ +static int +join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) +{ + struct ip_mreqn mreq; + struct net_device *dev; + int ret; + + memset(&mreq, 0, sizeof(mreq)); + memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); + + if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + return -ENODEV; + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + mreq.imr_ifindex = dev->ifindex; + + lock_sock(sk); + ret = ip_mc_join_group(sk, &mreq); + release_sock(sk); + + return ret; +} + + +static int bind_mcastif_addr(struct socket *sock, char *ifname) +{ + struct net_device *dev; + __be32 addr; + struct sockaddr_in sin; + + if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + return -ENODEV; + + addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + if (!addr) + IP_VS_ERR("You probably need to specify IP address on " + "multicast interface.\n"); + + IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n", + ifname, NIPQUAD(addr)); + + /* Now bind the socket with the address of multicast interface */ + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr; + sin.sin_port = 0; + + return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); +} + +/* + * Set up sending multicast socket over UDP + */ +static struct socket * make_send_sock(void) +{ + struct socket *sock; + int result; + + /* First create a socket */ + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (result < 0) { + IP_VS_ERR("Error during creation of socket; terminating\n"); + return ERR_PTR(result); + } + + result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn); + if (result < 0) { + IP_VS_ERR("Error setting outbound mcast interface\n"); + goto error; + } + + set_mcast_loop(sock->sk, 0); + set_mcast_ttl(sock->sk, 1); + + result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn); + if (result < 0) { + IP_VS_ERR("Error binding address of the mcast interface\n"); + goto error; + } + + result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, + sizeof(struct sockaddr), 0); + if (result < 0) { + IP_VS_ERR("Error connecting to the multicast addr\n"); + goto error; + } + + return sock; + + error: + sock_release(sock); + return ERR_PTR(result); +} + + +/* + * Set up receiving multicast socket over UDP + */ +static struct socket * make_receive_sock(void) +{ + struct socket *sock; + int result; + + /* First create a socket */ + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (result < 0) { + IP_VS_ERR("Error during creation of socket; terminating\n"); + return ERR_PTR(result); + } + + /* it is equivalent to the REUSEADDR option in user-space */ + sock->sk->sk_reuse = 1; + + result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, + sizeof(struct sockaddr)); + if (result < 0) { + IP_VS_ERR("Error binding to the multicast addr\n"); + goto error; + } + + /* join the multicast group */ + result = join_mcast_group(sock->sk, + (struct in_addr *) &mcast_addr.sin_addr, + ip_vs_backup_mcast_ifn); + if (result < 0) { + IP_VS_ERR("Error joining to the multicast group\n"); + goto error; + } + + return sock; + + error: + sock_release(sock); + return ERR_PTR(result); +} + + +static int +ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) +{ + struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; + struct kvec iov; + int len; + + EnterFunction(7); + iov.iov_base = (void *)buffer; + iov.iov_len = length; + + len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); + + LeaveFunction(7); + return len; +} + +static void +ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) +{ + int msize; + + msize = msg->size; + + /* Put size in network byte order */ + msg->size = htons(msg->size); + + if (ip_vs_send_async(sock, (char *)msg, msize) != msize) + IP_VS_ERR("ip_vs_send_async error\n"); +} + +static int +ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) +{ + struct msghdr msg = {NULL,}; + struct kvec iov; + int len; + + EnterFunction(7); + + /* Receive a packet */ + iov.iov_base = buffer; + iov.iov_len = (size_t)buflen; + + len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); + + if (len < 0) + return -1; + + LeaveFunction(7); + return len; +} + + +static int sync_thread_master(void *data) +{ + struct ip_vs_sync_thread_data *tinfo = data; + struct ip_vs_sync_buff *sb; + + IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, " + "syncid = %d\n", + ip_vs_master_mcast_ifn, ip_vs_master_syncid); + + while (!kthread_should_stop()) { + while ((sb = sb_dequeue())) { + ip_vs_send_sync_msg(tinfo->sock, sb->mesg); + ip_vs_sync_buff_release(sb); + } + + /* check if entries stay in curr_sb for 2 seconds */ + sb = get_curr_sync_buff(2 * HZ); + if (sb) { + ip_vs_send_sync_msg(tinfo->sock, sb->mesg); + ip_vs_sync_buff_release(sb); + } + + schedule_timeout_interruptible(HZ); + } + + /* clean up the sync_buff queue */ + while ((sb=sb_dequeue())) { + ip_vs_sync_buff_release(sb); + } + + /* clean up the current sync_buff */ + if ((sb = get_curr_sync_buff(0))) { + ip_vs_sync_buff_release(sb); + } + + /* release the sending multicast socket */ + sock_release(tinfo->sock); + kfree(tinfo); + + return 0; +} + + +static int sync_thread_backup(void *data) +{ + struct ip_vs_sync_thread_data *tinfo = data; + int len; + + IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, " + "syncid = %d\n", + ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); + + while (!kthread_should_stop()) { + wait_event_interruptible(*tinfo->sock->sk->sk_sleep, + !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) + || kthread_should_stop()); + + /* do we have data now? */ + while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { + len = ip_vs_receive(tinfo->sock, tinfo->buf, + sync_recv_mesg_maxlen); + if (len <= 0) { + IP_VS_ERR("receiving message error\n"); + break; + } + + /* disable bottom half, because it accesses the data + shared by softirq while getting/creating conns */ + local_bh_disable(); + ip_vs_process_message(tinfo->buf, len); + local_bh_enable(); + } + } + + /* release the sending multicast socket */ + sock_release(tinfo->sock); + kfree(tinfo->buf); + kfree(tinfo); + + return 0; +} + + +int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) +{ + struct ip_vs_sync_thread_data *tinfo; + struct task_struct **realtask, *task; + struct socket *sock; + char *name, *buf = NULL; + int (*threadfn)(void *data); + int result = -ENOMEM; + + IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); + IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", + sizeof(struct ip_vs_sync_conn)); + + if (state == IP_VS_STATE_MASTER) { + if (sync_master_thread) + return -EEXIST; + + strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, + sizeof(ip_vs_master_mcast_ifn)); + ip_vs_master_syncid = syncid; + realtask = &sync_master_thread; + name = "ipvs_syncmaster"; + threadfn = sync_thread_master; + sock = make_send_sock(); + } else if (state == IP_VS_STATE_BACKUP) { + if (sync_backup_thread) + return -EEXIST; + + strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, + sizeof(ip_vs_backup_mcast_ifn)); + ip_vs_backup_syncid = syncid; + realtask = &sync_backup_thread; + name = "ipvs_syncbackup"; + threadfn = sync_thread_backup; + sock = make_receive_sock(); + } else { + return -EINVAL; + } + + if (IS_ERR(sock)) { + result = PTR_ERR(sock); + goto out; + } + + set_sync_mesg_maxlen(state); + if (state == IP_VS_STATE_BACKUP) { + buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL); + if (!buf) + goto outsocket; + } + + tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + if (!tinfo) + goto outbuf; + + tinfo->sock = sock; + tinfo->buf = buf; + + task = kthread_run(threadfn, tinfo, name); + if (IS_ERR(task)) { + result = PTR_ERR(task); + goto outtinfo; + } + + /* mark as active */ + *realtask = task; + ip_vs_sync_state |= state; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + return 0; + +outtinfo: + kfree(tinfo); +outbuf: + kfree(buf); +outsocket: + sock_release(sock); +out: + return result; +} + + +int stop_sync_thread(int state) +{ + IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); + + if (state == IP_VS_STATE_MASTER) { + if (!sync_master_thread) + return -ESRCH; + + IP_VS_INFO("stopping master sync thread %d ...\n", + task_pid_nr(sync_master_thread)); + + /* + * The lock synchronizes with sb_queue_tail(), so that we don't + * add sync buffers to the queue, when we are already in + * progress of stopping the master sync daemon. + */ + + spin_lock_bh(&ip_vs_sync_lock); + ip_vs_sync_state &= ~IP_VS_STATE_MASTER; + spin_unlock_bh(&ip_vs_sync_lock); + kthread_stop(sync_master_thread); + sync_master_thread = NULL; + } else if (state == IP_VS_STATE_BACKUP) { + if (!sync_backup_thread) + return -ESRCH; + + IP_VS_INFO("stopping backup sync thread %d ...\n", + task_pid_nr(sync_backup_thread)); + + ip_vs_sync_state &= ~IP_VS_STATE_BACKUP; + kthread_stop(sync_backup_thread); + sync_backup_thread = NULL; + } else { + return -EINVAL; + } + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return 0; +} diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c new file mode 100644 index 0000000..8c596e7 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -0,0 +1,128 @@ +/* + * IPVS: Weighted Least-Connection Scheduling module + * + * Authors: Wensong Zhang + * Peter Kese + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest + * Wensong Zhang : changed to use the inactconns in scheduling + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_wlc_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#include +#include + +#include + + +static inline unsigned int +ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We think the overhead of processing active connections is 256 + * times higher than that of inactive connections in average. (This + * 256 times might not be accurate, we will change it later) We + * use the following formula to estimate the overhead now: + * dest->activeconns*256 + dest->inactconns + */ + return (atomic_read(&dest->activeconns) << 8) + + atomic_read(&dest->inactconns); +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least; + unsigned int loh, doh; + + IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_wlc_dest_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + doh = ip_vs_wlc_dest_overhead(dest); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "WLC: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_wlc_scheduler = +{ + .name = "wlc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif + .schedule = ip_vs_wlc_schedule, +}; + + +static int __init ip_vs_wlc_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_wlc_scheduler); +} + +static void __exit ip_vs_wlc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); +} + +module_init(ip_vs_wlc_init); +module_exit(ip_vs_wlc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c new file mode 100644 index 0000000..7ea92fe --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -0,0 +1,237 @@ +/* + * IPVS: Weighted Round-Robin Scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_wrr_update_svc + * Julian Anastasov : fixed the bug of returning destination + * with weight 0 when all weights are zero + * + */ + +#include +#include +#include + +#include + +/* + * current destination pointer for weighted round-robin scheduling + */ +struct ip_vs_wrr_mark { + struct list_head *cl; /* current list head */ + int cw; /* current weight */ + int mw; /* maximum weight */ + int di; /* decreasing interval */ +}; + + +/* + * Get the gcd of server weights + */ +static int gcd(int a, int b) +{ + int c; + + while ((c = a % b)) { + a = b; + b = c; + } + return b; +} + +static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + int weight; + int g = 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + weight = atomic_read(&dest->weight); + if (weight > 0) { + if (g > 0) + g = gcd(weight, g); + else + g = weight; + } + } + return g ? g : 1; +} + + +/* + * Get the maximum weight of the service destinations. + */ +static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + int weight = 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (atomic_read(&dest->weight) > weight) + weight = atomic_read(&dest->weight); + } + + return weight; +} + + +static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_wrr_mark *mark; + + /* + * Allocate the mark variable for WRR scheduling + */ + mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); + if (mark == NULL) { + IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n"); + return -ENOMEM; + } + mark->cl = &svc->destinations; + mark->cw = 0; + mark->mw = ip_vs_wrr_max_weight(svc); + mark->di = ip_vs_wrr_gcd_weight(svc); + svc->sched_data = mark; + + return 0; +} + + +static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) +{ + /* + * Release the mark variable + */ + kfree(svc->sched_data); + + return 0; +} + + +static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_wrr_mark *mark = svc->sched_data; + + mark->cl = &svc->destinations; + mark->mw = ip_vs_wrr_max_weight(svc); + mark->di = ip_vs_wrr_gcd_weight(svc); + if (mark->cw > mark->mw) + mark->cw = 0; + return 0; +} + + +/* + * Weighted Round-Robin Scheduling + */ +static struct ip_vs_dest * +ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_wrr_mark *mark = svc->sched_data; + struct list_head *p; + + IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n"); + + /* + * This loop will always terminate, because mark->cw in (0, max_weight] + * and at least one server has its weight equal to max_weight. + */ + write_lock(&svc->sched_lock); + p = mark->cl; + while (1) { + if (mark->cl == &svc->destinations) { + /* it is at the head of the destination list */ + + if (mark->cl == mark->cl->next) { + /* no dest entry */ + dest = NULL; + goto out; + } + + mark->cl = svc->destinations.next; + mark->cw -= mark->di; + if (mark->cw <= 0) { + mark->cw = mark->mw; + /* + * Still zero, which means no available servers. + */ + if (mark->cw == 0) { + mark->cl = &svc->destinations; + IP_VS_ERR_RL("ip_vs_wrr_schedule(): " + "no available servers\n"); + dest = NULL; + goto out; + } + } + } else + mark->cl = mark->cl->next; + + if (mark->cl != &svc->destinations) { + /* not at the head of the list */ + dest = list_entry(mark->cl, struct ip_vs_dest, n_list); + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) >= mark->cw) { + /* got it */ + break; + } + } + + if (mark->cl == p && mark->cw == mark->di) { + /* back to the start, and no dest is found. + It is only possible when all dests are OVERLOADED */ + dest = NULL; + goto out; + } + } + + IP_VS_DBG_BUF(6, "WRR: server %s:%u " + "activeconns %d refcnt %d weight %d\n", + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), + atomic_read(&dest->weight)); + + out: + write_unlock(&svc->sched_lock); + return dest; +} + + +static struct ip_vs_scheduler ip_vs_wrr_scheduler = { + .name = "wrr", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif + .init_service = ip_vs_wrr_init_svc, + .done_service = ip_vs_wrr_done_svc, + .update_service = ip_vs_wrr_update_svc, + .schedule = ip_vs_wrr_schedule, +}; + +static int __init ip_vs_wrr_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; +} + +static void __exit ip_vs_wrr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); +} + +module_init(ip_vs_wrr_init); +module_exit(ip_vs_wrr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c new file mode 100644 index 0000000..02ddc2b --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -0,0 +1,1004 @@ +/* + * ip_vs_xmit.c: various packet transmitters for IPVS + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include +#include /* for tcphdr */ +#include +#include /* for csum_tcpudp_magic */ +#include +#include /* for icmp_send */ +#include /* for ip_route_output */ +#include +#include +#include +#include +#include + +#include + + +/* + * Destination cache to speed up outgoing route lookup + */ +static inline void +__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) +{ + struct dst_entry *old_dst; + + old_dst = dest->dst_cache; + dest->dst_cache = dst; + dest->dst_rtos = rtos; + dst_release(old_dst); +} + +static inline struct dst_entry * +__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) +{ + struct dst_entry *dst = dest->dst_cache; + + if (!dst) + return NULL; + if ((dst->obsolete + || (dest->af == AF_INET && rtos != dest->dst_rtos)) && + dst->ops->check(dst, cookie) == NULL) { + dest->dst_cache = NULL; + dst_release(dst); + return NULL; + } + dst_hold(dst); + return dst; +} + +static struct rtable * +__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) +{ + struct rtable *rt; /* Route to the other host */ + struct ip_vs_dest *dest = cp->dest; + + if (dest) { + spin_lock(&dest->dst_lock); + if (!(rt = (struct rtable *) + __ip_vs_dst_check(dest, rtos, 0))) { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = dest->addr.ip, + .saddr = 0, + .tos = rtos, } }, + }; + + if (ip_route_output_key(&init_net, &rt, &fl)) { + spin_unlock(&dest->dst_lock); + IP_VS_DBG_RL("ip_route_output error, " + "dest: %u.%u.%u.%u\n", + NIPQUAD(dest->addr.ip)); + return NULL; + } + __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); + IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n", + NIPQUAD(dest->addr.ip), + atomic_read(&rt->u.dst.__refcnt), rtos); + } + spin_unlock(&dest->dst_lock); + } else { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = cp->daddr.ip, + .saddr = 0, + .tos = rtos, } }, + }; + + if (ip_route_output_key(&init_net, &rt, &fl)) { + IP_VS_DBG_RL("ip_route_output error, dest: " + "%u.%u.%u.%u\n", NIPQUAD(cp->daddr.ip)); + return NULL; + } + } + + return rt; +} + +#ifdef CONFIG_IP_VS_IPV6 +static struct rt6_info * +__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp) +{ + struct rt6_info *rt; /* Route to the other host */ + struct ip_vs_dest *dest = cp->dest; + + if (dest) { + spin_lock(&dest->dst_lock); + rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0); + if (!rt) { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = dest->addr.in6, + .saddr = { + .s6_addr32 = + { 0, 0, 0, 0 }, + }, + }, + }, + }; + + rt = (struct rt6_info *)ip6_route_output(&init_net, + NULL, &fl); + if (!rt) { + spin_unlock(&dest->dst_lock); + IP_VS_DBG_RL("ip6_route_output error, " + "dest: " NIP6_FMT "\n", + NIP6(dest->addr.in6)); + return NULL; + } + __ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst)); + IP_VS_DBG(10, "new dst " NIP6_FMT ", refcnt=%d\n", + NIP6(dest->addr.in6), + atomic_read(&rt->u.dst.__refcnt)); + } + spin_unlock(&dest->dst_lock); + } else { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = cp->daddr.in6, + .saddr = { + .s6_addr32 = { 0, 0, 0, 0 }, + }, + }, + }, + }; + + rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); + if (!rt) { + IP_VS_DBG_RL("ip6_route_output error, dest: " + NIP6_FMT "\n", NIP6(cp->daddr.in6)); + return NULL; + } + } + + return rt; +} +#endif + + +/* + * Release dest->dst_cache before a dest is removed + */ +void +ip_vs_dst_reset(struct ip_vs_dest *dest) +{ + struct dst_entry *old_dst; + + old_dst = dest->dst_cache; + dest->dst_cache = NULL; + dst_release(old_dst); +} + +#define IP_VS_XMIT(pf, skb, rt) \ +do { \ + (skb)->ipvs_property = 1; \ + skb_forward_csum(skb); \ + NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ + (rt)->u.dst.dev, dst_output); \ +} while (0) + + +/* + * NULL transmitter (do nothing except return NF_ACCEPT) + */ +int +ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + /* we do not touch skb and do not need pskb ptr */ + return NF_ACCEPT; +} + + +/* + * Bypass transmitter + * Let packets bypass the destination when the destination is not + * available, it may be only used in transparent cache cluster. + */ +int +ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph = ip_hdr(skb); + u8 tos = iph->tos; + int mtu; + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = iph->daddr, + .saddr = 0, + .tos = RT_TOS(tos), } }, + }; + + EnterFunction(10); + + if (ip_route_output_key(&init_net, &rt, &fl)) { + IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, " + "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr)); + goto tx_error_icmp; + } + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { + ip_rt_put(rt); + return NF_STOLEN; + } + ip_send_check(ip_hdr(skb)); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + struct ipv6hdr *iph = ipv6_hdr(skb); + int mtu; + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = iph->daddr, + .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, + }; + + EnterFunction(10); + + rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); + if (!rt) { + IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): ip6_route_output error, " + "dest: " NIP6_FMT "\n", NIP6(iph->daddr)); + goto tx_error_icmp; + } + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if (skb->len > mtu) { + dst_release(&rt->u.dst); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): frag needed\n"); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(skb == NULL)) { + dst_release(&rt->u.dst); + return NF_STOLEN; + } + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET6, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} +#endif + +/* + * NAT transmitter (only for outside-to-inside nat forwarding) + * Not used for related ICMP + */ +int +ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + int mtu; + struct iphdr *iph = ip_hdr(skb); + + EnterFunction(10); + + /* check if it is a connection of no-client-port */ + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { + __be16 _pt, *p; + p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); + if (p == NULL) + goto tx_error; + ip_vs_conn_fill_cport(cp, *p); + IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); + } + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, sizeof(struct iphdr))) + goto tx_error_put; + + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + goto tx_error_put; + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* mangle the packet */ + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) + goto tx_error; + ip_hdr(skb)->daddr = cp->daddr.ip; + ip_send_check(ip_hdr(skb)); + + IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + + /* FIXME: when application helper enlarges the packet and the length + is larger than the MTU of outgoing device, there will be still + MTU problem. */ + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + LeaveFunction(10); + kfree_skb(skb); + return NF_STOLEN; + tx_error_put: + ip_rt_put(rt); + goto tx_error; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + int mtu; + + EnterFunction(10); + + /* check if it is a connection of no-client-port */ + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { + __be16 _pt, *p; + p = skb_header_pointer(skb, sizeof(struct ipv6hdr), + sizeof(_pt), &_pt); + if (p == NULL) + goto tx_error; + ip_vs_conn_fill_cport(cp, *p); + IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); + } + + rt = __ip_vs_get_out_rt_v6(cp); + if (!rt) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if (skb->len > mtu) { + dst_release(&rt->u.dst); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "ip_vs_nat_xmit_v6(): frag needed for"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) + goto tx_error_put; + + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + goto tx_error_put; + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* mangle the packet */ + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) + goto tx_error; + ipv6_hdr(skb)->daddr = cp->daddr.in6; + + IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + + /* FIXME: when application helper enlarges the packet and the length + is larger than the MTU of outgoing device, there will be still + MTU problem. */ + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET6, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + LeaveFunction(10); + kfree_skb(skb); + return NF_STOLEN; +tx_error_put: + dst_release(&rt->u.dst); + goto tx_error; +} +#endif + + +/* + * IP Tunneling transmitter + * + * This function encapsulates the packet in a new IP packet, its + * destination will be set to cp->daddr. Most code of this function + * is taken from ipip.c. + * + * It is used in VS/TUN cluster. The load balancer selects a real + * server from a cluster based on a scheduling algorithm, + * encapsulates the request packet and forwards it to the selected + * server. For example, all real servers are configured with + * "ifconfig tunl0 up". When the server receives + * the encapsulated packet, it will decapsulate the packet, processe + * the request and return the response packets directly to the client + * without passing the load balancer. This can greatly increase the + * scalability of virtual server. + * + * Used for ANY protocol + */ +int +ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct iphdr *old_iph = ip_hdr(skb); + u8 tos = old_iph->tos; + __be16 df = old_iph->frag_off; + sk_buff_data_t old_transport_header = skb->transport_header; + struct iphdr *iph; /* Our new IP header */ + unsigned int max_headroom; /* The extra header space needed */ + int mtu; + + EnterFunction(10); + + if (skb->protocol != htons(ETH_P_IP)) { + IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " + "ETH_P_IP: %d, skb protocol: %d\n", + htons(ETH_P_IP), skb->protocol); + goto tx_error; + } + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) + goto tx_error_icmp; + + tdev = rt->u.dst.dev; + + mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); + if (mtu < 68) { + ip_rt_put(rt); + IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); + goto tx_error; + } + if (skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); + + df |= (old_iph->frag_off & htons(IP_DF)); + + if ((old_iph->frag_off & htons(IP_DF)) + && mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n"); + goto tx_error; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); + + if (skb_headroom(skb) < max_headroom + || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = + skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + kfree_skb(skb); + IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n"); + return NF_STOLEN; + } + kfree_skb(skb); + skb = new_skb; + old_iph = ip_hdr(skb); + } + + skb->transport_header = old_transport_header; + + /* fix old IP header checksum */ + ip_send_check(old_iph); + + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. + */ + iph = ip_hdr(skb); + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + iph->frag_off = df; + iph->protocol = IPPROTO_IPIP; + iph->tos = tos; + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + iph->ttl = old_iph->ttl; + ip_select_ident(iph, &rt->u.dst, NULL); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + ip_local_out(skb); + + LeaveFunction(10); + + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct ipv6hdr *old_iph = ipv6_hdr(skb); + sk_buff_data_t old_transport_header = skb->transport_header; + struct ipv6hdr *iph; /* Our new IP header */ + unsigned int max_headroom; /* The extra header space needed */ + int mtu; + + EnterFunction(10); + + if (skb->protocol != htons(ETH_P_IPV6)) { + IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): protocol error, " + "ETH_P_IPV6: %d, skb protocol: %d\n", + htons(ETH_P_IPV6), skb->protocol); + goto tx_error; + } + + rt = __ip_vs_get_out_rt_v6(cp); + if (!rt) + goto tx_error_icmp; + + tdev = rt->u.dst.dev; + + mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr); + /* TODO IPv6: do we need this check in IPv6? */ + if (mtu < 1280) { + dst_release(&rt->u.dst); + IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): mtu less than 1280\n"); + goto tx_error; + } + if (skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); + + if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + dst_release(&rt->u.dst); + IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): frag needed\n"); + goto tx_error; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); + + if (skb_headroom(skb) < max_headroom + || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = + skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + dst_release(&rt->u.dst); + kfree_skb(skb); + IP_VS_ERR_RL("ip_vs_tunnel_xmit_v6(): no memory\n"); + return NF_STOLEN; + } + kfree_skb(skb); + skb = new_skb; + old_iph = ipv6_hdr(skb); + } + + skb->transport_header = old_transport_header; + + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. + */ + iph = ipv6_hdr(skb); + iph->version = 6; + iph->nexthdr = IPPROTO_IPV6; + iph->payload_len = old_iph->payload_len + sizeof(old_iph); + iph->priority = old_iph->priority; + memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); + iph->daddr = rt->rt6i_dst.addr; + iph->saddr = cp->vaddr.in6; /* rt->rt6i_src.addr; */ + iph->hop_limit = old_iph->hop_limit; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + ip6_local_out(skb); + + LeaveFunction(10); + + return NF_STOLEN; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} +#endif + + +/* + * Direct Routing transmitter + * Used for ANY protocol + */ +int +ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph = ip_hdr(skb); + int mtu; + + EnterFunction(10); + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n"); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { + ip_rt_put(rt); + return NF_STOLEN; + } + ip_send_check(ip_hdr(skb)); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + int mtu; + + EnterFunction(10); + + rt = __ip_vs_get_out_rt_v6(cp); + if (!rt) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if (skb->len > mtu) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + dst_release(&rt->u.dst); + IP_VS_DBG_RL("ip_vs_dr_xmit_v6(): frag needed\n"); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(skb == NULL)) { + dst_release(&rt->u.dst); + return NF_STOLEN; + } + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET6, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} +#endif + + +/* + * ICMP packet transmitter + * called by the ip_vs_in_icmp + */ +int +ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, int offset) +{ + struct rtable *rt; /* Route to the other host */ + int mtu; + int rc; + + EnterFunction(10); + + /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be + forwarded directly here, because there is no need to + translate address/port back */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + if (cp->packet_xmit) + rc = cp->packet_xmit(skb, cp, pp); + else + rc = NF_ACCEPT; + /* do not touch skb anymore */ + atomic_inc(&cp->in_pkts); + goto out; + } + + /* + * mangle and send the packet here (only for VS/NAT) + */ + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos)))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, offset)) + goto tx_error_put; + + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + goto tx_error_put; + + /* drop the old route when skb is not shared */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + ip_vs_nat_icmp(skb, pp, cp, 0); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET, skb, rt); + + rc = NF_STOLEN; + goto out; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + dev_kfree_skb(skb); + rc = NF_STOLEN; + out: + LeaveFunction(10); + return rc; + tx_error_put: + ip_rt_put(rt); + goto tx_error; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, int offset) +{ + struct rt6_info *rt; /* Route to the other host */ + int mtu; + int rc; + + EnterFunction(10); + + /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be + forwarded directly here, because there is no need to + translate address/port back */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + if (cp->packet_xmit) + rc = cp->packet_xmit(skb, cp, pp); + else + rc = NF_ACCEPT; + /* do not touch skb anymore */ + atomic_inc(&cp->in_pkts); + goto out; + } + + /* + * mangle and send the packet here (only for VS/NAT) + */ + + rt = __ip_vs_get_out_rt_v6(cp); + if (!rt) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if (skb->len > mtu) { + dst_release(&rt->u.dst); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, offset)) + goto tx_error_put; + + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + goto tx_error_put; + + /* drop the old route when skb is not shared */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + ip_vs_nat_icmp_v6(skb, pp, cp, 0); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET6, skb, rt); + + rc = NF_STOLEN; + goto out; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + dev_kfree_skb(skb); + rc = NF_STOLEN; +out: + LeaveFunction(10); + return rc; +tx_error_put: + dst_release(&rt->u.dst); + goto tx_error; +} +#endif -- cgit v0.10.2