From 55820ee2f8c767a2833b21bd365e5753f50bd8ce Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 5 Jul 2005 14:08:10 -0700 Subject: [NET]: Fix signedness issues in net/core/filter.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is the code to load packet data into a register: k = fentry->k; if (k < 0) { ... } else { u32 _tmp, *p; p = skb_header_pointer(skb, k, 4, &_tmp); if (p != NULL) { A = ntohl(*p); continue; } } skb_header_pointer checks if the requested data is within the linear area: int hlen = skb_headlen(skb); if (offset + len <= hlen) return skb->data + offset; When offset is within [INT_MAX-len+1..INT_MAX] the addition will result in a negative number which is <= hlen. I couldn't trigger a crash on my AMD64 with 2GB of memory, but a coworker tried on his x86 machine and it crashed immediately. This patch fixes the check in skb_header_pointer to handle large positive offsets similar to skb_copy_bits. Invalid data can still be accessed using negative offsets (also similar to skb_copy_bits), anyone using negative offsets needs to verify them himself. Thanks to Thomas Vögtle for verifying the problem by crashing his machine and providing me with an Oops. Signed-off-by: Patrick McHardy Acked-by: Herbert Xu Signed-off-by: David S. Miller diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 416a2e4..fbcb186 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1211,7 +1211,7 @@ static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, { int hlen = skb_headlen(skb); - if (offset + len <= hlen) + if (hlen - offset >= len) return skb->data + offset; if (skb_copy_bits(skb, offset, buffer, len) < 0) -- cgit v0.10.2 From 6935d46c2da64aa032a557374c95336e265cd7ef Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 5 Jul 2005 14:08:57 -0700 Subject: [NET]: Remove redundant code in net/core/filter.c skb_header_pointer handles linear and non-linear data, no need to handle linear data again. Signed-off-by: Patrick McHardy Acked-by: Herbert Xu Signed-off-by: David S. Miller diff --git a/net/core/filter.c b/net/core/filter.c index f3b8820..e1267b4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -168,10 +168,6 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) case BPF_LD|BPF_W|BPF_ABS: k = fentry->k; load_w: - if (k >= 0 && (unsigned int)(k+sizeof(u32)) <= len) { - A = ntohl(*(u32*)&data[k]); - continue; - } if (k < 0) { u8 *ptr; @@ -194,10 +190,6 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) case BPF_LD|BPF_H|BPF_ABS: k = fentry->k; load_h: - if (k >= 0 && (unsigned int)(k + sizeof(u16)) <= len) { - A = ntohs(*(u16*)&data[k]); - continue; - } if (k < 0) { u8 *ptr; @@ -220,10 +212,6 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) case BPF_LD|BPF_B|BPF_ABS: k = fentry->k; load_b: - if (k >= 0 && (unsigned int)k < len) { - A = data[k]; - continue; - } if (k < 0) { u8 *ptr; -- cgit v0.10.2 From 0b05b2a49e430220876f8faa7e4778dc7497033c Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 5 Jul 2005 14:10:21 -0700 Subject: [NET]: Consolidate common code in net/core/filter.c Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller diff --git a/net/core/filter.c b/net/core/filter.c index e1267b4..3923428 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -36,7 +36,7 @@ #include /* No hurry in this branch */ -static u8 *load_pointer(struct sk_buff *skb, int k) +static void *__load_pointer(struct sk_buff *skb, int k) { u8 *ptr = NULL; @@ -50,6 +50,18 @@ static u8 *load_pointer(struct sk_buff *skb, int k) return NULL; } +static inline void *load_pointer(struct sk_buff *skb, int k, + unsigned int size, void *buffer) +{ + if (k >= 0) + return skb_header_pointer(skb, k, size, buffer); + else { + if (k >= SKF_AD_OFF) + return NULL; + return __load_pointer(skb, k); + } +} + /** * sk_run_filter - run a filter on a socket * @skb: buffer to run the filter on @@ -64,15 +76,16 @@ static u8 *load_pointer(struct sk_buff *skb, int k) int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) { - unsigned char *data = skb->data; /* len is UNSIGNED. Byte wide insns relies only on implicit type casts to prevent reading arbitrary memory locations. */ unsigned int len = skb->len-skb->data_len; struct sock_filter *fentry; /* We walk down these */ + void *ptr; u32 A = 0; /* Accumulator */ u32 X = 0; /* Index Register */ u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ + u32 tmp; int k; int pc; @@ -168,67 +181,28 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) case BPF_LD|BPF_W|BPF_ABS: k = fentry->k; load_w: - if (k < 0) { - u8 *ptr; - - if (k >= SKF_AD_OFF) - break; - ptr = load_pointer(skb, k); - if (ptr) { - A = ntohl(*(u32*)ptr); - continue; - } - } else { - u32 _tmp, *p; - p = skb_header_pointer(skb, k, 4, &_tmp); - if (p != NULL) { - A = ntohl(*p); - continue; - } + ptr = load_pointer(skb, k, 4, &tmp); + if (ptr != NULL) { + A = ntohl(*(u32 *)ptr); + continue; } return 0; case BPF_LD|BPF_H|BPF_ABS: k = fentry->k; load_h: - if (k < 0) { - u8 *ptr; - - if (k >= SKF_AD_OFF) - break; - ptr = load_pointer(skb, k); - if (ptr) { - A = ntohs(*(u16*)ptr); - continue; - } - } else { - u16 _tmp, *p; - p = skb_header_pointer(skb, k, 2, &_tmp); - if (p != NULL) { - A = ntohs(*p); - continue; - } + ptr = load_pointer(skb, k, 2, &tmp); + if (ptr != NULL) { + A = ntohs(*(u16 *)ptr); + continue; } return 0; case BPF_LD|BPF_B|BPF_ABS: k = fentry->k; load_b: - if (k < 0) { - u8 *ptr; - - if (k >= SKF_AD_OFF) - break; - ptr = load_pointer(skb, k); - if (ptr) { - A = *ptr; - continue; - } - } else { - u8 _tmp, *p; - p = skb_header_pointer(skb, k, 1, &_tmp); - if (p != NULL) { - A = *p; - continue; - } + ptr = load_pointer(skb, k, 1, &tmp); + if (ptr != NULL) { + A = *(u8 *)ptr; + continue; } return 0; case BPF_LD|BPF_W|BPF_LEN: @@ -247,10 +221,12 @@ load_b: k = X + fentry->k; goto load_b; case BPF_LDX|BPF_B|BPF_MSH: - if (fentry->k >= len) - return 0; - X = (data[fentry->k] & 0xf) << 2; - continue; + ptr = load_pointer(skb, fentry->k, 1, &tmp); + if (ptr != NULL) { + X = (*(u8 *)ptr & 0xf) << 2; + continue; + } + return 0; case BPF_LD|BPF_IMM: A = fentry->k; continue; -- cgit v0.10.2 From 3154e540e374bbfd62693d95bc8ed51da95efe75 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 5 Jul 2005 14:10:40 -0700 Subject: [NET]: net/core/filter.c: make len cover the entire packet As suggested by Herbert Xu: Since we don't require anything to be in the linear packet range anymore make len cover the entire packet. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller diff --git a/net/core/filter.c b/net/core/filter.c index 3923428..cd91a24 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -76,10 +76,6 @@ static inline void *load_pointer(struct sk_buff *skb, int k, int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) { - /* len is UNSIGNED. Byte wide insns relies only on implicit - type casts to prevent reading arbitrary memory locations. - */ - unsigned int len = skb->len-skb->data_len; struct sock_filter *fentry; /* We walk down these */ void *ptr; u32 A = 0; /* Accumulator */ @@ -206,10 +202,10 @@ load_b: } return 0; case BPF_LD|BPF_W|BPF_LEN: - A = len; + A = skb->len; continue; case BPF_LDX|BPF_W|BPF_LEN: - X = len; + X = skb->len; continue; case BPF_LD|BPF_W|BPF_IND: k = X + fentry->k; -- cgit v0.10.2 From e176fe8954a5239c24afe79b1001ba3c29511963 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 5 Jul 2005 14:12:44 -0700 Subject: [NET]: Remove unused security member in sk_buff Signed-off-by: Thomas Graf Signed-off-by: David S. Miller diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fbcb186..1e6290f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -183,7 +183,6 @@ struct skb_shared_info { * @priority: Packet queueing priority * @users: User count - see {datagram,tcp}.c * @protocol: Packet protocol from driver - * @security: Security level of packet * @truesize: Buffer size * @head: Head of buffer * @data: Data head pointer @@ -255,8 +254,7 @@ struct sk_buff { pkt_type, ip_summed; __u32 priority; - unsigned short protocol, - security; + unsigned short protocol; void (*destructor)(struct sk_buff *skb); #ifdef CONFIG_NETFILTER diff --git a/include/linux/tc_ematch/tc_em_meta.h b/include/linux/tc_ematch/tc_em_meta.h index a6b2cc5..bcb762d 100644 --- a/include/linux/tc_ematch/tc_em_meta.h +++ b/include/linux/tc_ematch/tc_em_meta.h @@ -45,7 +45,7 @@ enum TCF_META_ID_REALDEV, TCF_META_ID_PRIORITY, TCF_META_ID_PROTOCOL, - TCF_META_ID_SECURITY, + TCF_META_ID_SECURITY, /* obsolete */ TCF_META_ID_PKTTYPE, TCF_META_ID_PKTLEN, TCF_META_ID_DATALEN, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bb73b21..733deee 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -357,7 +357,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) C(ip_summed); C(priority); C(protocol); - C(security); n->destructor = NULL; #ifdef CONFIG_NETFILTER C(nfmark); @@ -422,7 +421,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->pkt_type = old->pkt_type; new->stamp = old->stamp; new->destructor = NULL; - new->security = old->security; #ifdef CONFIG_NETFILTER new->nfmark = old->nfmark; new->nfcache = old->nfcache; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6ce5c32..1bfa49e 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -389,7 +389,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; - to->security = from->security; dst_release(to->dst); to->dst = dst_clone(from->dst); to->dev = from->dev; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 06e7cdae..1f2c2f9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -465,7 +465,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; - to->security = from->security; dst_release(to->dst); to->dst = dst_clone(from->dst); 
to->dev = from->dev; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 48bb23c..53d98f8 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -205,11 +205,6 @@ META_COLLECTOR(int_protocol) dst->value = skb->protocol; } -META_COLLECTOR(int_security) -{ - dst->value = skb->security; -} - META_COLLECTOR(int_pkttype) { dst->value = skb->pkt_type; @@ -524,7 +519,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { [META_ID(REALDEV)] = META_FUNC(int_realdev), [META_ID(PRIORITY)] = META_FUNC(int_priority), [META_ID(PROTOCOL)] = META_FUNC(int_protocol), - [META_ID(SECURITY)] = META_FUNC(int_security), [META_ID(PKTTYPE)] = META_FUNC(int_pkttype), [META_ID(PKTLEN)] = META_FUNC(int_pktlen), [META_ID(DATALEN)] = META_FUNC(int_datalen), -- cgit v0.10.2 From 1cbb3380ef683f742876f48e3739b3df4ea9e168 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 5 Jul 2005 14:13:41 -0700 Subject: [NET]: Reduce size of sk_buff by 4 bytes Reduce local_df to a bit field and ip_summed to a 2 bits field thus saving 13 bits. Move bit fields, packet type, and protocol into the spare area between the priority and the destructor. Saves 4 bytes on both, 32bit and 64bit architectures. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1e6290f..14b9504 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -248,17 +248,18 @@ struct sk_buff { data_len, mac_len, csum; - unsigned char local_df, - cloned:1, - nohdr:1, - pkt_type, - ip_summed; __u32 priority; - unsigned short protocol; + __u8 local_df:1, + cloned:1, + ip_summed:2, + nohdr:1; + /* 3 bits spare */ + __u8 pkt_type; + __u16 protocol; void (*destructor)(struct sk_buff *skb); #ifdef CONFIG_NETFILTER - unsigned long nfmark; + unsigned long nfmark; __u32 nfcache; __u32 nfctinfo; struct nf_conntrack *nfct; -- cgit v0.10.2 From e41a33e6ec20a0a6ac762629149e36cab5d4213f Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 5 Jul 2005 14:14:30 -0700 Subject: [PKT_SCHED]: Move sch_generic.c prototypes to correct header file Signed-off-by: Thomas Graf Signed-off-by: David S. 
Miller diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index fcb05a3..2f494a2 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -207,8 +207,6 @@ psched_tod_diff(int delta_sec, int bound) #endif /* !CONFIG_NET_SCH_CLK_GETTIMEOFDAY */ -extern struct Qdisc noop_qdisc; -extern struct Qdisc_ops noop_qdisc_ops; extern struct Qdisc_ops pfifo_qdisc_ops; extern struct Qdisc_ops bfifo_qdisc_ops; @@ -216,14 +214,6 @@ extern int register_qdisc(struct Qdisc_ops *qops); extern int unregister_qdisc(struct Qdisc_ops *qops); extern struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle); extern struct Qdisc *qdisc_lookup_class(struct net_device *dev, u32 handle); -extern void dev_init_scheduler(struct net_device *dev); -extern void dev_shutdown(struct net_device *dev); -extern void dev_activate(struct net_device *dev); -extern void dev_deactivate(struct net_device *dev); -extern void qdisc_reset(struct Qdisc *qdisc); -extern void qdisc_destroy(struct Qdisc *qdisc); -extern struct Qdisc * qdisc_create_dflt(struct net_device *dev, - struct Qdisc_ops *ops); extern struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab); extern void qdisc_put_rtab(struct qdisc_rate_table *tab); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 7b97405..c76d34e 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -164,6 +164,18 @@ extern void qdisc_unlock_tree(struct net_device *dev); #define tcf_tree_lock(tp) qdisc_lock_tree((tp)->q->dev) #define tcf_tree_unlock(tp) qdisc_unlock_tree((tp)->q->dev) +extern struct Qdisc noop_qdisc; +extern struct Qdisc_ops noop_qdisc_ops; + +extern void dev_init_scheduler(struct net_device *dev); +extern void dev_shutdown(struct net_device *dev); +extern void dev_activate(struct net_device *dev); +extern void dev_deactivate(struct net_device *dev); +extern void qdisc_reset(struct Qdisc *qdisc); +extern void qdisc_destroy(struct Qdisc *qdisc); +extern struct Qdisc *qdisc_create_dflt(struct net_device *dev, + struct Qdisc_ops *ops); + static inline void tcf_destroy(struct tcf_proto *tp) { -- cgit v0.10.2 From 3d54b82fdf0ca79608f61448fb8ab92676487645 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 5 Jul 2005 14:15:09 -0700 Subject: [PKT_SCHED]: Cleanup qdisc creation and alignment macros Adds qdisc_alloc() to share code between qdisc_create() and qdisc_create_dflt(). Hides the qdisc alignment behind macros and makes use of them. Signed-off-by: Thomas Graf Signed-off-by: David S. 
Miller diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 2f494a2..6492e73 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -13,13 +13,12 @@ struct qdisc_walker extern rwlock_t qdisc_tree_lock; -#define QDISC_ALIGN 32 -#define QDISC_ALIGN_CONST (QDISC_ALIGN - 1) +#define QDISC_ALIGNTO 32 +#define QDISC_ALIGN(len) (((len) + QDISC_ALIGNTO-1) & ~(QDISC_ALIGNTO-1)) static inline void *qdisc_priv(struct Qdisc *q) { - return (char *)q + ((sizeof(struct Qdisc) + QDISC_ALIGN_CONST) - & ~QDISC_ALIGN_CONST); + return (char *) q + QDISC_ALIGN(sizeof(struct Qdisc)); } /* diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index c76d34e..7b6ec99 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -173,6 +173,7 @@ extern void dev_activate(struct net_device *dev); extern void dev_deactivate(struct net_device *dev); extern void qdisc_reset(struct Qdisc *qdisc); extern void qdisc_destroy(struct Qdisc *qdisc); +extern struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops); extern struct Qdisc *qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 05e6e0a..1ef482b 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -399,10 +399,8 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) { int err; struct rtattr *kind = tca[TCA_KIND-1]; - void *p = NULL; struct Qdisc *sch; struct Qdisc_ops *ops; - int size; ops = qdisc_lookup_ops(kind); #ifdef CONFIG_KMOD @@ -437,43 +435,23 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) if (ops == NULL) goto err_out; - /* ensure that the Qdisc and the private data are 32-byte aligned */ - size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); - size += ops->priv_size + QDISC_ALIGN_CONST; - - p = kmalloc(size, GFP_KERNEL); - err = -ENOBUFS; - if (!p) + sch = qdisc_alloc(dev, ops); + if (IS_ERR(sch)) { + err = PTR_ERR(sch); goto err_out2; - memset(p, 0, size); - sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) - & ~QDISC_ALIGN_CONST); - sch->padded = (char *)sch - (char *)p; - - INIT_LIST_HEAD(&sch->list); - skb_queue_head_init(&sch->q); + } - if (handle == TC_H_INGRESS) + if (handle == TC_H_INGRESS) { sch->flags |= TCQ_F_INGRESS; - - sch->ops = ops; - sch->enqueue = ops->enqueue; - sch->dequeue = ops->dequeue; - sch->dev = dev; - dev_hold(dev); - atomic_set(&sch->refcnt, 1); - sch->stats_lock = &dev->queue_lock; - if (handle == 0) { + handle = TC_H_MAKE(TC_H_INGRESS, 0); + } else if (handle == 0) { handle = qdisc_alloc_handle(dev); err = -ENOMEM; if (handle == 0) goto err_out3; } - if (handle == TC_H_INGRESS) - sch->handle =TC_H_MAKE(TC_H_INGRESS, 0); - else - sch->handle = handle; + sch->handle = handle; if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { qdisc_lock_tree(dev); @@ -489,12 +467,11 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) } err_out3: dev_put(dev); + kfree((char *) sch - sch->padded); err_out2: module_put(ops->owner); err_out: *errp = err; - if (p) - kfree(p); return NULL; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 7683b34..73e218e 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -395,24 +395,23 @@ static struct Qdisc_ops pfifo_fast_ops = { .owner = THIS_MODULE, }; -struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) +struct Qdisc *qdisc_alloc(struct net_device *dev, 
struct Qdisc_ops *ops) { void *p; struct Qdisc *sch; - int size; + unsigned int size; + int err = -ENOBUFS; /* ensure that the Qdisc and the private data are 32-byte aligned */ - size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); - size += ops->priv_size + QDISC_ALIGN_CONST; + size = QDISC_ALIGN(sizeof(*sch)); + size += ops->priv_size + (QDISC_ALIGNTO - 1); p = kmalloc(size, GFP_KERNEL); if (!p) - return NULL; + goto errout; memset(p, 0, size); - - sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) - & ~QDISC_ALIGN_CONST); - sch->padded = (char *)sch - (char *)p; + sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); + sch->padded = (char *) sch - (char *) p; INIT_LIST_HEAD(&sch->list); skb_queue_head_init(&sch->q); @@ -423,11 +422,24 @@ struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) dev_hold(dev); sch->stats_lock = &dev->queue_lock; atomic_set(&sch->refcnt, 1); + + return sch; +errout: + return ERR_PTR(-err); +} + +struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) +{ + struct Qdisc *sch; + + sch = qdisc_alloc(dev, ops); + if (IS_ERR(sch)) + goto errout; + if (!ops->init || ops->init(sch, NULL) == 0) return sch; - dev_put(dev); - kfree(p); +errout: return NULL; } @@ -591,6 +603,7 @@ EXPORT_SYMBOL(__netdev_watchdog_up); EXPORT_SYMBOL(noop_qdisc); EXPORT_SYMBOL(noop_qdisc_ops); EXPORT_SYMBOL(qdisc_create_dflt); +EXPORT_SYMBOL(qdisc_alloc); EXPORT_SYMBOL(qdisc_destroy); EXPORT_SYMBOL(qdisc_reset); EXPORT_SYMBOL(qdisc_restart); -- cgit v0.10.2 From 023e09a767a89bf1b8646307410852d93fd72f00 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 5 Jul 2005 14:15:53 -0700 Subject: [PKT_SCHED]: Report rate estimator configuration errors during qdisc allocation Current behaviour is to not report an error if a rate estimator is created together with a qdisc and the configuration of the rate estimator is bogus. This leads to unexpected behaviour because the user is not notified. New behaviour is to report the error and let the whole qdisc creation operation fail so the user is able to fix his mistake. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 1ef482b..b9a069a 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -454,15 +454,27 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) sch->handle = handle; if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) { + err = gen_new_estimator(&sch->bstats, &sch->rate_est, + sch->stats_lock, + tca[TCA_RATE-1]); + if (err) { + /* + * Any broken qdiscs that would require + * a ops->reset() here? The qdisc was never + * in action so it shouldn't be necessary. + */ + if (ops->destroy) + ops->destroy(sch); + goto err_out3; + } + } +#endif qdisc_lock_tree(dev); list_add_tail(&sch->list, &dev->qdisc_list); qdisc_unlock_tree(dev); -#ifdef CONFIG_NET_ESTIMATOR - if (tca[TCA_RATE-1]) - gen_new_estimator(&sch->bstats, &sch->rate_est, - sch->stats_lock, tca[TCA_RATE-1]); -#endif return sch; } err_out3: -- cgit v0.10.2 From a31488ca4b8476a8dd301b21388631df52d05c5a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 14:24:35 -0700 Subject: [SKGE]: Fix build on big-endian Missing PCI_REV_DESC define. Signed-off-by: David S. 
Miller diff --git a/drivers/net/skge.h b/drivers/net/skge.h index 14d0cc0..fced3d2 100644 --- a/drivers/net/skge.h +++ b/drivers/net/skge.h @@ -7,6 +7,7 @@ /* PCI config registers */ #define PCI_DEV_REG1 0x40 #define PCI_DEV_REG2 0x44 +#define PCI_REV_DESC 0x4 #define PCI_STATUS_ERROR_BITS (PCI_STATUS_DETECTED_PARITY | \ PCI_STATUS_SIG_SYSTEM_ERROR | \ -- cgit v0.10.2 From 30e224d76f34e041c30df66a4dcbeeb53556ea3f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 5 Jul 2005 14:40:10 -0700 Subject: [IPV4]: Fix crash in ip_rcv while booting related to netconsole Makes IPv4 ip_rcv registration happen last in af_inet. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 658e797..ef74683 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1009,6 +1009,15 @@ static int __init init_ipv4_mibs(void) static int ipv4_proc_init(void); extern void ipfrag_init(void); +/* + * IP protocol layer initialiser + */ + +static struct packet_type ip_packet_type = { + .type = __constant_htons(ETH_P_IP), + .func = ip_rcv, +}; + static int __init inet_init(void) { struct sk_buff *dummy_skb; @@ -1102,6 +1111,8 @@ static int __init inet_init(void) ipfrag_init(); + dev_add_pack(&ip_packet_type); + rc = 0; out: return rc; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 1bfa49e..9de83e6 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1328,23 +1328,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar ip_rt_put(rt); } -/* - * IP protocol layer initialiser - */ - -static struct packet_type ip_packet_type = { - .type = __constant_htons(ETH_P_IP), - .func = ip_rcv, -}; - -/* - * IP registers the packet type and then calls the subprotocol initialisers - */ - void __init ip_init(void) { - dev_add_pack(&ip_packet_type); - ip_rt_init(); inet_initpeers(); -- cgit v0.10.2 From e2ed4052aa662e7cfb22a1793b9d8158603be6d7 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 5 Jul 2005 14:41:20 -0700 Subject: [IPV6]: Makes IPv6 rcv registration happen last during initialisation. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 2b193e3..28d9bca 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -774,7 +774,6 @@ static int __init inet6_init(void) if (if6_proc_init()) goto proc_if6_fail; #endif - ipv6_packet_init(); ip6_route_init(); ip6_flowlabel_init(); err = addrconf_init(); @@ -791,6 +790,8 @@ static int __init inet6_init(void) /* Init v6 transport protocols. */ udpv6_init(); tcpv6_init(); + + ipv6_packet_init(); err = 0; out: return err; @@ -798,7 +799,6 @@ out: addrconf_fail: ip6_flowlabel_cleanup(); ip6_route_cleanup(); - ipv6_packet_cleanup(); #ifdef CONFIG_PROC_FS if6_proc_exit(); proc_if6_fail: -- cgit v0.10.2 From d244c892c8e23d6baba88af88f78f7201a224d39 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 5 Jul 2005 14:42:33 -0700 Subject: [TG3]: support for ethtool -C Add support for ethtool -C with verification of user parameters. Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 7e371b1..7f84dc8 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -5117,7 +5117,7 @@ static void tg3_set_bdinfo(struct tg3 *tp, u32 bdinfo_addr, } static void __tg3_set_rx_mode(struct net_device *); -static void tg3_set_coalesce(struct tg3 *tp, struct ethtool_coalesce *ec) +static void __tg3_set_coalesce(struct tg3 *tp, struct ethtool_coalesce *ec) { tw32(HOSTCC_RXCOL_TICKS, ec->rx_coalesce_usecs); tw32(HOSTCC_TXCOL_TICKS, ec->tx_coalesce_usecs); @@ -5460,7 +5460,7 @@ static int tg3_reset_hw(struct tg3 *tp) udelay(10); } - tg3_set_coalesce(tp, &tp->coal); + __tg3_set_coalesce(tp, &tp->coal); /* set status block DMA address */ tw32(HOSTCC_STATUS_BLK_HOST_ADDR + TG3_64BIT_REG_HIGH, @@ -7821,6 +7821,60 @@ static int tg3_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) return 0; } +static int tg3_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) +{ + struct tg3 *tp = netdev_priv(dev); + u32 max_rxcoal_tick_int = 0, max_txcoal_tick_int = 0; + u32 max_stat_coal_ticks = 0, min_stat_coal_ticks = 0; + + if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) { + max_rxcoal_tick_int = MAX_RXCOAL_TICK_INT; + max_txcoal_tick_int = MAX_TXCOAL_TICK_INT; + max_stat_coal_ticks = MAX_STAT_COAL_TICKS; + min_stat_coal_ticks = MIN_STAT_COAL_TICKS; + } + + if ((ec->rx_coalesce_usecs > MAX_RXCOL_TICKS) || + (ec->tx_coalesce_usecs > MAX_TXCOL_TICKS) || + (ec->rx_max_coalesced_frames > MAX_RXMAX_FRAMES) || + (ec->tx_max_coalesced_frames > MAX_TXMAX_FRAMES) || + (ec->rx_coalesce_usecs_irq > max_rxcoal_tick_int) || + (ec->tx_coalesce_usecs_irq > max_txcoal_tick_int) || + (ec->rx_max_coalesced_frames_irq > MAX_RXCOAL_MAXF_INT) || + (ec->tx_max_coalesced_frames_irq > MAX_TXCOAL_MAXF_INT) || + (ec->stats_block_coalesce_usecs > max_stat_coal_ticks) || + (ec->stats_block_coalesce_usecs < min_stat_coal_ticks)) + return -EINVAL; + + /* No rx interrupts will be generated if both are zero */ + if ((ec->rx_coalesce_usecs == 0) && + (ec->rx_max_coalesced_frames == 0)) + return -EINVAL; + + /* No tx interrupts will be generated if both are zero */ + if ((ec->tx_coalesce_usecs == 0) && + (ec->tx_max_coalesced_frames == 0)) + return -EINVAL; + + /* Only copy relevant parameters, ignore all others. 
*/ + tp->coal.rx_coalesce_usecs = ec->rx_coalesce_usecs; + tp->coal.tx_coalesce_usecs = ec->tx_coalesce_usecs; + tp->coal.rx_max_coalesced_frames = ec->rx_max_coalesced_frames; + tp->coal.tx_max_coalesced_frames = ec->tx_max_coalesced_frames; + tp->coal.rx_coalesce_usecs_irq = ec->rx_coalesce_usecs_irq; + tp->coal.tx_coalesce_usecs_irq = ec->tx_coalesce_usecs_irq; + tp->coal.rx_max_coalesced_frames_irq = ec->rx_max_coalesced_frames_irq; + tp->coal.tx_max_coalesced_frames_irq = ec->tx_max_coalesced_frames_irq; + tp->coal.stats_block_coalesce_usecs = ec->stats_block_coalesce_usecs; + + if (netif_running(dev)) { + tg3_full_lock(tp, 0); + __tg3_set_coalesce(tp, &tp->coal); + tg3_full_unlock(tp); + } + return 0; +} + static struct ethtool_ops tg3_ethtool_ops = { .get_settings = tg3_get_settings, .set_settings = tg3_set_settings, @@ -7856,6 +7910,7 @@ static struct ethtool_ops tg3_ethtool_ops = { .get_stats_count = tg3_get_stats_count, .get_ethtool_stats = tg3_get_ethtool_stats, .get_coalesce = tg3_get_coalesce, + .set_coalesce = tg3_set_coalesce, }; static void __devinit tg3_get_eeprom_size(struct tg3 *tp) @@ -9800,6 +9855,12 @@ static void __devinit tg3_init_coal(struct tg3 *tp) ec->tx_coalesce_usecs = LOW_TXCOL_TICKS_CLRTCKS; ec->tx_coalesce_usecs_irq = DEFAULT_TXCOAL_TICK_INT_CLRTCKS; } + + if (tp->tg3_flags2 & TG3_FLG2_5705_PLUS) { + ec->rx_coalesce_usecs_irq = 0; + ec->tx_coalesce_usecs_irq = 0; + ec->stats_block_coalesce_usecs = 0; + } } static int __devinit tg3_init_one(struct pci_dev *pdev, diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h index 99c5f96..70ad450 100644 --- a/drivers/net/tg3.h +++ b/drivers/net/tg3.h @@ -879,31 +879,41 @@ #define LOW_RXCOL_TICKS_CLRTCKS 0x00000014 #define DEFAULT_RXCOL_TICKS 0x00000048 #define HIGH_RXCOL_TICKS 0x00000096 +#define MAX_RXCOL_TICKS 0x000003ff #define HOSTCC_TXCOL_TICKS 0x00003c0c #define LOW_TXCOL_TICKS 0x00000096 #define LOW_TXCOL_TICKS_CLRTCKS 0x00000048 #define DEFAULT_TXCOL_TICKS 0x0000012c #define HIGH_TXCOL_TICKS 0x00000145 +#define MAX_TXCOL_TICKS 0x000003ff #define HOSTCC_RXMAX_FRAMES 0x00003c10 #define LOW_RXMAX_FRAMES 0x00000005 #define DEFAULT_RXMAX_FRAMES 0x00000008 #define HIGH_RXMAX_FRAMES 0x00000012 +#define MAX_RXMAX_FRAMES 0x000000ff #define HOSTCC_TXMAX_FRAMES 0x00003c14 #define LOW_TXMAX_FRAMES 0x00000035 #define DEFAULT_TXMAX_FRAMES 0x0000004b #define HIGH_TXMAX_FRAMES 0x00000052 +#define MAX_TXMAX_FRAMES 0x000000ff #define HOSTCC_RXCOAL_TICK_INT 0x00003c18 #define DEFAULT_RXCOAL_TICK_INT 0x00000019 #define DEFAULT_RXCOAL_TICK_INT_CLRTCKS 0x00000014 +#define MAX_RXCOAL_TICK_INT 0x000003ff #define HOSTCC_TXCOAL_TICK_INT 0x00003c1c #define DEFAULT_TXCOAL_TICK_INT 0x00000019 #define DEFAULT_TXCOAL_TICK_INT_CLRTCKS 0x00000014 +#define MAX_TXCOAL_TICK_INT 0x000003ff #define HOSTCC_RXCOAL_MAXF_INT 0x00003c20 #define DEFAULT_RXCOAL_MAXF_INT 0x00000005 +#define MAX_RXCOAL_MAXF_INT 0x000000ff #define HOSTCC_TXCOAL_MAXF_INT 0x00003c24 #define DEFAULT_TXCOAL_MAXF_INT 0x00000005 +#define MAX_TXCOAL_MAXF_INT 0x000000ff #define HOSTCC_STAT_COAL_TICKS 0x00003c28 #define DEFAULT_STAT_COAL_TICKS 0x000f4240 +#define MAX_STAT_COAL_TICKS 0xd693d400 +#define MIN_STAT_COAL_TICKS 0x00000064 /* 0x3c2c --> 0x3c30 unused */ #define HOSTCC_STATS_BLK_HOST_ADDR 0x00003c30 /* 64-bit */ #define HOSTCC_STATUS_BLK_HOST_ADDR 0x00003c38 /* 64-bit */ -- cgit v0.10.2 From 93e266f600f4048fe7a2e8803abb9f8baff84aa7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 14:43:19 -0700 Subject: [TG3]: Update driver version and reldate. 
Signed-off-by: David S. Miller diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 7f84dc8..5464068 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -66,8 +66,8 @@ #define DRV_MODULE_NAME "tg3" #define PFX DRV_MODULE_NAME ": " -#define DRV_MODULE_VERSION "3.32" -#define DRV_MODULE_RELDATE "June 24, 2005" +#define DRV_MODULE_VERSION "3.33" +#define DRV_MODULE_RELDATE "July 5, 2005" #define TG3_DEF_MAC_MODE 0 #define TG3_DEF_RX_MODE 0 -- cgit v0.10.2 From f0e36f8cee8101604378085171c980d9cc71d779 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 5 Jul 2005 14:44:55 -0700 Subject: [IPV4]: Handle large allocations in fib_trie Inflating a node a couple of times makes it exceed the 128k kmalloc limit. Use __get_free_pages for allocations > PAGE_SIZE, as in fib_hash. Signed-off-by: Patrick McHardy Acked-by: Robert Olsson Signed-off-by: David S. Miller diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index b56e88e..9038b91 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -358,11 +358,32 @@ static inline void free_leaf_info(struct leaf_info *li) kfree(li); } +static struct tnode *tnode_alloc(unsigned int size) +{ + if (size <= PAGE_SIZE) { + return kmalloc(size, GFP_KERNEL); + } else { + return (struct tnode *) + __get_free_pages(GFP_KERNEL, get_order(size)); + } +} + +static void __tnode_free(struct tnode *tn) +{ + unsigned int size = sizeof(struct tnode) + + (1<bits) * sizeof(struct node *); + + if (size <= PAGE_SIZE) + kfree(tn); + else + free_pages((unsigned long)tn, get_order(size)); +} + static struct tnode* tnode_new(t_key key, int pos, int bits) { int nchildren = 1< 0 ) printk("FT %p \n", tn); } -- cgit v0.10.2 From 22c047ccbc68fa8f3fa57f0e8f906479a062c426 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Jul 2005 14:55:24 -0700 Subject: [NET]: Hashed spinlocks in net/ipv4/route.c - Locking abstraction - Spinlocks moved out of rt hash table : Less memory (50%) used by rt hash table. it's a win even on UP. - Sizing of spinlocks table depends on NR_CPUS Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 12a1cf3..daf82f8 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -54,6 +54,7 @@ * Marc Boucher : routing by fwmark * Robert Olsson : Added rt_cache statistics * Arnaldo C. Melo : Convert proc stuff to seq_file + * Eric Dumazet : hashed spinlocks * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -201,8 +202,37 @@ __u8 ip_tos2prio[16] = { struct rt_hash_bucket { struct rtable *chain; - spinlock_t lock; -} __attribute__((__aligned__(8))); +}; +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +/* + * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks + * The size of this table is a power of two and depends on the number of CPUS. 
+ */ +#if NR_CPUS >= 32 +#define RT_HASH_LOCK_SZ 4096 +#elif NR_CPUS >= 16 +#define RT_HASH_LOCK_SZ 2048 +#elif NR_CPUS >= 8 +#define RT_HASH_LOCK_SZ 1024 +#elif NR_CPUS >= 4 +#define RT_HASH_LOCK_SZ 512 +#else +#define RT_HASH_LOCK_SZ 256 +#endif + +static spinlock_t *rt_hash_locks; +# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)] +# define rt_hash_lock_init() { \ + int i; \ + rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \ + if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \ + for (i = 0; i < RT_HASH_LOCK_SZ; i++) \ + spin_lock_init(&rt_hash_locks[i]); \ + } +#else +# define rt_hash_lock_addr(slot) NULL +# define rt_hash_lock_init() +#endif static struct rt_hash_bucket *rt_hash_table; static unsigned rt_hash_mask; @@ -587,7 +617,7 @@ static void rt_check_expire(unsigned long dummy) i = (i + 1) & rt_hash_mask; rthp = &rt_hash_table[i].chain; - spin_lock(&rt_hash_table[i].lock); + spin_lock(rt_hash_lock_addr(i)); while ((rth = *rthp) != NULL) { if (rth->u.dst.expires) { /* Entry is expired even if it is in use */ @@ -620,7 +650,7 @@ static void rt_check_expire(unsigned long dummy) rt_free(rth); #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ } - spin_unlock(&rt_hash_table[i].lock); + spin_unlock(rt_hash_lock_addr(i)); /* Fallback loop breaker. */ if (time_after(jiffies, now)) @@ -643,11 +673,11 @@ static void rt_run_flush(unsigned long dummy) get_random_bytes(&rt_hash_rnd, 4); for (i = rt_hash_mask; i >= 0; i--) { - spin_lock_bh(&rt_hash_table[i].lock); + spin_lock_bh(rt_hash_lock_addr(i)); rth = rt_hash_table[i].chain; if (rth) rt_hash_table[i].chain = NULL; - spin_unlock_bh(&rt_hash_table[i].lock); + spin_unlock_bh(rt_hash_lock_addr(i)); for (; rth; rth = next) { next = rth->u.rt_next; @@ -780,7 +810,7 @@ static int rt_garbage_collect(void) k = (k + 1) & rt_hash_mask; rthp = &rt_hash_table[k].chain; - spin_lock_bh(&rt_hash_table[k].lock); + spin_lock_bh(rt_hash_lock_addr(k)); while ((rth = *rthp) != NULL) { if (!rt_may_expire(rth, tmo, expire)) { tmo >>= 1; @@ -812,7 +842,7 @@ static int rt_garbage_collect(void) goal--; #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ } - spin_unlock_bh(&rt_hash_table[k].lock); + spin_unlock_bh(rt_hash_lock_addr(k)); if (goal <= 0) break; } @@ -882,7 +912,7 @@ restart: rthp = &rt_hash_table[hash].chain; - spin_lock_bh(&rt_hash_table[hash].lock); + spin_lock_bh(rt_hash_lock_addr(hash)); while ((rth = *rthp) != NULL) { #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED if (!(rth->u.dst.flags & DST_BALANCED) && @@ -908,7 +938,7 @@ restart: rth->u.dst.__use++; dst_hold(&rth->u.dst); rth->u.dst.lastuse = now; - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); rt_drop(rt); *rp = rth; @@ -949,7 +979,7 @@ restart: if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { int err = arp_bind_neighbour(&rt->u.dst); if (err) { - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); if (err != -ENOBUFS) { rt_drop(rt); @@ -990,7 +1020,7 @@ restart: } #endif rt_hash_table[hash].chain = rt; - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); *rp = rt; return 0; } @@ -1058,7 +1088,7 @@ static void rt_del(unsigned hash, struct rtable *rt) { struct rtable **rthp; - spin_lock_bh(&rt_hash_table[hash].lock); + spin_lock_bh(rt_hash_lock_addr(hash)); ip_rt_put(rt); for (rthp = &rt_hash_table[hash].chain; *rthp; rthp = &(*rthp)->u.rt_next) @@ -1067,7 +1097,7 @@ static void rt_del(unsigned hash, struct rtable *rt) 
rt_free(rt); break; } - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); } void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, @@ -3073,7 +3103,7 @@ __setup("rhash_entries=", set_rhash_entries); int __init ip_rt_init(void) { - int i, order, goal, rc = 0; + int order, goal, rc = 0; rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ (jiffies ^ (jiffies >> 7))); @@ -3122,10 +3152,8 @@ int __init ip_rt_init(void) /* NOTHING */; rt_hash_mask--; - for (i = 0; i <= rt_hash_mask; i++) { - spin_lock_init(&rt_hash_table[i].lock); - rt_hash_table[i].chain = NULL; - } + memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); + rt_hash_lock_init(); ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); ip_rt_max_size = (rt_hash_mask + 1) * 16; -- cgit v0.10.2 From 424c4b70cc4ff3930ee36a2ef7b204e4d704fd26 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Jul 2005 14:58:19 -0700 Subject: [IPV4]: Use the fancy alloc_large_system_hash() function for route hash table - rt hash table allocated using alloc_large_system_hash() function. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller diff --git a/net/ipv4/route.c b/net/ipv4/route.c index daf82f8..9fcbb1b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include #include @@ -3103,12 +3104,14 @@ __setup("rhash_entries=", set_rhash_entries); int __init ip_rt_init(void) { - int order, goal, rc = 0; + int rc = 0; rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ (jiffies ^ (jiffies >> 7))); #ifdef CONFIG_NET_CLS_ROUTE + { + int order; for (order = 0; (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) /* NOTHING */; @@ -3116,6 +3119,7 @@ int __init ip_rt_init(void) if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); memset(ip_rt_acct, 0, PAGE_SIZE << order); + } #endif ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", @@ -3126,32 +3130,17 @@ int __init ip_rt_init(void) if (!ipv4_dst_ops.kmem_cachep) panic("IP: failed to allocate ip_dst_cache\n"); - goal = num_physpages >> (26 - PAGE_SHIFT); - if (rhash_entries) - goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT; - for (order = 0; (1UL << order) < goal; order++) - /* NOTHING */; - - do { - rt_hash_mask = (1UL << order) * PAGE_SIZE / - sizeof(struct rt_hash_bucket); - while (rt_hash_mask & (rt_hash_mask - 1)) - rt_hash_mask--; - rt_hash_table = (struct rt_hash_bucket *) - __get_free_pages(GFP_ATOMIC, order); - } while (rt_hash_table == NULL && --order > 0); - - if (!rt_hash_table) - panic("Failed to allocate IP route cache hash table\n"); - - printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n", - rt_hash_mask, - (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024); - - for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++) - /* NOTHING */; - - rt_hash_mask--; + rt_hash_table = (struct rt_hash_bucket *) + alloc_large_system_hash("IP route cache", + sizeof(struct rt_hash_bucket), + rhash_entries, + (num_physpages >= 128 * 1024) ? 
+ (27 - PAGE_SHIFT) : + (29 - PAGE_SHIFT), + HASH_HIGHMEM, + &rt_hash_log, + &rt_hash_mask, + 0); memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); rt_hash_lock_init(); -- cgit v0.10.2 From bb1d23b02657f494dff295f6cdd1f29df30fa61e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Jul 2005 15:00:32 -0700 Subject: [IPV4]: Bug fix in rt_check_expire() - rt_check_expire() fixes (an overflow occured if size of the hash was >= 65536) reminder of the bugfix: The rt_check_expire() has a serious problem on machines with large route caches, and a standard HZ value of 1000. With default values, ie ip_rt_gc_interval = 60*HZ = 60000 ; the loop count : for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; overflows (t is a 31 bit value) as soon rt_hash_log is >= 16 (65536 slots in route cache hash table). In this case, rt_check_expire() does nothing at all Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 9fcbb1b..726ea5e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -54,7 +54,7 @@ * Marc Boucher : routing by fwmark * Robert Olsson : Added rt_cache statistics * Arnaldo C. Melo : Convert proc stuff to seq_file - * Eric Dumazet : hashed spinlocks + * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -606,18 +606,25 @@ static struct rtable **rt_remove_balanced_route(struct rtable **chain_head, /* This runs via a timer and thus is always in BH context. */ static void rt_check_expire(unsigned long dummy) { - static int rover; - int i = rover, t; + static unsigned int rover; + unsigned int i = rover, goal; struct rtable *rth, **rthp; unsigned long now = jiffies; - - for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; - t -= ip_rt_gc_timeout) { + u64 mult; + + mult = ((u64)ip_rt_gc_interval) << rt_hash_log; + if (ip_rt_gc_timeout > 1) + do_div(mult, ip_rt_gc_timeout); + goal = (unsigned int)mult; + if (goal > rt_hash_mask) goal = rt_hash_mask + 1; + for (; goal > 0; goal--) { unsigned long tmo = ip_rt_gc_timeout; i = (i + 1) & rt_hash_mask; rthp = &rt_hash_table[i].chain; + if (*rthp == 0) + continue; spin_lock(rt_hash_lock_addr(i)); while ((rth = *rthp) != NULL) { if (rth->u.dst.expires) { @@ -658,7 +665,7 @@ static void rt_check_expire(unsigned long dummy) break; } rover = i; - mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); + mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval); } /* This can run from both BH and non-BH contexts, the latter -- cgit v0.10.2 From db1322b8012e1a8ad711c04813817328cff46718 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 5 Jul 2005 15:01:25 -0700 Subject: [DECNET]: Fix memset overflow on 64bit archs while dumping decnet routing rules Signed-off-by: Thomas Graf Signed-off-by: David S. 
Miller diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index 9934b25..99bc061 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -551,7 +551,8 @@ int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb) if (t < s_t) continue; if (t > s_t) - memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int)); + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); tb = dn_fib_get_table(t, 0); if (tb == NULL) continue; -- cgit v0.10.2 From 2f36895aa774cf4d1c3d68921e0209e796b66600 Mon Sep 17 00:00:00 2001 From: Robert Olsson Date: Tue, 5 Jul 2005 15:02:40 -0700 Subject: [IPV4]: More broken memory allocation fixes for fib_trie Below a patch to preallocate memory when doing resize of trie (inflate halve) If preallocations fails it just skips the resize of this tnode for this time. The oops we got when killing bgpd (with full routing) is now gone. Patrick memory patch is also used. Signed-off-by: Robert Olsson Signed-off-by: David S. Miller diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 9038b91..4be234c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -43,7 +43,7 @@ * 2 of the License, or (at your option) any later version. */ -#define VERSION "0.324" +#define VERSION "0.325" #include #include @@ -136,6 +136,7 @@ struct trie_use_stats { unsigned int semantic_match_passed; unsigned int semantic_match_miss; unsigned int null_node_hit; + unsigned int resize_node_skipped; }; #endif @@ -164,8 +165,8 @@ static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); static int tnode_child_length(struct tnode *tn); static struct node *resize(struct trie *t, struct tnode *tn); -static struct tnode *inflate(struct trie *t, struct tnode *tn); -static struct tnode *halve(struct trie *t, struct tnode *tn); +static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err); +static struct tnode *halve(struct trie *t, struct tnode *tn, int *err); static void tnode_free(struct tnode *tn); static void trie_dump_seq(struct seq_file *seq, struct trie *t); extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); @@ -481,6 +482,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w static struct node *resize(struct trie *t, struct tnode *tn) { int i; + int err = 0; if (!tn) return NULL; @@ -577,12 +579,20 @@ static struct node *resize(struct trie *t, struct tnode *tn) */ check_tnode(tn); - + + err = 0; while ((tn->full_children > 0 && 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= inflate_threshold * tnode_child_length(tn))) { - tn = inflate(t, tn); + tn = inflate(t, tn, &err); + + if(err) { +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.resize_node_skipped++; +#endif + break; + } } check_tnode(tn); @@ -591,11 +601,22 @@ static struct node *resize(struct trie *t, struct tnode *tn) * Halve as long as the number of empty children in this * node is above threshold. 
*/ + + err = 0; while (tn->bits > 1 && 100 * (tnode_child_length(tn) - tn->empty_children) < - halve_threshold * tnode_child_length(tn)) + halve_threshold * tnode_child_length(tn)) { + + tn = halve(t, tn, &err); + + if(err) { +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.resize_node_skipped++; +#endif + break; + } + } - tn = halve(t, tn); /* Only one child remains */ @@ -620,7 +641,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) return (struct node *) tn; } -static struct tnode *inflate(struct trie *t, struct tnode *tn) +static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) { struct tnode *inode; struct tnode *oldtnode = tn; @@ -632,8 +653,63 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); - if (!tn) - trie_bug("tnode_new failed"); + if (!tn) { + *err = -ENOMEM; + return oldtnode; + } + + /* + * Preallocate and store tnodes before the actual work so we + * don't get into an inconsistent state if memory allocation + * fails. In case of failure we return the oldnode and inflate + * of tnode is ignored. + */ + + for(i = 0; i < olen; i++) { + struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i); + + if (inode && + IS_TNODE(inode) && + inode->pos == oldtnode->pos + oldtnode->bits && + inode->bits > 1) { + struct tnode *left, *right; + + t_key m = TKEY_GET_MASK(inode->pos, 1); + + left = tnode_new(inode->key&(~m), inode->pos + 1, + inode->bits - 1); + + if(!left) { + *err = -ENOMEM; + break; + } + + right = tnode_new(inode->key|m, inode->pos + 1, + inode->bits - 1); + + if(!right) { + *err = -ENOMEM; + break; + } + + put_child(t, tn, 2*i, (struct node *) left); + put_child(t, tn, 2*i+1, (struct node *) right); + } + } + + if(*err) { + int size = tnode_child_length(tn); + int j; + + for(j = 0; j < size; j++) + if( tn->child[j]) + tnode_free((struct tnode *)tn->child[j]); + + tnode_free(tn); + + *err = -ENOMEM; + return oldtnode; + } for(i = 0; i < olen; i++) { struct node *node = tnode_get_child(oldtnode, i); @@ -646,7 +722,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) if(IS_LEAF(node) || ((struct tnode *) node)->pos > tn->pos + tn->bits - 1) { - if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1, + if(tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits, 1) == 0) put_child(t, tn, 2*i, node); else @@ -686,27 +762,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) * the position (inode->pos) */ - t_key m = TKEY_GET_MASK(inode->pos, 1); - /* Use the old key, but set the new significant * bit to zero. */ - left = tnode_new(inode->key&(~m), inode->pos + 1, - inode->bits - 1); - if(!left) - trie_bug("tnode_new failed"); - - - /* Use the old key, but set the new significant - * bit to one. 
- */ - right = tnode_new(inode->key|m, inode->pos + 1, - inode->bits - 1); + left = (struct tnode *) tnode_get_child(tn, 2*i); + put_child(t, tn, 2*i, NULL); + + if(!left) + BUG(); + + right = (struct tnode *) tnode_get_child(tn, 2*i+1); + put_child(t, tn, 2*i+1, NULL); + + if(!right) + BUG(); - if(!right) - trie_bug("tnode_new failed"); - size = tnode_child_length(left); for(j = 0; j < size; j++) { put_child(t, left, j, inode->child[j]); @@ -722,7 +793,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) return tn; } -static struct tnode *halve(struct trie *t, struct tnode *tn) +static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) { struct tnode *oldtnode = tn; struct node *left, *right; @@ -733,8 +804,48 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); - if(!tn) - trie_bug("tnode_new failed"); + if (!tn) { + *err = -ENOMEM; + return oldtnode; + } + + /* + * Preallocate and store tnodes before the actual work so we + * don't get into an inconsistent state if memory allocation + * fails. In case of failure we return the oldnode and halve + * of tnode is ignored. + */ + + for(i = 0; i < olen; i += 2) { + left = tnode_get_child(oldtnode, i); + right = tnode_get_child(oldtnode, i+1); + + /* Two nonempty children */ + if( left && right) { + struct tnode *newBinNode = + tnode_new(left->key, tn->pos + tn->bits, 1); + + if(!newBinNode) { + *err = -ENOMEM; + break; + } + put_child(t, tn, i/2, (struct node *)newBinNode); + } + } + + if(*err) { + int size = tnode_child_length(tn); + int j; + + for(j = 0; j < size; j++) + if( tn->child[j]) + tnode_free((struct tnode *)tn->child[j]); + + tnode_free(tn); + + *err = -ENOMEM; + return oldtnode; + } for(i = 0; i < olen; i += 2) { left = tnode_get_child(oldtnode, i); @@ -751,10 +862,11 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) /* Two nonempty children */ else { struct tnode *newBinNode = - tnode_new(left->key, tn->pos + tn->bits, 1); + (struct tnode *) tnode_get_child(tn, i/2); + put_child(t, tn, i/2, NULL); if(!newBinNode) - trie_bug("tnode_new failed"); + BUG(); put_child(t, newBinNode, 0, left); put_child(t, newBinNode, 1, right); @@ -2322,6 +2434,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq) seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed); seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss); seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit); + seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped); #ifdef CLEAR_STATS memset(&(t->stats), 0, sizeof(t->stats)); #endif -- cgit v0.10.2 From bc971dee6ece1fd0d431948924becd9c50e7b778 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 Jul 2005 15:03:46 -0700 Subject: [SHAPER]: Switch to spinlocks. Dave, you were right and the sleeping locks in shaper were broken. Markus Kanet noticed this and also tested the patch below that switches locking to spinlocks. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller diff --git a/drivers/net/shaper.c b/drivers/net/shaper.c index 20edeb3..3ad0b67 100644 --- a/drivers/net/shaper.c +++ b/drivers/net/shaper.c @@ -135,10 +135,8 @@ static int shaper_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct shaper *shaper = dev->priv; struct sk_buff *ptr; - - if (down_trylock(&shaper->sem)) - return -1; - + + spin_lock(&shaper->lock); ptr=shaper->sendq.prev; /* @@ -232,7 +230,7 @@ static int shaper_start_xmit(struct sk_buff *skb, struct net_device *dev) shaper->stats.collisions++; } shaper_kick(shaper); - up(&shaper->sem); + spin_unlock(&shaper->lock); return 0; } @@ -271,11 +269,9 @@ static void shaper_timer(unsigned long data) { struct shaper *shaper = (struct shaper *)data; - if (!down_trylock(&shaper->sem)) { - shaper_kick(shaper); - up(&shaper->sem); - } else - mod_timer(&shaper->timer, jiffies); + spin_lock(&shaper->lock); + shaper_kick(shaper); + spin_unlock(&shaper->lock); } /* @@ -332,21 +328,6 @@ static void shaper_kick(struct shaper *shaper) /* - * Flush the shaper queues on a closedown - */ - -static void shaper_flush(struct shaper *shaper) -{ - struct sk_buff *skb; - - down(&shaper->sem); - while((skb=skb_dequeue(&shaper->sendq))!=NULL) - dev_kfree_skb(skb); - shaper_kick(shaper); - up(&shaper->sem); -} - -/* * Bring the interface up. We just disallow this until a * bind. */ @@ -375,7 +356,15 @@ static int shaper_open(struct net_device *dev) static int shaper_close(struct net_device *dev) { struct shaper *shaper=dev->priv; - shaper_flush(shaper); + struct sk_buff *skb; + + while ((skb = skb_dequeue(&shaper->sendq)) != NULL) + dev_kfree_skb(skb); + + spin_lock_bh(&shaper->lock); + shaper_kick(shaper); + spin_unlock_bh(&shaper->lock); + del_timer_sync(&shaper->timer); return 0; } @@ -576,6 +565,7 @@ static void shaper_init_priv(struct net_device *dev) init_timer(&sh->timer); sh->timer.function=shaper_timer; sh->timer.data=(unsigned long)sh; + spin_lock_init(&sh->lock); } /* diff --git a/include/linux/if_shaper.h b/include/linux/if_shaper.h index 004e6f0..68c896a 100644 --- a/include/linux/if_shaper.h +++ b/include/linux/if_shaper.h @@ -23,7 +23,7 @@ struct shaper __u32 shapeclock; unsigned long recovery; /* Time we can next clock a packet out on an empty queue */ - struct semaphore sem; + spinlock_t lock; struct net_device_stats stats; struct net_device *dev; int (*hard_start_xmit) (struct sk_buff *skb, -- cgit v0.10.2 From 52609c0b56d7c8dfb6e16ec0a715adf8fcbdae36 Mon Sep 17 00:00:00 2001 From: David Chau Date: Tue, 5 Jul 2005 15:11:06 -0700 Subject: [NET]: improve readability of dev_set_promiscuity() in net/core/dev.c A trivial patch to improve the readability of dev_set_promiscuity() in net/core/dev.c. New code does exactly the same thing as original code. Signed-off-by: David Chau Signed-off-by: Domen Puncer Signed-off-by: David S. Miller diff --git a/net/core/dev.c b/net/core/dev.c index 7016e0c..7f5f62c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2089,10 +2089,11 @@ void dev_set_promiscuity(struct net_device *dev, int inc) { unsigned short old_flags = dev->flags; - dev->flags |= IFF_PROMISC; if ((dev->promiscuity += inc) == 0) dev->flags &= ~IFF_PROMISC; - if (dev->flags ^ old_flags) { + else + dev->flags |= IFF_PROMISC; + if (dev->flags != old_flags) { dev_mc_upload(dev); printk(KERN_INFO "device %s %s promiscuous mode\n", dev->name, (dev->flags & IFF_PROMISC) ? 
"entered" : -- cgit v0.10.2 From b8259d9ad1d0f8d0c5ea0e37bb15080b0bd395b5 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 5 Jul 2005 15:12:04 -0700 Subject: [NET]: Remove __ARGS from include/net/slhc_vj.h I suspect "#define __ARGS(x) ()" was deprecated before I was born. Signed-off-by: Alexey Dobriyan Signed-off-by: Domen Puncer Signed-off-by: David S. Miller diff --git a/include/net/slhc_vj.h b/include/net/slhc_vj.h index 0b2c278..8716d59 100644 --- a/include/net/slhc_vj.h +++ b/include/net/slhc_vj.h @@ -170,19 +170,14 @@ struct slcompress { }; #define NULLSLCOMPR (struct slcompress *)0 -#define __ARGS(x) x - /* In slhc.c: */ -struct slcompress *slhc_init __ARGS((int rslots, int tslots)); -void slhc_free __ARGS((struct slcompress *comp)); - -int slhc_compress __ARGS((struct slcompress *comp, unsigned char *icp, - int isize, unsigned char *ocp, unsigned char **cpp, - int compress_cid)); -int slhc_uncompress __ARGS((struct slcompress *comp, unsigned char *icp, - int isize)); -int slhc_remember __ARGS((struct slcompress *comp, unsigned char *icp, - int isize)); -int slhc_toss __ARGS((struct slcompress *comp)); +struct slcompress *slhc_init(int rslots, int tslots); +void slhc_free(struct slcompress *comp); + +int slhc_compress(struct slcompress *comp, unsigned char *icp, int isize, + unsigned char *ocp, unsigned char **cpp, int compress_cid); +int slhc_uncompress(struct slcompress *comp, unsigned char *icp, int isize); +int slhc_remember(struct slcompress *comp, unsigned char *icp, int isize); +int slhc_toss(struct slcompress *comp); #endif /* _SLHC_H */ -- cgit v0.10.2 From c65f7f00c587828e3d50737805a78f74804972de Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:17:25 -0700 Subject: [TCP]: Simplify SKB data portion allocation with NETIF_F_SG. The ideal and most optimal layout for an SKB when doing scatter-gather is to put all the headers at skb->data, and all the user data in the page array. This makes SKB splitting and combining extremely simple, especially before a packet goes onto the wire the first time. So, when sk_stream_alloc_pskb() is given a zero size, make sure there is no skb_tailroom(). This is achieved by applying SKB_DATA_ALIGN() to the header length used here. Next, make select_size() in TCP output segmentation use a length of zero when NETIF_F_SG is true on the outgoing interface. Signed-off-by: David S. 
Miller diff --git a/include/net/sock.h b/include/net/sock.h index e593af5..7b76f89 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1134,13 +1134,16 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk) static inline struct sk_buff *sk_stream_alloc_pskb(struct sock *sk, int size, int mem, int gfp) { - struct sk_buff *skb = alloc_skb(size + sk->sk_prot->max_header, gfp); + struct sk_buff *skb; + int hdr_len; + hdr_len = SKB_DATA_ALIGN(sk->sk_prot->max_header); + skb = alloc_skb(size + hdr_len, gfp); if (skb) { skb->truesize += mem; if (sk->sk_forward_alloc >= (int)skb->truesize || sk_stream_mem_schedule(sk, skb->truesize, 0)) { - skb_reserve(skb, sk->sk_prot->max_header); + skb_reserve(skb, hdr_len); return skb; } __kfree_skb(skb); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 882436d..be35415 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -756,13 +756,9 @@ static inline int select_size(struct sock *sk, struct tcp_sock *tp) { int tmp = tp->mss_cache_std; - if (sk->sk_route_caps & NETIF_F_SG) { - int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); + if (sk->sk_route_caps & NETIF_F_SG) + tmp = 0; - if (tmp >= pgbreak && - tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) - tmp = pgbreak; - } return tmp; } @@ -872,11 +868,6 @@ new_segment: tcp_mark_push(tp, skb); goto new_segment; } else if (page) { - /* If page is cached, align - * offset to L1 cache boundary - */ - off = (off + L1_CACHE_BYTES - 1) & - ~(L1_CACHE_BYTES - 1); if (off == PAGE_SIZE) { put_page(page); TCP_PAGE(sk) = page = NULL; -- cgit v0.10.2 From fc6415bcb0f58f03adb910e56d7e1df6368794e0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:17:45 -0700 Subject: [TCP]: Fix quick-ack decrementing with TSO. On each packet output, we call tcp_dec_quickack_mode() if the ACK flag is set. It drops tp->ack.quick until it hits zero, at which time we deflate the ATO value. When doing TSO, we are emitting multiple packets with ACK set, so we should decrement tp->ack.quick that many segments. Note that, unlike this case, tcp_enter_cwr() should not take the tcp_skb_pcount(skb) into consideration. That function, one time, readjusts tp->snd_cwnd and moves into TCP_CA_CWR state. Signed-off-by: David S. Miller diff --git a/include/net/tcp.h b/include/net/tcp.h index ec9e20c..afe41c5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -721,11 +721,16 @@ static inline int tcp_ack_scheduled(struct tcp_sock *tp) return tp->ack.pending&TCP_ACK_SCHED; } -static __inline__ void tcp_dec_quickack_mode(struct tcp_sock *tp) +static __inline__ void tcp_dec_quickack_mode(struct tcp_sock *tp, unsigned int pkts) { - if (tp->ack.quick && --tp->ack.quick == 0) { - /* Leaving quickack mode we deflate ATO. */ - tp->ack.ato = TCP_ATO_MIN; + if (tp->ack.quick) { + if (pkts >= tp->ack.quick) { + tp->ack.quick = 0; + + /* Leaving quickack mode we deflate ATO. 
*/ + tp->ack.ato = TCP_ATO_MIN; + } else + tp->ack.quick -= pkts; } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0e17c24..389deeb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -140,11 +140,11 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp, tp->ack.pingpong = 1; } -static __inline__ void tcp_event_ack_sent(struct sock *sk) +static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) { struct tcp_sock *tp = tcp_sk(sk); - tcp_dec_quickack_mode(tp); + tcp_dec_quickack_mode(tp, pkts); tcp_clear_xmit_timer(sk, TCP_TIME_DACK); } @@ -355,7 +355,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) tp->af_specific->send_check(sk, th, skb->len, skb); if (tcb->flags & TCPCB_FLAG_ACK) - tcp_event_ack_sent(sk); + tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); if (skb->len != tcp_header_size) tcp_event_data_sent(tp, skb, sk); -- cgit v0.10.2 From f6302d1d78f77c2d4c8bd32b0afc2df7fdf5f281 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:18:03 -0700 Subject: [TCP]: Move send test logic out of net/tcp.h This just moves the code into tcp_output.c, no code logic changes are made by this patch. Using this as a baseline, we can begin to untangle the mess of comparisons for the Nagle test et al. We will also be able to reduce all of the redundant computation that occurs when outputting data packets. Signed-off-by: David S. Miller diff --git a/include/net/tcp.h b/include/net/tcp.h index afe41c5..f2b1045 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -849,6 +849,9 @@ extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, /* tcp_output.c */ extern int tcp_write_xmit(struct sock *, int nonagle); +extern void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, + unsigned cur_mss, int nonagle); +extern int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp); extern int tcp_retransmit_skb(struct sock *, struct sk_buff *); extern void tcp_xmit_retransmit_queue(struct sock *); extern void tcp_simple_retransmit(struct sock *); @@ -1284,12 +1287,6 @@ static __inline__ __u32 tcp_max_burst(const struct tcp_sock *tp) return 3; } -static __inline__ int tcp_minshall_check(const struct tcp_sock *tp) -{ - return after(tp->snd_sml,tp->snd_una) && - !after(tp->snd_sml, tp->snd_nxt); -} - static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss, const struct sk_buff *skb) { @@ -1297,122 +1294,18 @@ static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss, tp->snd_sml = TCP_SKB_CB(skb)->end_seq; } -/* Return 0, if packet can be sent now without violation Nagle's rules: - 1. It is full sized. - 2. Or it contains FIN. - 3. Or TCP_NODELAY was set. - 4. Or TCP_CORK is not set, and all sent packets are ACKed. - With Minshall's modification: all sent small packets are ACKed. - */ - -static __inline__ int -tcp_nagle_check(const struct tcp_sock *tp, const struct sk_buff *skb, - unsigned mss_now, int nonagle) -{ - return (skb->len < mss_now && - !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && - ((nonagle&TCP_NAGLE_CORK) || - (!nonagle && - tp->packets_out && - tcp_minshall_check(tp)))); -} - -extern void tcp_set_skb_tso_segs(struct sock *, struct sk_buff *); - -/* This checks if the data bearing packet SKB (usually sk->sk_send_head) - * should be put on the wire right now. 
- */ -static __inline__ int tcp_snd_test(struct sock *sk, - struct sk_buff *skb, - unsigned cur_mss, int nonagle) -{ - struct tcp_sock *tp = tcp_sk(sk); - int pkts = tcp_skb_pcount(skb); - - if (!pkts) { - tcp_set_skb_tso_segs(sk, skb); - pkts = tcp_skb_pcount(skb); - } - - /* RFC 1122 - section 4.2.3.4 - * - * We must queue if - * - * a) The right edge of this frame exceeds the window - * b) There are packets in flight and we have a small segment - * [SWS avoidance and Nagle algorithm] - * (part of SWS is done on packetization) - * Minshall version sounds: there are no _small_ - * segments in flight. (tcp_nagle_check) - * c) We have too many packets 'in flight' - * - * Don't use the nagle rule for urgent data (or - * for the final FIN -DaveM). - * - * Also, Nagle rule does not apply to frames, which - * sit in the middle of queue (they have no chances - * to get new data) and if room at tail of skb is - * not enough to save something seriously (<32 for now). - */ - - /* Don't be strict about the congestion window for the - * final FIN frame. -DaveM - */ - return (((nonagle&TCP_NAGLE_PUSH) || tp->urg_mode - || !tcp_nagle_check(tp, skb, cur_mss, nonagle)) && - (((tcp_packets_in_flight(tp) + (pkts-1)) < tp->snd_cwnd) || - (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) && - !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd)); -} - static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp) { if (!tp->packets_out && !tp->pending) tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto); } -static __inline__ int tcp_skb_is_last(const struct sock *sk, - const struct sk_buff *skb) -{ - return skb->next == (struct sk_buff *)&sk->sk_write_queue; -} - -/* Push out any pending frames which were held back due to - * TCP_CORK or attempt at coalescing tiny packets. - * The socket must be locked by the caller. - */ -static __inline__ void __tcp_push_pending_frames(struct sock *sk, - struct tcp_sock *tp, - unsigned cur_mss, - int nonagle) -{ - struct sk_buff *skb = sk->sk_send_head; - - if (skb) { - if (!tcp_skb_is_last(sk, skb)) - nonagle = TCP_NAGLE_PUSH; - if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || - tcp_write_xmit(sk, nonagle)) - tcp_check_probe_timer(sk, tp); - } - tcp_cwnd_validate(sk, tp); -} - static __inline__ void tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp) { __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle); } -static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) -{ - struct sk_buff *skb = sk->sk_send_head; - - return (skb && - tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), - tcp_skb_is_last(sk, skb) ? TCP_NAGLE_PUSH : tp->nonagle)); -} - static __inline__ void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq) { tp->snd_wl1 = seq; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 389deeb..2cbe879 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -413,6 +413,135 @@ static inline void tcp_tso_set_push(struct sk_buff *skb) TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; } +static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (skb->len <= tp->mss_cache_std || + !(sk->sk_route_caps & NETIF_F_TSO)) { + /* Avoid the costly divide in the normal + * non-TSO case. 
+ */ + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; + } else { + unsigned int factor; + + factor = skb->len + (tp->mss_cache_std - 1); + factor /= tp->mss_cache_std; + skb_shinfo(skb)->tso_segs = factor; + skb_shinfo(skb)->tso_size = tp->mss_cache_std; + } +} + +static inline int tcp_minshall_check(const struct tcp_sock *tp) +{ + return after(tp->snd_sml,tp->snd_una) && + !after(tp->snd_sml, tp->snd_nxt); +} + +/* Return 0, if packet can be sent now without violation Nagle's rules: + * 1. It is full sized. + * 2. Or it contains FIN. + * 3. Or TCP_NODELAY was set. + * 4. Or TCP_CORK is not set, and all sent packets are ACKed. + * With Minshall's modification: all sent small packets are ACKed. + */ + +static inline int tcp_nagle_check(const struct tcp_sock *tp, + const struct sk_buff *skb, + unsigned mss_now, int nonagle) +{ + return (skb->len < mss_now && + !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && + ((nonagle&TCP_NAGLE_CORK) || + (!nonagle && + tp->packets_out && + tcp_minshall_check(tp)))); +} + +/* This checks if the data bearing packet SKB (usually sk->sk_send_head) + * should be put on the wire right now. + */ +static int tcp_snd_test(struct sock *sk, struct sk_buff *skb, + unsigned cur_mss, int nonagle) +{ + struct tcp_sock *tp = tcp_sk(sk); + int pkts = tcp_skb_pcount(skb); + + if (!pkts) { + tcp_set_skb_tso_segs(sk, skb); + pkts = tcp_skb_pcount(skb); + } + + /* RFC 1122 - section 4.2.3.4 + * + * We must queue if + * + * a) The right edge of this frame exceeds the window + * b) There are packets in flight and we have a small segment + * [SWS avoidance and Nagle algorithm] + * (part of SWS is done on packetization) + * Minshall version sounds: there are no _small_ + * segments in flight. (tcp_nagle_check) + * c) We have too many packets 'in flight' + * + * Don't use the nagle rule for urgent data (or + * for the final FIN -DaveM). + * + * Also, Nagle rule does not apply to frames, which + * sit in the middle of queue (they have no chances + * to get new data) and if room at tail of skb is + * not enough to save something seriously (<32 for now). + */ + + /* Don't be strict about the congestion window for the + * final FIN frame. -DaveM + */ + return (((nonagle&TCP_NAGLE_PUSH) || tp->urg_mode + || !tcp_nagle_check(tp, skb, cur_mss, nonagle)) && + (((tcp_packets_in_flight(tp) + (pkts-1)) < tp->snd_cwnd) || + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) && + !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd)); +} + +static inline int tcp_skb_is_last(const struct sock *sk, + const struct sk_buff *skb) +{ + return skb->next == (struct sk_buff *)&sk->sk_write_queue; +} + +/* Push out any pending frames which were held back due to + * TCP_CORK or attempt at coalescing tiny packets. + * The socket must be locked by the caller. + */ +void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, + unsigned cur_mss, int nonagle) +{ + struct sk_buff *skb = sk->sk_send_head; + + if (skb) { + if (!tcp_skb_is_last(sk, skb)) + nonagle = TCP_NAGLE_PUSH; + if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || + tcp_write_xmit(sk, nonagle)) + tcp_check_probe_timer(sk, tp); + } + tcp_cwnd_validate(sk, tp); +} + +int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) +{ + struct sk_buff *skb = sk->sk_send_head; + + return (skb && + tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), + (tcp_skb_is_last(sk, skb) ? + TCP_NAGLE_PUSH : + tp->nonagle))); +} + + /* Send _single_ skb sitting at the send head. This function requires * true push pending frames to setup probe timer etc. 
*/ @@ -434,27 +563,6 @@ void tcp_push_one(struct sock *sk, unsigned cur_mss) } } -void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (skb->len <= tp->mss_cache_std || - !(sk->sk_route_caps & NETIF_F_TSO)) { - /* Avoid the costly divide in the normal - * non-TSO case. - */ - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; - } else { - unsigned int factor; - - factor = skb->len + (tp->mss_cache_std - 1); - factor /= tp->mss_cache_std; - skb_shinfo(skb)->tso_segs = factor; - skb_shinfo(skb)->tso_size = tp->mss_cache_std; - } -} - /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. -- cgit v0.10.2 From 84d3e7b9573291a1ea845bdd51b74bb484597661 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:18:18 -0700 Subject: [TCP]: Move __tcp_data_snd_check into tcp_output.c It reimplements portions of tcp_snd_check(), so it we move it to tcp_output.c we can consolidate it's logic much easier in a later change. Signed-off-by: David S. Miller diff --git a/include/net/tcp.h b/include/net/tcp.h index f2b1045..4888f9d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -849,6 +849,7 @@ extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, /* tcp_output.c */ extern int tcp_write_xmit(struct sock *, int nonagle); +extern void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb); extern void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, unsigned cur_mss, int nonagle); extern int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7bbbbc3..5774243 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3346,16 +3346,6 @@ static inline void tcp_check_space(struct sock *sk) } } -static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || - tcp_packets_in_flight(tp) >= tp->snd_cwnd || - tcp_write_xmit(sk, tp->nonagle)) - tcp_check_probe_timer(sk, tp); -} - static __inline__ void tcp_data_snd_check(struct sock *sk) { struct sk_buff *skb = sk->sk_send_head; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2cbe879..362b811 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -530,6 +530,16 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, tcp_cwnd_validate(sk, tp); } +void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || + tcp_packets_in_flight(tp) >= tp->snd_cwnd || + tcp_write_xmit(sk, tp->nonagle)) + tcp_check_probe_timer(sk, tp); +} + int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) { struct sk_buff *skb = sk->sk_send_head; -- cgit v0.10.2 From f44b527177d57ed382bfd93e1b55232465f6d058 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:18:34 -0700 Subject: [TCP]: Add missing skb_header_release() call to tcp_fragment(). When we add any new packet to the TCP socket write queue, we must call skb_header_release() on it in order for the TSO sharing checks in the drivers to work. Signed-off-by: David S. 
Miller diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 362b811..5e63ed0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -655,6 +655,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) } /* Link BUFF into the send queue. */ + skb_header_release(buff); __skb_append(skb, buff); return 0; -- cgit v0.10.2 From a762a9800752f05fa8768bb0ac35d0e7f1bcfe7f Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:18:51 -0700 Subject: [TCP]: Kill extra cwnd validate in __tcp_push_pending_frames(). The tcp_cwnd_validate() function should only be invoked if we actually send some frames, yet __tcp_push_pending_frames() will always invoke it. tcp_write_xmit() does the call for us, so the call here can simply be removed. Also, tcp_write_xmit() can be marked static. Signed-off-by: David S. Miller diff --git a/include/net/tcp.h b/include/net/tcp.h index 4888f9d..f32e7ae 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -848,7 +848,6 @@ extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, /* tcp_output.c */ -extern int tcp_write_xmit(struct sock *, int nonagle); extern void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb); extern void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, unsigned cur_mss, int nonagle); @@ -868,6 +867,9 @@ extern void tcp_push_one(struct sock *, unsigned mss_now); extern void tcp_send_ack(struct sock *sk); extern void tcp_send_delayed_ack(struct sock *sk); +/* tcp_input.c */ +extern void tcp_cwnd_application_limited(struct sock *sk); + /* tcp_timer.c */ extern void tcp_init_xmit_timers(struct sock *); extern void tcp_clear_xmit_timers(struct sock *); @@ -1234,28 +1236,6 @@ static inline void tcp_sync_left_out(struct tcp_sock *tp) tp->left_out = tp->sacked_out + tp->lost_out; } -extern void tcp_cwnd_application_limited(struct sock *sk); - -/* Congestion window validation. (RFC2861) */ - -static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) -{ - __u32 packets_out = tp->packets_out; - - if (packets_out >= tp->snd_cwnd) { - /* Network is feed fully. */ - tp->snd_cwnd_used = 0; - tp->snd_cwnd_stamp = tcp_time_stamp; - } else { - /* Network starves. */ - if (tp->packets_out > tp->snd_cwnd_used) - tp->snd_cwnd_used = tp->packets_out; - - if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto) - tcp_cwnd_application_limited(sk); - } -} - /* Set slow start threshould and cwnd not falling to slow start */ static inline void __tcp_enter_cwr(struct tcp_sock *tp) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5e63ed0..a6375ca 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -511,35 +511,6 @@ static inline int tcp_skb_is_last(const struct sock *sk, return skb->next == (struct sk_buff *)&sk->sk_write_queue; } -/* Push out any pending frames which were held back due to - * TCP_CORK or attempt at coalescing tiny packets. - * The socket must be locked by the caller. 
- */ -void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, - unsigned cur_mss, int nonagle) -{ - struct sk_buff *skb = sk->sk_send_head; - - if (skb) { - if (!tcp_skb_is_last(sk, skb)) - nonagle = TCP_NAGLE_PUSH; - if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || - tcp_write_xmit(sk, nonagle)) - tcp_check_probe_timer(sk, tp); - } - tcp_cwnd_validate(sk, tp); -} - -void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || - tcp_packets_in_flight(tp) >= tp->snd_cwnd || - tcp_write_xmit(sk, tp->nonagle)) - tcp_check_probe_timer(sk, tp); -} - int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) { struct sk_buff *skb = sk->sk_send_head; @@ -841,6 +812,26 @@ unsigned int tcp_current_mss(struct sock *sk, int large) return mss_now; } +/* Congestion window validation. (RFC2861) */ + +static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) +{ + __u32 packets_out = tp->packets_out; + + if (packets_out >= tp->snd_cwnd) { + /* Network is feed fully. */ + tp->snd_cwnd_used = 0; + tp->snd_cwnd_stamp = tcp_time_stamp; + } else { + /* Network starves. */ + if (tp->packets_out > tp->snd_cwnd_used) + tp->snd_cwnd_used = tp->packets_out; + + if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto) + tcp_cwnd_application_limited(sk); + } +} + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -848,7 +839,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large) * Returns 1, if no segments are in flight and we have queued segments, but * cannot send anything now because of SWS or another problem. */ -int tcp_write_xmit(struct sock *sk, int nonagle) +static int tcp_write_xmit(struct sock *sk, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); unsigned int mss_now; @@ -901,6 +892,34 @@ int tcp_write_xmit(struct sock *sk, int nonagle) return 0; } +/* Push out any pending frames which were held back due to + * TCP_CORK or attempt at coalescing tiny packets. + * The socket must be locked by the caller. + */ +void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, + unsigned cur_mss, int nonagle) +{ + struct sk_buff *skb = sk->sk_send_head; + + if (skb) { + if (!tcp_skb_is_last(sk, skb)) + nonagle = TCP_NAGLE_PUSH; + if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || + tcp_write_xmit(sk, nonagle)) + tcp_check_probe_timer(sk, tp); + } +} + +void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || + tcp_packets_in_flight(tp) >= tp->snd_cwnd || + tcp_write_xmit(sk, tp->nonagle)) + tcp_check_probe_timer(sk, tp); +} + /* This function returns the amount that we can raise the * usable window based on the following constraints * -- cgit v0.10.2 From 92df7b518dcb113de8bc2494e3cd275ad887f12b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:19:06 -0700 Subject: [TCP]: tcp_write_xmit() tabbing cleanup Put the main basic block of work at the top-level of tabbing, and mark the TCP_CLOSE test with unlikely(). Signed-off-by: David S. 
Miller diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a6375ca..2a8409c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -842,54 +842,54 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) static int tcp_write_xmit(struct sock *sk, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; unsigned int mss_now; + int sent_pkts; /* If we are closed, the bytes will have to remain here. * In time closedown will finish, we empty the write queue and all * will be happy. */ - if (sk->sk_state != TCP_CLOSE) { - struct sk_buff *skb; - int sent_pkts = 0; + if (unlikely(sk->sk_state == TCP_CLOSE)) + return 0; - /* Account for SACKS, we may need to fragment due to this. - * It is just like the real MSS changing on us midstream. - * We also handle things correctly when the user adds some - * IP options mid-stream. Silly to do, but cover it. - */ - mss_now = tcp_current_mss(sk, 1); - - while ((skb = sk->sk_send_head) && - tcp_snd_test(sk, skb, mss_now, - tcp_skb_is_last(sk, skb) ? nonagle : - TCP_NAGLE_PUSH)) { - if (skb->len > mss_now) { - if (tcp_fragment(sk, skb, mss_now)) - break; - } - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); - if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) + /* Account for SACKS, we may need to fragment due to this. + * It is just like the real MSS changing on us midstream. + * We also handle things correctly when the user adds some + * IP options mid-stream. Silly to do, but cover it. + */ + mss_now = tcp_current_mss(sk, 1); + sent_pkts = 0; + while ((skb = sk->sk_send_head) && + tcp_snd_test(sk, skb, mss_now, + tcp_skb_is_last(sk, skb) ? nonagle : + TCP_NAGLE_PUSH)) { + if (skb->len > mss_now) { + if (tcp_fragment(sk, skb, mss_now)) break; + } - /* Advance the send_head. This one is sent out. - * This call will increment packets_out. - */ - update_send_head(sk, tp, skb); + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_tso_set_push(skb); + if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) + break; - tcp_minshall_update(tp, mss_now, skb); - sent_pkts = 1; - } + /* Advance the send_head. This one is sent out. + * This call will increment packets_out. + */ + update_send_head(sk, tp, skb); - if (sent_pkts) { - tcp_cwnd_validate(sk, tp); - return 0; - } + tcp_minshall_update(tp, mss_now, skb); + sent_pkts = 1; + } - return !tp->packets_out && sk->sk_send_head; + if (sent_pkts) { + tcp_cwnd_validate(sk, tp); + return 0; } - return 0; + + return !tp->packets_out && sk->sk_send_head; } /* Push out any pending frames which were held back due to -- cgit v0.10.2 From a2e2a59c93cc8ba39caa9011c2573f429e40ccd9 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:19:23 -0700 Subject: [TCP]: Fix redundant calculations of tcp_current_mss() tcp_write_xmit() uses tcp_current_mss(), but some of it's callers, namely __tcp_push_pending_frames(), already has this value available already. While we're here, fix the "cur_mss" argument to be "unsigned int" instead of plain "unsigned". Signed-off-by: David S. 
Miller diff --git a/include/net/tcp.h b/include/net/tcp.h index f32e7ae..9416236 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -850,7 +850,7 @@ extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, extern void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb); extern void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, - unsigned cur_mss, int nonagle); + unsigned int cur_mss, int nonagle); extern int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp); extern int tcp_retransmit_skb(struct sock *, struct sk_buff *); extern void tcp_xmit_retransmit_queue(struct sock *); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2a8409c..e292e11 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -839,11 +839,10 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) * Returns 1, if no segments are in flight and we have queued segments, but * cannot send anything now because of SWS or another problem. */ -static int tcp_write_xmit(struct sock *sk, int nonagle) +static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int mss_now; int sent_pkts; /* If we are closed, the bytes will have to remain here. @@ -853,13 +852,6 @@ static int tcp_write_xmit(struct sock *sk, int nonagle) if (unlikely(sk->sk_state == TCP_CLOSE)) return 0; - - /* Account for SACKS, we may need to fragment due to this. - * It is just like the real MSS changing on us midstream. - * We also handle things correctly when the user adds some - * IP options mid-stream. Silly to do, but cover it. - */ - mss_now = tcp_current_mss(sk, 1); sent_pkts = 0; while ((skb = sk->sk_send_head) && tcp_snd_test(sk, skb, mss_now, @@ -897,7 +889,7 @@ static int tcp_write_xmit(struct sock *sk, int nonagle) * The socket must be locked by the caller. */ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, - unsigned cur_mss, int nonagle) + unsigned int cur_mss, int nonagle) { struct sk_buff *skb = sk->sk_send_head; @@ -905,7 +897,7 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, if (!tcp_skb_is_last(sk, skb)) nonagle = TCP_NAGLE_PUSH; if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || - tcp_write_xmit(sk, nonagle)) + tcp_write_xmit(sk, cur_mss, nonagle)) tcp_check_probe_timer(sk, tp); } } @@ -916,7 +908,7 @@ void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || tcp_packets_in_flight(tp) >= tp->snd_cwnd || - tcp_write_xmit(sk, tp->nonagle)) + tcp_write_xmit(sk, tcp_current_mss(sk, 1), tp->nonagle)) tcp_check_probe_timer(sk, tp); } -- cgit v0.10.2 From 55c97f3e990c1ff63957c64f6cb10711a09fd70e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:19:38 -0700 Subject: [TCP]: Fix __tcp_push_pending_frames() 'nonagle' handling. 'nonagle' should be passed to the tcp_snd_test() function as 'TCP_NAGLE_PUSH' if we are checking an SKB not at the tail of the write_queue. This is because Nagle does not apply to such frames since we cannot possibly tack more data onto them. However, while doing this __tcp_push_pending_frames() makes all of the packets in the write_queue use this modified 'nonagle' value. Fix the bug and simplify this function by just calling tcp_write_xmit() directly if sk_send_head is non-NULL. 
As a result, we can now make tcp_data_snd_check() just call tcp_push_pending_frames() instead of the specialized __tcp_data_snd_check(). Signed-off-by: David S. Miller diff --git a/include/net/tcp.h b/include/net/tcp.h index 9416236..b192380 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -848,7 +848,6 @@ extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, /* tcp_output.c */ -extern void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb); extern void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, unsigned int cur_mss, int nonagle); extern int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5774243..b27be2f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3346,12 +3346,9 @@ static inline void tcp_check_space(struct sock *sk) } } -static __inline__ void tcp_data_snd_check(struct sock *sk) +static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) { - struct sk_buff *skb = sk->sk_send_head; - - if (skb != NULL) - __tcp_data_snd_check(sk, skb); + tcp_push_pending_frames(sk, tp); tcp_check_space(sk); } @@ -3645,7 +3642,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ tcp_ack(sk, skb, 0); __kfree_skb(skb); - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); return 0; } else { /* Header too small */ TCP_INC_STATS_BH(TCP_MIB_INERRS); @@ -3711,7 +3708,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { /* Well, only one small jumplet in fast path... */ tcp_ack(sk, skb, FLAG_DATA); - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); if (!tcp_ack_scheduled(tp)) goto no_ack; } @@ -3789,7 +3786,7 @@ step5: /* step 7: process the segment text */ tcp_data_queue(sk, skb); - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); tcp_ack_snd_check(sk); return 0; @@ -4099,7 +4096,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Do step6 onward by hand. */ tcp_urg(sk, skb, th); __kfree_skb(skb); - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); return 0; } @@ -4290,7 +4287,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* tcp_data could move socket to TIME-WAIT */ if (sk->sk_state != TCP_CLOSE) { - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); tcp_ack_snd_check(sk); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e292e11..ce1d7cf 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -894,24 +894,11 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb = sk->sk_send_head; if (skb) { - if (!tcp_skb_is_last(sk, skb)) - nonagle = TCP_NAGLE_PUSH; - if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || - tcp_write_xmit(sk, cur_mss, nonagle)) + if (tcp_write_xmit(sk, cur_mss, nonagle)) tcp_check_probe_timer(sk, tp); } } -void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || - tcp_packets_in_flight(tp) >= tp->snd_cwnd || - tcp_write_xmit(sk, tcp_current_mss(sk, 1), tp->nonagle)) - tcp_check_probe_timer(sk, tp); -} - /* This function returns the amount that we can raise the * usable window based on the following constraints * -- cgit v0.10.2 From 7f4dd0a9438c73cbb1c240ece31390cf2c57294e Mon Sep 17 00:00:00 2001 From: "David S. 
Miller" Date: Tue, 5 Jul 2005 15:19:54 -0700 Subject: [TCP]: Break out tcp_snd_test() into it's constituent parts. tcp_snd_test() does several different things, use inline functions to express this more clearly. 1) It initializes the TSO count of SKB, if necessary. 2) It performs the Nagle test. 3) It makes sure the congestion window is adhered to. 4) It makes sure SKB fits into the send window. This cleanup also sets things up so that things like the available packets in the congestion window does not need to be calculated multiple times by packet sending loops such as tcp_write_xmit(). Signed-off-by: David S. Miller diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ce1d7cf..8327e5e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -434,6 +434,33 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) } } +/* Does SKB fit into the send window? */ +static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss) +{ + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + return !after(end_seq, tp->snd_una + tp->snd_wnd); +} + +/* Can at least one segment of SKB be sent right now, according to the + * congestion window rules? If so, return how many segments are allowed. + */ +static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb) +{ + u32 in_flight, cwnd; + + /* Don't be strict about the congestion window for the final FIN. */ + if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) + return 1; + + in_flight = tcp_packets_in_flight(tp); + cwnd = tp->snd_cwnd; + if (in_flight < cwnd) + return (cwnd - in_flight); + + return 0; +} + static inline int tcp_minshall_check(const struct tcp_sock *tp) { return after(tp->snd_sml,tp->snd_una) && @@ -442,7 +469,7 @@ static inline int tcp_minshall_check(const struct tcp_sock *tp) /* Return 0, if packet can be sent now without violation Nagle's rules: * 1. It is full sized. - * 2. Or it contains FIN. + * 2. Or it contains FIN. (already checked by caller) * 3. Or TCP_NODELAY was set. * 4. Or TCP_CORK is not set, and all sent packets are ACKed. * With Minshall's modification: all sent small packets are ACKed. @@ -453,56 +480,73 @@ static inline int tcp_nagle_check(const struct tcp_sock *tp, unsigned mss_now, int nonagle) { return (skb->len < mss_now && - !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && ((nonagle&TCP_NAGLE_CORK) || (!nonagle && tp->packets_out && tcp_minshall_check(tp)))); } -/* This checks if the data bearing packet SKB (usually sk->sk_send_head) - * should be put on the wire right now. +/* Return non-zero if the Nagle test allows this packet to be + * sent now. */ -static int tcp_snd_test(struct sock *sk, struct sk_buff *skb, - unsigned cur_mss, int nonagle) +static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, + unsigned int cur_mss, int nonagle) { - struct tcp_sock *tp = tcp_sk(sk); - int pkts = tcp_skb_pcount(skb); + /* Nagle rule does not apply to frames, which sit in the middle of the + * write_queue (they have no chances to get new data). + * + * This is implemented in the callers, where they modify the 'nonagle' + * argument based upon the location of SKB in the send queue. + */ + if (nonagle & TCP_NAGLE_PUSH) + return 1; + + /* Don't use the nagle rule for urgent data (or for the final FIN). 
*/ + if (tp->urg_mode || + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) + return 1; + + if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) + return 1; - if (!pkts) { + return 0; +} + +/* This must be invoked the first time we consider transmitting + * SKB onto the wire. + */ +static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb) +{ + int tso_segs = tcp_skb_pcount(skb); + + if (!tso_segs) { tcp_set_skb_tso_segs(sk, skb); - pkts = tcp_skb_pcount(skb); + tso_segs = tcp_skb_pcount(skb); } + return tso_segs; +} - /* RFC 1122 - section 4.2.3.4 - * - * We must queue if - * - * a) The right edge of this frame exceeds the window - * b) There are packets in flight and we have a small segment - * [SWS avoidance and Nagle algorithm] - * (part of SWS is done on packetization) - * Minshall version sounds: there are no _small_ - * segments in flight. (tcp_nagle_check) - * c) We have too many packets 'in flight' - * - * Don't use the nagle rule for urgent data (or - * for the final FIN -DaveM). - * - * Also, Nagle rule does not apply to frames, which - * sit in the middle of queue (they have no chances - * to get new data) and if room at tail of skb is - * not enough to save something seriously (<32 for now). - */ +/* This checks if the data bearing packet SKB (usually sk->sk_send_head) + * should be put on the wire right now. If so, it returns the number of + * packets allowed by the congestion window. + */ +static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, + unsigned int cur_mss, int nonagle) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int cwnd_quota; - /* Don't be strict about the congestion window for the - * final FIN frame. -DaveM - */ - return (((nonagle&TCP_NAGLE_PUSH) || tp->urg_mode - || !tcp_nagle_check(tp, skb, cur_mss, nonagle)) && - (((tcp_packets_in_flight(tp) + (pkts-1)) < tp->snd_cwnd) || - (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) && - !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd)); + tcp_init_tso_segs(sk, skb); + + if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) + return 0; + + cwnd_quota = tcp_cwnd_test(tp, skb); + if (cwnd_quota && + !tcp_snd_wnd_test(tp, skb, cur_mss)) + cwnd_quota = 0; + + return cwnd_quota; } static inline int tcp_skb_is_last(const struct sock *sk, -- cgit v0.10.2 From aa93466bdfd901b926e033801f0b82b3eaa67be2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:20:09 -0700 Subject: [TCP]: Eliminate redundant computations in tcp_write_xmit(). tcp_snd_test() is run for every packet output by a single call to tcp_write_xmit(), but this is not necessary. For one, the congestion window space needs to only be calculated one time, then used throughout the duration of the loop. This cleanup also makes experimenting with different TSO packetization schemes much easier. Signed-off-by: David S. Miller diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8327e5e..0a4cd24 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -887,6 +887,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; + unsigned int tso_segs, cwnd_quota; int sent_pkts; /* If we are closed, the bytes will have to remain here. 
@@ -896,19 +897,31 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) if (unlikely(sk->sk_state == TCP_CLOSE)) return 0; + skb = sk->sk_send_head; + if (unlikely(!skb)) + return 0; + + tso_segs = tcp_init_tso_segs(sk, skb); + cwnd_quota = tcp_cwnd_test(tp, skb); sent_pkts = 0; - while ((skb = sk->sk_send_head) && - tcp_snd_test(sk, skb, mss_now, - tcp_skb_is_last(sk, skb) ? nonagle : - TCP_NAGLE_PUSH)) { - if (skb->len > mss_now) { - if (tcp_fragment(sk, skb, mss_now)) + + while (cwnd_quota >= tso_segs) { + if (unlikely(!tcp_nagle_test(tp, skb, mss_now, + (tcp_skb_is_last(sk, skb) ? + nonagle : TCP_NAGLE_PUSH)))) + break; + + if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) + break; + + if (unlikely(skb->len > mss_now)) { + if (unlikely(tcp_fragment(sk, skb, mss_now))) break; } TCP_SKB_CB(skb)->when = tcp_time_stamp; tcp_tso_set_push(skb); - if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) + if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))) break; /* Advance the send_head. This one is sent out. @@ -917,10 +930,19 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) update_send_head(sk, tp, skb); tcp_minshall_update(tp, mss_now, skb); - sent_pkts = 1; + sent_pkts++; + + /* Do not optimize this to use tso_segs. If we chopped up + * the packet above, tso_segs will no longer be valid. + */ + cwnd_quota -= tcp_skb_pcount(skb); + skb = sk->sk_send_head; + if (!skb) + break; + tso_segs = tcp_init_tso_segs(sk, skb); } - if (sent_pkts) { + if (likely(sent_pkts)) { tcp_cwnd_validate(sk, tp); return 0; } -- cgit v0.10.2 From b4e26f5ea0dbdd1e813c5571fb467022d8eb948a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:20:27 -0700 Subject: [TCP]: Fix send-side cpu utiliziation regression. Only put user data purely to pages when doing TSO. The extra page allocations cause two problems: 1) Add the overhead of the page allocations themselves. 2) Make us do small user copies when we get to the end of the TCP socket cache page. It is still beneficial to purely use pages for TSO, so we will do it for that case. Signed-off-by: David S. Miller diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index be35415..2ba73bf 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -756,8 +756,17 @@ static inline int select_size(struct sock *sk, struct tcp_sock *tp) { int tmp = tp->mss_cache_std; - if (sk->sk_route_caps & NETIF_F_SG) - tmp = 0; + if (sk->sk_route_caps & NETIF_F_SG) { + if (sk->sk_route_caps & NETIF_F_TSO) + tmp = 0; + else { + int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); + + if (tmp >= pgbreak && + tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) + tmp = pgbreak; + } + } return tmp; } -- cgit v0.10.2 From a56476962e92a6c389a1a561274d4a27607b7b5f Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:20:41 -0700 Subject: [TCP]: Kill bogus comment above tcp_tso_acked(). Everything stated there is out of data. tcp_trim_skb() does adjust the available socket send buffer space and skb->truesize now. Signed-off-by: David S. Miller diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b27be2f..1dba7fd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1957,15 +1957,6 @@ static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) } } -/* There is one downside to this scheme. Although we keep the - * ACK clock ticking, adjusting packet counters and advancing - * congestion window, we do not liberate socket send buffer - * space. 
- * - * Mucking with skb->truesize and sk->sk_wmem_alloc et al. - * then making a write space wakeup callback is a possible - * future enhancement. WARNING: it is not trivial to make. - */ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, __u32 now, __s32 *seq_rtt) { -- cgit v0.10.2 From cb83199a29dc0408423d6df432f28cc67fcadaf4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:20:55 -0700 Subject: [TCP]: Do not call tcp_tso_acked() if no work to do. In tcp_clean_rtx_queue(), if the TSO packet is not even partially acked, do not waste time calling tcp_tso_acked(). Signed-off-by: David S. Miller diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1dba7fd..b948e4e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2038,7 +2038,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt * the other end. */ if (after(scb->end_seq, tp->snd_una)) { - if (tcp_skb_pcount(skb) > 1) + if (tcp_skb_pcount(skb) > 1 && + after(tp->snd_una, scb->seq)) acked |= tcp_tso_acked(sk, skb, now, &seq_rtt); break; -- cgit v0.10.2 From 0d9901df62fe4820aee86b49f1a074cdb5c6928e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:21:10 -0700 Subject: [TCP]: Break out send buffer expansion test. This makes it easier to understand, and allows easier tweaking of the heuristic later on. Signed-off-by: David S. Miller diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b948e4e..2ef2f35 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3300,6 +3300,28 @@ void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } +static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) +{ + /* If the user specified a specific send buffer setting, do + * not modify it. + */ + if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) + return 0; + + /* If we are under global TCP memory pressure, do not expand. */ + if (tcp_memory_pressure) + return 0; + + /* If we are under soft global TCP memory pressure, do not expand. */ + if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) + return 0; + + /* If we filled the congestion window, do not expand. */ + if (tp->packets_out >= tp->snd_cwnd) + return 0; + + return 1; +} /* When incoming ACK allowed to free some skb from write_queue, * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket @@ -3311,10 +3333,7 @@ static void tcp_new_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (tp->packets_out < tp->snd_cwnd && - !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && - !tcp_memory_pressure && - atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + if (tcp_should_expand_sndbuf(sk, tp)) { int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), demanded = max_t(unsigned int, tp->snd_cwnd, -- cgit v0.10.2 From c1b4a7e69576d65efc31a8cea0714173c2841244 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:24:38 -0700 Subject: [TCP]: Move to new TSO segmenting scheme. Make TSO segment transmit size decisions at send time not earlier. The basic scheme is that we try to build as large a TSO frame as possible when pulling in the user data, but the size of the TSO frame output to the card is determined at transmit time. This is guided by tp->xmit_size_goal. It is always set to a multiple of MSS and tells sendmsg/sendpage how large an SKB to try and build. 
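For a concrete picture of how that goal is computed, a minimal stand-alone sketch follows (the header lengths and peer window in the example are assumed, typical IPv4-with-timestamps values, not figures from the patch): the goal starts near 64 KB, is clamped to half the largest window the peer has ever advertised, and is rounded down to a whole number of MSS-sized segments, mirroring the tcp_current_mss() change later in this patch.

/* Illustrative model only; plain parameters stand in for the kernel's
 * socket and dst structures. */
#include <stdio.h>

static unsigned int xmit_size_goal(unsigned int mss_now,
                                   unsigned int max_window,
                                   unsigned int net_hdr, unsigned int ext_hdr,
                                   unsigned int tcp_hdr, int doing_tso)
{
        unsigned int goal = mss_now;

        if (doing_tso) {
                goal = 65535 - net_hdr - ext_hdr - tcp_hdr;

                /* Never aim beyond half of the largest advertised window. */
                if (max_window && goal > (max_window >> 1)) {
                        unsigned int floor = 68U - tcp_hdr;

                        goal = (max_window >> 1) > floor ? (max_window >> 1)
                                                         : floor;
                }

                /* Keep the goal a whole multiple of the current MSS. */
                goal -= goal % mss_now;
        }
        return goal;
}

int main(void)
{
        /* Assumed example: MSS 1448, peer window 65535, IPv4 header (20),
         * no extension headers, TCP header with timestamps (32). */
        unsigned int goal = xmit_size_goal(1448, 65535, 20, 0, 32, 1);

        printf("xmit_size_goal = %u bytes (%u segments)\n", goal, goal / 1448);
        return 0;
}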
Later, tcp_write_xmit() and tcp_push_one() chop up the packet if necessary and conditions warrant. These routines can also decide to "defer" in order to wait for more ACKs to arrive and thus allow larger TSO frames to be emitted. A general observation is that TSO elongates the pipe, thus requiring a larger congestion window and larger buffering especially at the sender side. Therefore, it is important that applications 1) get a large enough socket send buffer (this is accomplished by our dynamic send buffer expansion code) 2) do large enough writes. Signed-off-by: David S. Miller diff --git a/include/linux/tcp.h b/include/linux/tcp.h index dfd93d0..e4fd82e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -286,7 +286,7 @@ struct tcp_sock { __u32 max_window; /* Maximal window ever seen from peer */ __u32 pmtu_cookie; /* Last pmtu seen by socket */ __u32 mss_cache; /* Cached effective mss, not including SACKS */ - __u16 mss_cache_std; /* Like mss_cache, but without TSO */ + __u16 xmit_size_goal; /* Goal for segmenting output packets */ __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ __u8 ca_state; /* State of fast-retransmit machine */ __u8 retransmits; /* Number of unrecovered RTO timeouts. */ diff --git a/include/net/tcp.h b/include/net/tcp.h index b192380..a166918 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -862,7 +862,7 @@ extern int tcp_write_wakeup(struct sock *); extern void tcp_send_fin(struct sock *sk); extern void tcp_send_active_reset(struct sock *sk, int priority); extern int tcp_send_synack(struct sock *); -extern void tcp_push_one(struct sock *, unsigned mss_now); +extern void tcp_push_one(struct sock *, unsigned int mss_now); extern void tcp_send_ack(struct sock *sk); extern void tcp_send_delayed_ack(struct sock *sk); @@ -968,7 +968,7 @@ static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long static inline void tcp_initialize_rcv_mss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - unsigned int hint = min(tp->advmss, tp->mss_cache_std); + unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); hint = min(hint, tp->rcv_wnd/2); hint = min(hint, TCP_MIN_RCVMSS); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2ba73bf..29894c7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -615,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse size_t psize, int flags) { struct tcp_sock *tp = tcp_sk(sk); - int mss_now; + int mss_now, size_goal; int err; ssize_t copied; long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); @@ -628,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; copied = 0; err = -EPIPE; @@ -641,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse int offset = poffset % PAGE_SIZE; int size = min_t(size_t, psize, PAGE_SIZE - offset); - if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) { + if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) { new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; @@ -652,7 +653,7 @@ new_segment: goto wait_for_memory; skb_entail(sk, tp, skb); - copy = mss_now; + copy = size_goal; } if (copy > size) @@ -693,7 +694,7 @@ new_segment: if (!(psize -= copy)) goto out; - if (skb->len != mss_now || (flags & MSG_OOB)) + if (skb->len < mss_now || (flags & MSG_OOB)) continue; 
if (forced_push(tp)) { @@ -713,6 +714,7 @@ wait_for_memory: goto do_error; mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; } out: @@ -754,7 +756,7 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, static inline int select_size(struct sock *sk, struct tcp_sock *tp) { - int tmp = tp->mss_cache_std; + int tmp = tp->mss_cache; if (sk->sk_route_caps & NETIF_F_SG) { if (sk->sk_route_caps & NETIF_F_TSO) @@ -778,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int iovlen, flags; - int mss_now; + int mss_now, size_goal; int err, copied; long timeo; @@ -797,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; /* Ok commence sending. */ iovlen = msg->msg_iovlen; @@ -819,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, skb = sk->sk_write_queue.prev; if (!sk->sk_send_head || - (copy = mss_now - skb->len) <= 0) { + (copy = size_goal - skb->len) <= 0) { new_segment: /* Allocate new segment. If the interface is SG, @@ -842,7 +845,7 @@ new_segment: skb->ip_summed = CHECKSUM_HW; skb_entail(sk, tp, skb); - copy = mss_now; + copy = size_goal; } /* Try to append data to the end of skb. */ @@ -937,7 +940,7 @@ new_segment: if ((seglen -= copy) == 0 && iovlen == 0) goto out; - if (skb->len != mss_now || (flags & MSG_OOB)) + if (skb->len < mss_now || (flags & MSG_OOB)) continue; if (forced_push(tp)) { @@ -957,6 +960,7 @@ wait_for_memory: goto do_error; mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; } } @@ -2128,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_rto = jiffies_to_usecs(tp->rto); info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); - info->tcpi_snd_mss = tp->mss_cache_std; + info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = tp->ack.rcv_mss; info->tcpi_unacked = tp->packets_out; @@ -2178,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, switch (optname) { case TCP_MAXSEG: - val = tp->mss_cache_std; + val = tp->mss_cache; if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) val = tp->rx_opt.user_mss; break; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2ef2f35..8de2f10 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -740,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); if (!cwnd) { - if (tp->mss_cache_std > 1460) + if (tp->mss_cache > 1460) cwnd = 2; else - cwnd = (tp->mss_cache_std > 1095) ? 3 : 4; + cwnd = (tp->mss_cache > 1095) ? 
3 : 4; } return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } @@ -914,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (sk->sk_route_caps & NETIF_F_TSO) { sk->sk_route_caps &= ~NETIF_F_TSO; sock_set_flag(sk, SOCK_NO_LARGESEND); - tp->mss_cache = tp->mss_cache_std; + tp->mss_cache = tp->mss_cache; } if (!tp->sacked_out) @@ -1077,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ (IsFack(tp) || !before(lost_retrans, TCP_SKB_CB(skb)->ack_seq + tp->reordering * - tp->mss_cache_std))) { + tp->mss_cache))) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); @@ -3334,7 +3334,7 @@ static void tcp_new_space(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); if (tcp_should_expand_sndbuf(sk, tp)) { - int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) + + int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), demanded = max_t(unsigned int, tp->snd_cwnd, tp->reordering + 1); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ebf1123..62f62bb 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2045,7 +2045,7 @@ static int tcp_v4_init_sock(struct sock *sk) */ tp->snd_ssthresh = 0x7fffffff; /* Infinity */ tp->snd_cwnd_clamp = ~0; - tp->mss_cache_std = tp->mss_cache = 536; + tp->mss_cache = 536; tp->reordering = sysctl_tcp_reordering; tp->ca_ops = &tcp_init_congestion_ops; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0a4cd24..fd3ce38 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1; * will allow a single TSO frame to consume. Building TSO frames * which are too large can cause TCP streams to be bursty. */ -int sysctl_tcp_tso_win_divisor = 8; +int sysctl_tcp_tso_win_divisor = 3; static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) @@ -403,21 +403,11 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) sk->sk_send_head = skb; } -static inline void tcp_tso_set_push(struct sk_buff *skb) -{ - /* Force push to be on for any TSO frames to workaround - * problems with busted implementations like Mac OS-X that - * hold off socket receive wakeups until push is seen. - */ - if (tcp_skb_pcount(skb) > 1) - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; -} - static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - if (skb->len <= tp->mss_cache_std || + if (skb->len <= tp->mss_cache || !(sk->sk_route_caps & NETIF_F_TSO)) { /* Avoid the costly divide in the normal * non-TSO case. @@ -427,164 +417,10 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) } else { unsigned int factor; - factor = skb->len + (tp->mss_cache_std - 1); - factor /= tp->mss_cache_std; + factor = skb->len + (tp->mss_cache - 1); + factor /= tp->mss_cache; skb_shinfo(skb)->tso_segs = factor; - skb_shinfo(skb)->tso_size = tp->mss_cache_std; - } -} - -/* Does SKB fit into the send window? */ -static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss) -{ - u32 end_seq = TCP_SKB_CB(skb)->end_seq; - - return !after(end_seq, tp->snd_una + tp->snd_wnd); -} - -/* Can at least one segment of SKB be sent right now, according to the - * congestion window rules? If so, return how many segments are allowed. 
- */ -static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb) -{ - u32 in_flight, cwnd; - - /* Don't be strict about the congestion window for the final FIN. */ - if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) - return 1; - - in_flight = tcp_packets_in_flight(tp); - cwnd = tp->snd_cwnd; - if (in_flight < cwnd) - return (cwnd - in_flight); - - return 0; -} - -static inline int tcp_minshall_check(const struct tcp_sock *tp) -{ - return after(tp->snd_sml,tp->snd_una) && - !after(tp->snd_sml, tp->snd_nxt); -} - -/* Return 0, if packet can be sent now without violation Nagle's rules: - * 1. It is full sized. - * 2. Or it contains FIN. (already checked by caller) - * 3. Or TCP_NODELAY was set. - * 4. Or TCP_CORK is not set, and all sent packets are ACKed. - * With Minshall's modification: all sent small packets are ACKed. - */ - -static inline int tcp_nagle_check(const struct tcp_sock *tp, - const struct sk_buff *skb, - unsigned mss_now, int nonagle) -{ - return (skb->len < mss_now && - ((nonagle&TCP_NAGLE_CORK) || - (!nonagle && - tp->packets_out && - tcp_minshall_check(tp)))); -} - -/* Return non-zero if the Nagle test allows this packet to be - * sent now. - */ -static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, - unsigned int cur_mss, int nonagle) -{ - /* Nagle rule does not apply to frames, which sit in the middle of the - * write_queue (they have no chances to get new data). - * - * This is implemented in the callers, where they modify the 'nonagle' - * argument based upon the location of SKB in the send queue. - */ - if (nonagle & TCP_NAGLE_PUSH) - return 1; - - /* Don't use the nagle rule for urgent data (or for the final FIN). */ - if (tp->urg_mode || - (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) - return 1; - - if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) - return 1; - - return 0; -} - -/* This must be invoked the first time we consider transmitting - * SKB onto the wire. - */ -static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb) -{ - int tso_segs = tcp_skb_pcount(skb); - - if (!tso_segs) { - tcp_set_skb_tso_segs(sk, skb); - tso_segs = tcp_skb_pcount(skb); - } - return tso_segs; -} - -/* This checks if the data bearing packet SKB (usually sk->sk_send_head) - * should be put on the wire right now. If so, it returns the number of - * packets allowed by the congestion window. - */ -static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, - unsigned int cur_mss, int nonagle) -{ - struct tcp_sock *tp = tcp_sk(sk); - unsigned int cwnd_quota; - - tcp_init_tso_segs(sk, skb); - - if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) - return 0; - - cwnd_quota = tcp_cwnd_test(tp, skb); - if (cwnd_quota && - !tcp_snd_wnd_test(tp, skb, cur_mss)) - cwnd_quota = 0; - - return cwnd_quota; -} - -static inline int tcp_skb_is_last(const struct sock *sk, - const struct sk_buff *skb) -{ - return skb->next == (struct sk_buff *)&sk->sk_write_queue; -} - -int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) -{ - struct sk_buff *skb = sk->sk_send_head; - - return (skb && - tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), - (tcp_skb_is_last(sk, skb) ? - TCP_NAGLE_PUSH : - tp->nonagle))); -} - - -/* Send _single_ skb sitting at the send head. This function requires - * true push pending frames to setup probe timer etc. 
- */ -void tcp_push_one(struct sock *sk, unsigned cur_mss) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = sk->sk_send_head; - - if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) { - /* Send it out now. */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); - if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) { - sk->sk_send_head = NULL; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tcp_packets_out_inc(sk, tp, skb); - return; - } + skb_shinfo(skb)->tso_size = tp->mss_cache; } } @@ -791,7 +627,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) /* And store cached results */ tp->pmtu_cookie = pmtu; - tp->mss_cache = tp->mss_cache_std = mss_now; + tp->mss_cache = mss_now; return mss_now; } @@ -803,56 +639,47 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) * cannot be large. However, taking into account rare use of URG, this * is not a big flaw. */ - -unsigned int tcp_current_mss(struct sock *sk, int large) +unsigned int tcp_current_mss(struct sock *sk, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); - unsigned int do_large, mss_now; + u32 mss_now; + u16 xmit_size_goal; + int doing_tso = 0; + + mss_now = tp->mss_cache; + + if (large_allowed && + (sk->sk_route_caps & NETIF_F_TSO) && + !tp->urg_mode) + doing_tso = 1; - mss_now = tp->mss_cache_std; if (dst) { u32 mtu = dst_mtu(dst); if (mtu != tp->pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); } - do_large = (large && - (sk->sk_route_caps & NETIF_F_TSO) && - !tp->urg_mode); + if (tp->rx_opt.eff_sacks) + mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + + (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); - if (do_large) { - unsigned int large_mss, factor, limit; + xmit_size_goal = mss_now; - large_mss = 65535 - tp->af_specific->net_header_len - + if (doing_tso) { + xmit_size_goal = 65535 - + tp->af_specific->net_header_len - tp->ext_header_len - tp->tcp_header_len; - if (tp->max_window && large_mss > (tp->max_window>>1)) - large_mss = max((tp->max_window>>1), - 68U - tp->tcp_header_len); - - factor = large_mss / mss_now; + if (tp->max_window && + (xmit_size_goal > (tp->max_window >> 1))) + xmit_size_goal = max((tp->max_window >> 1), + 68U - tp->tcp_header_len); - /* Always keep large mss multiple of real mss, but - * do not exceed 1/tso_win_divisor of the congestion window - * so we can keep the ACK clock ticking and minimize - * bursting. - */ - limit = tp->snd_cwnd; - if (sysctl_tcp_tso_win_divisor) - limit /= sysctl_tcp_tso_win_divisor; - limit = max(1U, limit); - if (factor > limit) - factor = limit; - - tp->mss_cache = mss_now * factor; - - mss_now = tp->mss_cache; + xmit_size_goal -= (xmit_size_goal % mss_now); } + tp->xmit_size_goal = xmit_size_goal; - if (tp->rx_opt.eff_sacks) - mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + - (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); return mss_now; } @@ -876,6 +703,251 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) } } +static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd) +{ + u32 window, cwnd_len; + + window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq); + cwnd_len = mss_now * cwnd; + return min(window, cwnd_len); +} + +/* Can at least one segment of SKB be sent right now, according to the + * congestion window rules? If so, return how many segments are allowed. 
+ */ +static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb) +{ + u32 in_flight, cwnd; + + /* Don't be strict about the congestion window for the final FIN. */ + if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) + return 1; + + in_flight = tcp_packets_in_flight(tp); + cwnd = tp->snd_cwnd; + if (in_flight < cwnd) + return (cwnd - in_flight); + + return 0; +} + +/* This must be invoked the first time we consider transmitting + * SKB onto the wire. + */ +static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb) +{ + int tso_segs = tcp_skb_pcount(skb); + + if (!tso_segs) { + tcp_set_skb_tso_segs(sk, skb); + tso_segs = tcp_skb_pcount(skb); + } + return tso_segs; +} + +static inline int tcp_minshall_check(const struct tcp_sock *tp) +{ + return after(tp->snd_sml,tp->snd_una) && + !after(tp->snd_sml, tp->snd_nxt); +} + +/* Return 0, if packet can be sent now without violation Nagle's rules: + * 1. It is full sized. + * 2. Or it contains FIN. (already checked by caller) + * 3. Or TCP_NODELAY was set. + * 4. Or TCP_CORK is not set, and all sent packets are ACKed. + * With Minshall's modification: all sent small packets are ACKed. + */ + +static inline int tcp_nagle_check(const struct tcp_sock *tp, + const struct sk_buff *skb, + unsigned mss_now, int nonagle) +{ + return (skb->len < mss_now && + ((nonagle&TCP_NAGLE_CORK) || + (!nonagle && + tp->packets_out && + tcp_minshall_check(tp)))); +} + +/* Return non-zero if the Nagle test allows this packet to be + * sent now. + */ +static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, + unsigned int cur_mss, int nonagle) +{ + /* Nagle rule does not apply to frames, which sit in the middle of the + * write_queue (they have no chances to get new data). + * + * This is implemented in the callers, where they modify the 'nonagle' + * argument based upon the location of SKB in the send queue. + */ + if (nonagle & TCP_NAGLE_PUSH) + return 1; + + /* Don't use the nagle rule for urgent data (or for the final FIN). */ + if (tp->urg_mode || + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) + return 1; + + if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) + return 1; + + return 0; +} + +/* Does at least the first segment of SKB fit into the send window? */ +static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss) +{ + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + if (skb->len > cur_mss) + end_seq = TCP_SKB_CB(skb)->seq + cur_mss; + + return !after(end_seq, tp->snd_una + tp->snd_wnd); +} + +/* This checks if the data bearing packet SKB (usually sk->sk_send_head) + * should be put on the wire right now. If so, it returns the number of + * packets allowed by the congestion window. + */ +static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, + unsigned int cur_mss, int nonagle) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int cwnd_quota; + + tcp_init_tso_segs(sk, skb); + + if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) + return 0; + + cwnd_quota = tcp_cwnd_test(tp, skb); + if (cwnd_quota && + !tcp_snd_wnd_test(tp, skb, cur_mss)) + cwnd_quota = 0; + + return cwnd_quota; +} + +static inline int tcp_skb_is_last(const struct sock *sk, + const struct sk_buff *skb) +{ + return skb->next == (struct sk_buff *)&sk->sk_write_queue; +} + +int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) +{ + struct sk_buff *skb = sk->sk_send_head; + + return (skb && + tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), + (tcp_skb_is_last(sk, skb) ? 
+ TCP_NAGLE_PUSH : + tp->nonagle))); +} + +/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet + * which is put after SKB on the list. It is very much like + * tcp_fragment() except that it may make several kinds of assumptions + * in order to speed up the splitting operation. In particular, we + * know that all the data is in scatter-gather pages, and that the + * packet has never been sent out before (and thus is not cloned). + */ +static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len) +{ + struct sk_buff *buff; + int nlen = skb->len - len; + u16 flags; + + /* All of a TSO frame must be composed of paged data. */ + BUG_ON(skb->len != skb->data_len); + + buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC); + if (unlikely(buff == NULL)) + return -ENOMEM; + + buff->truesize = nlen; + skb->truesize -= nlen; + + /* Correct the sequence numbers. */ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; + + /* PSH and FIN should only be set in the second packet. */ + flags = TCP_SKB_CB(skb)->flags; + TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); + TCP_SKB_CB(buff)->flags = flags; + + /* This packet was never sent out yet, so no SACK bits. */ + TCP_SKB_CB(buff)->sacked = 0; + + buff->ip_summed = skb->ip_summed = CHECKSUM_HW; + skb_split(skb, buff, len); + + /* Fix up tso_factor for both original and new SKB. */ + tcp_set_skb_tso_segs(sk, skb); + tcp_set_skb_tso_segs(sk, buff); + + /* Link BUFF into the send queue. */ + skb_header_release(buff); + __skb_append(skb, buff); + + return 0; +} + +/* Try to defer sending, if possible, in order to minimize the amount + * of TSO splitting we do. View it as a kind of TSO Nagle test. + * + * This algorithm is from John Heffner. + */ +static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) +{ + u32 send_win, cong_win, limit, in_flight; + + if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) + return 0; + + in_flight = tcp_packets_in_flight(tp); + + BUG_ON(tcp_skb_pcount(skb) <= 1 || + (tp->snd_cwnd <= in_flight)); + + send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq; + + /* From in_flight test above, we know that cwnd > in_flight. */ + cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache; + + limit = min(send_win, cong_win); + + /* If sk_send_head can be sent fully now, just do it. */ + if (skb->len <= limit) + return 0; + + if (sysctl_tcp_tso_win_divisor) { + u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); + + /* If at least some fraction of a window is available, + * just use it. + */ + chunk /= sysctl_tcp_tso_win_divisor; + if (limit >= chunk) + return 0; + } else { + /* Different approach, try not to defer past a single + * ACK. Receiver should ACK every other full sized + * frame, so if we have space for more than 3 frames + * then send now. + */ + if (limit > tcp_max_burst(tp) * tp->mss_cache) + return 0; + } + + /* Ok, it looks like it is advisable to defer. */ + return 1; +} + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -887,8 +959,8 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int tso_segs, cwnd_quota; - int sent_pkts; + unsigned int tso_segs, sent_pkts; + int cwnd_quota; /* If we are closed, the bytes will have to remain here. 
* In time closedown will finish, we empty the write queue and all @@ -903,24 +975,44 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) tso_segs = tcp_init_tso_segs(sk, skb); cwnd_quota = tcp_cwnd_test(tp, skb); + if (unlikely(!cwnd_quota)) + goto out; + sent_pkts = 0; + while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) { + BUG_ON(!tso_segs); - while (cwnd_quota >= tso_segs) { - if (unlikely(!tcp_nagle_test(tp, skb, mss_now, - (tcp_skb_is_last(sk, skb) ? - nonagle : TCP_NAGLE_PUSH)))) - break; + if (tso_segs == 1) { + if (unlikely(!tcp_nagle_test(tp, skb, mss_now, + (tcp_skb_is_last(sk, skb) ? + nonagle : TCP_NAGLE_PUSH)))) + break; + } else { + if (tcp_tso_should_defer(sk, tp, skb)) + break; + } - if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) - break; + if (tso_segs > 1) { + u32 limit = tcp_window_allows(tp, skb, + mss_now, cwnd_quota); + + if (skb->len < limit) { + unsigned int trim = skb->len % mss_now; - if (unlikely(skb->len > mss_now)) { + if (trim) + limit = skb->len - trim; + } + if (skb->len > limit) { + if (tso_fragment(sk, skb, limit)) + break; + } + } else if (unlikely(skb->len > mss_now)) { if (unlikely(tcp_fragment(sk, skb, mss_now))) break; } TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); + if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))) break; @@ -936,6 +1028,11 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) * the packet above, tso_segs will no longer be valid. */ cwnd_quota -= tcp_skb_pcount(skb); + + BUG_ON(cwnd_quota < 0); + if (!cwnd_quota) + break; + skb = sk->sk_send_head; if (!skb) break; @@ -946,7 +1043,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) tcp_cwnd_validate(sk, tp); return 0; } - +out: return !tp->packets_out && sk->sk_send_head; } @@ -965,6 +1062,53 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, } } +/* Send _single_ skb sitting at the send head. This function requires + * true push pending frames to setup probe timer etc. + */ +void tcp_push_one(struct sock *sk, unsigned int mss_now) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = sk->sk_send_head; + unsigned int tso_segs, cwnd_quota; + + BUG_ON(!skb || skb->len < mss_now); + + tso_segs = tcp_init_tso_segs(sk, skb); + cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH); + + if (likely(cwnd_quota)) { + BUG_ON(!tso_segs); + + if (tso_segs > 1) { + u32 limit = tcp_window_allows(tp, skb, + mss_now, cwnd_quota); + + if (skb->len < limit) { + unsigned int trim = skb->len % mss_now; + + if (trim) + limit = skb->len - trim; + } + if (skb->len > limit) { + if (unlikely(tso_fragment(sk, skb, limit))) + return; + } + } else if (unlikely(skb->len > mss_now)) { + if (unlikely(tcp_fragment(sk, skb, mss_now))) + return; + } + + /* Send it out now. 
*/ + TCP_SKB_CB(skb)->when = tcp_time_stamp; + + if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) { + update_send_head(sk, tp, skb); + tcp_cwnd_validate(sk, tp); + return; + } + } +} + /* This function returns the amount that we can raise the * usable window based on the following constraints * @@ -1222,7 +1366,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (sk->sk_route_caps & NETIF_F_TSO) { sk->sk_route_caps &= ~NETIF_F_TSO; sock_set_flag(sk, SOCK_NO_LARGESEND); - tp->mss_cache = tp->mss_cache_std; } if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) @@ -1284,7 +1427,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * is still in somebody's hands, else make a clone. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); err = tcp_transmit_skb(sk, (skb_cloned(skb) ? pskb_copy(skb, GFP_ATOMIC): @@ -1853,14 +1995,12 @@ int tcp_write_wakeup(struct sock *sk) if (sk->sk_route_caps & NETIF_F_TSO) { sock_set_flag(sk, SOCK_NO_LARGESEND); sk->sk_route_caps &= ~NETIF_F_TSO; - tp->mss_cache = tp->mss_cache_std; } } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(sk, skb); TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); if (!err) { update_send_head(sk, tp, skb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9dac7fd..f6e288d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2018,7 +2018,7 @@ static int tcp_v6_init_sock(struct sock *sk) */ tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_clamp = ~0; - tp->mss_cache_std = tp->mss_cache = 536; + tp->mss_cache = 536; tp->reordering = sysctl_tcp_reordering; -- cgit v0.10.2 From 63d886c96b2a580b1bf764de238ba3c63515b5ee Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 5 Jul 2005 15:29:16 -0700 Subject: [PKT_SCHED]: Blackhole queueing discipline Useful in combination with classful qdiscs to drop or temporary disable certain flows, e.g. one could block specific ds flows with dsmark. Unlike the noop qdisc it can be controlled by the user and statistic accounting is done. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller diff --git a/net/sched/Makefile b/net/sched/Makefile index 8f58cec..e48d0d4 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -4,7 +4,7 @@ obj-y := sch_generic.o -obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o +obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o sch_blackhole.o obj-$(CONFIG_NET_CLS) += cls_api.o obj-$(CONFIG_NET_CLS_ACT) += act_api.o obj-$(CONFIG_NET_ACT_POLICE) += police.o diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c new file mode 100644 index 0000000..81f0b83 --- /dev/null +++ b/net/sched/sch_blackhole.c @@ -0,0 +1,54 @@ +/* + * net/sched/sch_blackhole.c Black hole queue + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf + * + * Note: Quantum tunneling is not supported. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + qdisc_drop(skb, sch); + return NET_XMIT_SUCCESS; +} + +static struct sk_buff *blackhole_dequeue(struct Qdisc *sch) +{ + return NULL; +} + +static struct Qdisc_ops blackhole_qdisc_ops = { + .id = "blackhole", + .priv_size = 0, + .enqueue = blackhole_enqueue, + .dequeue = blackhole_dequeue, + .owner = THIS_MODULE, +}; + +static int __init blackhole_module_init(void) +{ + return register_qdisc(&blackhole_qdisc_ops); +} + +static void __exit blackhole_module_exit(void) +{ + unregister_qdisc(&blackhole_qdisc_ops); +} + +module_init(blackhole_module_init) +module_exit(blackhole_module_exit) + +MODULE_LICENSE("GPL"); -- cgit v0.10.2 From 908a75c17a9e5a888347c2c1d3572203d1b1c7db Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:43:58 -0700 Subject: [TCP]: Never TSO defer under periods of congestion. Congestion window recover after loss depends upon the fact that if we have a full MSS sized frame at the head of the send queue, we will send it. TSO deferral can defeat the ACK clocking necessary to exit cleanly from recovery. Signed-off-by: David S. Miller diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fd3ce38..e041d05 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -909,6 +909,9 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_ if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) return 0; + if (tp->ca_state != TCP_CA_Open) + return 0; + in_flight = tcp_packets_in_flight(tp); BUG_ON(tcp_skb_pcount(skb) <= 1 || -- cgit v0.10.2 From b2f571026594884e7a2a3f8bc6ad5c92e0703330 Mon Sep 17 00:00:00 2001 From: Robert Olsson Date: Tue, 5 Jul 2005 16:38:26 -0700 Subject: [IPV4]: Add LC-Trie implementation notes Signed-off-by: Robert Olsson Signed-off-by: David S. Miller diff --git a/Documentation/networking/fib_trie.txt b/Documentation/networking/fib_trie.txt new file mode 100644 index 0000000..f50d0c6 --- /dev/null +++ b/Documentation/networking/fib_trie.txt @@ -0,0 +1,145 @@ + LC-trie implementation notes. + +Node types +---------- +leaf + An end node with data. This has a copy of the relevant key, along + with 'hlist' with routing table entries sorted by prefix length. + See struct leaf and struct leaf_info. + +trie node or tnode + An internal node, holding an array of child (leaf or tnode) pointers, + indexed through a subset of the key. See Level Compression. + +A few concepts explained +------------------------ +Bits (tnode) + The number of bits in the key segment used for indexing into the + child array - the "child index". See Level Compression. + +Pos (tnode) + The position (in the key) of the key segment used for indexing into + the child array. See Path Compression. + +Path Compression / skipped bits + Any given tnode is linked to from the child array of its parent, using + a segment of the key specified by the parent's "pos" and "bits" + In certain cases, this tnode's own "pos" will not be immediately + adjacent to the parent (pos+bits), but there will be some bits + in the key skipped over because they represent a single path with no + deviations. These "skipped bits" constitute Path Compression. + Note that the search algorithm will simply skip over these bits when + searching, making it necessary to save the keys in the leaves to + verify that they actually do match the key we are searching for. 
+ +Level Compression / child arrays + the trie is kept level balanced by moving, under certain conditions, the + children of a full child (see "full_children") up one level, so that + instead of a pure binary tree, each internal node ("tnode") may + contain an arbitrarily large array of links to several children. + Conversely, a tnode with a mostly empty child array (see empty_children) + may be "halved", having some of its children moved downwards one level, + in order to avoid ever-increasing child arrays. + +empty_children + the number of positions in the child array of a given tnode that are + NULL. + +full_children + the number of children of a given tnode that aren't path compressed. + (in other words, they aren't NULL or leaves and their "pos" is equal + to this tnode's "pos"+"bits"). + + (The word "full" here is used more in the sense of "complete" than + as the opposite of "empty", which might be a tad confusing.) + +Comments +--------- + +We have tried to keep the structure of the code as close to fib_hash as +possible to allow verification and help with reviewing. + +fib_find_node() + A good start for understanding this code. This function implements a + straightforward trie lookup. + +fib_insert_node() + Inserts a new leaf node in the trie. This is a bit more complicated than + fib_find_node(). Inserting a new node means we might have to run the + level compression algorithm on part of the trie. + +trie_leaf_remove() + Looks up a key, deletes it and runs the level compression algorithm. + +trie_rebalance() + The key function for the dynamic trie; after any change in the trie + it is run to optimize and reorganize. It will walk the trie upwards + towards the root from a given tnode, doing a resize() at each step + to implement level compression. + +resize() + Analyzes a tnode and optimizes the child array size by either inflating + or shrinking it repeatedly until it fulfills the criteria for optimal + level compression. This part follows the original paper pretty closely + and there may be some room for experimentation here. + +inflate() + Doubles the size of the child array within a tnode. Used by resize(). + +halve() + Halves the size of the child array within a tnode - the inverse of + inflate(). Used by resize(). + +fn_trie_insert(), fn_trie_delete(), fn_trie_select_default() + The route manipulation functions. Should conform pretty closely to the + corresponding functions in fib_hash. + +fn_trie_flush() + This walks the full trie (using nextleaf()) and searches for empty + leaves which have to be removed. + +fn_trie_dump() + Dumps the routing table ordered by prefix length. This is somewhat + slower than the corresponding fib_hash function, as we have to walk the + entire trie for each prefix length. In comparison, fib_hash is organized + as one "zone"/hash per prefix length. + +Locking +------- + +fib_lock is used as an RW-lock in the same way that this is done in fib_hash. +However, the functions are somewhat separated for other possible locking +scenarios. It might conceivably be possible to run trie_rebalance via RCU +to avoid read_lock in the fn_trie_lookup() function. + +Main lookup mechanism +--------------------- +fn_trie_lookup() is the main lookup function. + +The lookup is in its simplest form just like fib_find_node(). We descend the +trie, key segment by key segment, until we find a leaf. check_leaf() does +the fib_semantic_match in the leaf's sorted prefix hlist. + +If we find a match, we are done. + +If we don't find a match, we enter prefix matching mode. 
The prefix length, +starting out at the same as the key length, is reduced one step at a time, +and we backtrack upwards through the trie trying to find the longest matching +prefix. The goal is always to reach a leaf and get a positive result from the +fib_semantic_match mechanism. + +Inside each tnode, the search for the longest matching prefix consists of searching +through the child array, chopping off (zeroing) the least significant "1" of +the child index until we find a match or the child index consists of nothing but +zeros. + +At this point we backtrack (t->stats.backtrack++) up the trie, continuing to +chop off part of the key in order to find the longest matching prefix. + +While backtracking, we will repeatedly descend subtries to look for a match, and there +are some optimizations available that can provide us with "shortcuts" to avoid +descending into dead ends. Look for "HL_OPTIMIZE" sections in the code. + +To alleviate any doubts about the correctness of the route selection process, +a new netlink operation has been added. Look for NETLINK_FIB_LOOKUP, which +gives userland access to fib_lookup(). -- cgit v0.10.2
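The LC-trie notes lean on two bit operations: extracting a child index from a key using a tnode's "pos" and "bits", and zeroing the least significant "1" of that index while backtracking. The stand-alone C sketch below illustrates both. It is not taken from fib_trie.c; the function name extract_bits and the example key/pos/bits values are made up for illustration and only loosely mirror the kernel's tkey_extract_bits() helper.

#include <stdio.h>
#include <stdint.h>

/* Take "bits" bits of "key", starting "pos" bits from the most
 * significant end (a rough analogue of tkey_extract_bits()).
 */
static uint32_t extract_bits(uint32_t key, int pos, int bits)
{
	if (bits == 0)
		return 0;
	return (uint32_t)(key << pos) >> (32 - bits);
}

int main(void)
{
	uint32_t key = 0xc0a80105;	/* 192.168.1.5 in host byte order */
	int pos = 24, bits = 8;		/* hypothetical tnode with 256 children */
	uint32_t idx = extract_bits(key, pos, bits);

	printf("child index: %u\n", idx);

	/* Backtracking step described in the notes: zero the least
	 * significant set bit of the child index, i.e. retry with a
	 * shorter prefix, until the index consists of nothing but zeros.
	 */
	while (idx) {
		idx &= idx - 1;
		printf("retry with child index: %u\n", idx);
	}
	return 0;
}

In the kernel the extraction works on t_key values and is interleaved with descending into subtries and the HL_OPTIMIZE shortcuts mentioned above; this sketch only isolates the index arithmetic the text refers to.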