diff options
author | Scott Wood <scottwood@freescale.com> | 2013-12-14 01:15:24 (GMT) |
---|---|---|
committer | Scott Wood <scottwood@freescale.com> | 2013-12-14 01:15:24 (GMT) |
commit | b7c81aa3ab2ac2c140e278b6d0e9a0b95112cf0b (patch) | |
tree | 87828aaf5f82c7042bfc0307bc4ac499e10f93fb /net/sched | |
parent | 22c782a4b14773fab7eab3c1db54ad7ad077e9b8 (diff) | |
parent | 78fd82238d0e5716578c326404184a27ba67fd6e (diff) | |
download | linux-fsl-qoriq-b7c81aa3ab2ac2c140e278b6d0e9a0b95112cf0b.tar.xz |
Merge remote-tracking branch 'linus/master' into merge
Conflicts:
Documentation/hwmon/ina2xx
arch/powerpc/Kconfig
arch/powerpc/boot/dts/b4860emu.dts
arch/powerpc/boot/dts/b4qds.dtsi
arch/powerpc/boot/dts/fsl/b4si-post.dtsi
arch/powerpc/boot/dts/fsl/qoriq-sec6.0-0.dtsi
arch/powerpc/boot/dts/p1023rdb.dts
arch/powerpc/boot/dts/t4240emu.dts
arch/powerpc/boot/dts/t4240qds.dts
arch/powerpc/configs/85xx/p1023_defconfig
arch/powerpc/configs/corenet32_smp_defconfig
arch/powerpc/configs/corenet64_smp_defconfig
arch/powerpc/configs/mpc85xx_smp_defconfig
arch/powerpc/include/asm/cputable.h
arch/powerpc/include/asm/device.h
arch/powerpc/include/asm/epapr_hcalls.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/mpic.h
arch/powerpc/include/asm/pci.h
arch/powerpc/include/asm/ppc-opcode.h
arch/powerpc/include/asm/ppc_asm.h
arch/powerpc/include/asm/reg_booke.h
arch/powerpc/kernel/epapr_paravirt.c
arch/powerpc/kernel/process.c
arch/powerpc/kernel/prom.c
arch/powerpc/kernel/setup-common.c
arch/powerpc/kernel/setup_32.c
arch/powerpc/kernel/setup_64.c
arch/powerpc/kernel/smp.c
arch/powerpc/kernel/swsusp_asm64.S
arch/powerpc/kernel/swsusp_booke.S
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/booke.h
arch/powerpc/kvm/e500.c
arch/powerpc/kvm/e500.h
arch/powerpc/kvm/e500_emulate.c
arch/powerpc/kvm/e500mc.c
arch/powerpc/kvm/powerpc.c
arch/powerpc/perf/e6500-pmu.c
arch/powerpc/platforms/85xx/Kconfig
arch/powerpc/platforms/85xx/Makefile
arch/powerpc/platforms/85xx/b4_qds.c
arch/powerpc/platforms/85xx/c293pcie.c
arch/powerpc/platforms/85xx/corenet_ds.c
arch/powerpc/platforms/85xx/corenet_ds.h
arch/powerpc/platforms/85xx/p1023_rds.c
arch/powerpc/platforms/85xx/p2041_rdb.c
arch/powerpc/platforms/85xx/p3041_ds.c
arch/powerpc/platforms/85xx/p4080_ds.c
arch/powerpc/platforms/85xx/p5020_ds.c
arch/powerpc/platforms/85xx/p5040_ds.c
arch/powerpc/platforms/85xx/smp.c
arch/powerpc/platforms/85xx/t4240_qds.c
arch/powerpc/platforms/Kconfig
arch/powerpc/sysdev/Makefile
arch/powerpc/sysdev/fsl_mpic_timer_wakeup.c
arch/powerpc/sysdev/fsl_msi.c
arch/powerpc/sysdev/fsl_pci.c
arch/powerpc/sysdev/fsl_pci.h
arch/powerpc/sysdev/fsl_soc.h
arch/powerpc/sysdev/mpic.c
arch/powerpc/sysdev/mpic_timer.c
drivers/Kconfig
drivers/clk/Kconfig
drivers/clk/clk-ppc-corenet.c
drivers/cpufreq/Kconfig.powerpc
drivers/cpufreq/Makefile
drivers/cpufreq/ppc-corenet-cpufreq.c
drivers/crypto/caam/Kconfig
drivers/crypto/caam/Makefile
drivers/crypto/caam/ctrl.c
drivers/crypto/caam/desc_constr.h
drivers/crypto/caam/intern.h
drivers/crypto/caam/jr.c
drivers/crypto/caam/regs.h
drivers/dma/fsldma.c
drivers/hwmon/ina2xx.c
drivers/iommu/Kconfig
drivers/iommu/fsl_pamu.c
drivers/iommu/fsl_pamu.h
drivers/iommu/fsl_pamu_domain.c
drivers/iommu/fsl_pamu_domain.h
drivers/misc/Makefile
drivers/mmc/card/block.c
drivers/mmc/core/core.c
drivers/mmc/host/sdhci-esdhc.h
drivers/mmc/host/sdhci-pltfm.c
drivers/mtd/nand/fsl_ifc_nand.c
drivers/net/ethernet/freescale/gianfar.c
drivers/net/ethernet/freescale/gianfar.h
drivers/net/ethernet/freescale/gianfar_ethtool.c
drivers/net/phy/at803x.c
drivers/net/phy/phy_device.c
drivers/net/phy/vitesse.c
drivers/pci/msi.c
drivers/staging/Kconfig
drivers/staging/Makefile
drivers/uio/Kconfig
drivers/uio/Makefile
drivers/uio/uio.c
drivers/usb/host/ehci-fsl.c
drivers/vfio/Kconfig
drivers/vfio/Makefile
include/crypto/algapi.h
include/linux/iommu.h
include/linux/mmc/sdhci.h
include/linux/msi.h
include/linux/netdev_features.h
include/linux/phy.h
include/linux/skbuff.h
include/net/ip.h
include/uapi/linux/vfio.h
net/core/ethtool.c
net/ipv4/route.c
net/ipv6/route.c
Diffstat (limited to 'net/sched')
-rw-r--r-- | net/sched/Kconfig | 24 | ||||
-rw-r--r-- | net/sched/Makefile | 2 | ||||
-rw-r--r-- | net/sched/act_mirred.c | 2 | ||||
-rw-r--r-- | net/sched/act_police.c | 4 | ||||
-rw-r--r-- | net/sched/cls_basic.c | 2 | ||||
-rw-r--r-- | net/sched/cls_bpf.c | 385 | ||||
-rw-r--r-- | net/sched/cls_cgroup.c | 43 | ||||
-rw-r--r-- | net/sched/em_ipset.c | 7 | ||||
-rw-r--r-- | net/sched/em_meta.c | 4 | ||||
-rw-r--r-- | net/sched/sch_api.c | 97 | ||||
-rw-r--r-- | net/sched/sch_atm.c | 1 | ||||
-rw-r--r-- | net/sched/sch_cbq.c | 3 | ||||
-rw-r--r-- | net/sched/sch_choke.c | 3 | ||||
-rw-r--r-- | net/sched/sch_drr.c | 2 | ||||
-rw-r--r-- | net/sched/sch_fq.c | 822 | ||||
-rw-r--r-- | net/sched/sch_generic.c | 83 | ||||
-rw-r--r-- | net/sched/sch_hfsc.c | 2 | ||||
-rw-r--r-- | net/sched/sch_htb.c | 293 | ||||
-rw-r--r-- | net/sched/sch_mq.c | 2 | ||||
-rw-r--r-- | net/sched/sch_mqprio.c | 2 | ||||
-rw-r--r-- | net/sched/sch_netem.c | 141 | ||||
-rw-r--r-- | net/sched/sch_qfq.c | 214 | ||||
-rw-r--r-- | net/sched/sch_tbf.c | 87 |
23 files changed, 1898 insertions, 327 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 235e01a..ad1f1d8 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -272,6 +272,20 @@ config NET_SCH_FQ_CODEL If unsure, say N. +config NET_SCH_FQ + tristate "Fair Queue" + help + Say Y here if you want to use the FQ packet scheduling algorithm. + + FQ does flow separation, and is able to respect pacing requirements + set by TCP stack into sk->sk_pacing_rate (for localy generated + traffic) + + To compile this driver as a module, choose M here: the module + will be called sch_fq. + + If unsure, say N. + config NET_SCH_INGRESS tristate "Ingress Qdisc" depends on NET_CLS_ACT @@ -429,6 +443,16 @@ config NET_CLS_CGROUP To compile this code as a module, choose M here: the module will be called cls_cgroup. +config NET_CLS_BPF + tristate "BPF-based classifier" + select NET_CLS + ---help--- + If you say Y here, you will be able to classify packets based on + programmable BPF (JIT'ed) filters as an alternative to ematches. + + To compile this code as a module, choose M here: the module will + be called cls_bpf. + config NET_EMATCH bool "Extended Matches" select NET_CLS diff --git a/net/sched/Makefile b/net/sched/Makefile index 978cbf0..35fa47a 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o +obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o @@ -49,6 +50,7 @@ obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o +obj-$(CONFIG_NET_CLS_BPF) += cls_bpf.o obj-$(CONFIG_NET_EMATCH) += ematch.o obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 5d676ed..977c10e 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -243,7 +243,7 @@ nla_put_failure: static int mirred_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tcf_mirred *m; if (event == NETDEV_UNREGISTER) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 189e3c5..272d8e9 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -231,14 +231,14 @@ override: } if (R_tab) { police->rate_present = true; - psched_ratecfg_precompute(&police->rate, &R_tab->rate); + psched_ratecfg_precompute(&police->rate, &R_tab->rate, 0); qdisc_put_rtab(R_tab); } else { police->rate_present = false; } if (P_tab) { police->peak_present = true; - psched_ratecfg_precompute(&police->peak, &P_tab->rate); + psched_ratecfg_precompute(&police->peak, &P_tab->rate, 0); qdisc_put_rtab(P_tab); } else { police->peak_present = false; diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index d76a35d..636d913 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -137,7 +137,7 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *est) { - int err = -EINVAL; + int err; struct tcf_exts e; struct tcf_ematch_tree t; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c new file mode 100644 index 0000000..1002a82 --- /dev/null +++ b/net/sched/cls_bpf.c @@ -0,0 +1,385 @@ +/* + * Berkeley Packet Filter based traffic classifier + * + * Might be used to classify traffic through flexible, user-defined and + * possibly JIT-ed BPF filters for traffic control as an alternative to + * ematches. + * + * (C) 2013 Daniel Borkmann <dborkman@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/filter.h> +#include <net/rtnetlink.h> +#include <net/pkt_cls.h> +#include <net/sock.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); +MODULE_DESCRIPTION("TC BPF based classifier"); + +struct cls_bpf_head { + struct list_head plist; + u32 hgen; +}; + +struct cls_bpf_prog { + struct sk_filter *filter; + struct sock_filter *bpf_ops; + struct tcf_exts exts; + struct tcf_result res; + struct list_head link; + u32 handle; + u16 bpf_len; +}; + +static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { + [TCA_BPF_CLASSID] = { .type = NLA_U32 }, + [TCA_BPF_OPS_LEN] = { .type = NLA_U16 }, + [TCA_BPF_OPS] = { .type = NLA_BINARY, + .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, +}; + +static const struct tcf_ext_map bpf_ext_map = { + .action = TCA_BPF_ACT, + .police = TCA_BPF_POLICE, +}; + +static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog; + int ret; + + list_for_each_entry(prog, &head->plist, link) { + int filter_res = SK_RUN_FILTER(prog->filter, skb); + + if (filter_res == 0) + continue; + + *res = prog->res; + if (filter_res != -1) + res->classid = filter_res; + + ret = tcf_exts_exec(skb, &prog->exts, res); + if (ret < 0) + continue; + + return ret; + } + + return -1; +} + +static int cls_bpf_init(struct tcf_proto *tp) +{ + struct cls_bpf_head *head; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (head == NULL) + return -ENOBUFS; + + INIT_LIST_HEAD(&head->plist); + tp->root = head; + + return 0; +} + +static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog) +{ + tcf_unbind_filter(tp, &prog->res); + tcf_exts_destroy(tp, &prog->exts); + + sk_unattached_filter_destroy(prog->filter); + + kfree(prog->bpf_ops); + kfree(prog); +} + +static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog, *todel = (struct cls_bpf_prog *) arg; + + list_for_each_entry(prog, &head->plist, link) { + if (prog == todel) { + tcf_tree_lock(tp); + list_del(&prog->link); + tcf_tree_unlock(tp); + + cls_bpf_delete_prog(tp, prog); + return 0; + } + } + + return -ENOENT; +} + +static void cls_bpf_destroy(struct tcf_proto *tp) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog, *tmp; + + list_for_each_entry_safe(prog, tmp, &head->plist, link) { + list_del(&prog->link); + cls_bpf_delete_prog(tp, prog); + } + + kfree(head); +} + +static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog; + unsigned long ret = 0UL; + + if (head == NULL) + return 0UL; + + list_for_each_entry(prog, &head->plist, link) { + if (prog->handle == handle) { + ret = (unsigned long) prog; + break; + } + } + + return ret; +} + +static void cls_bpf_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, + struct cls_bpf_prog *prog, + unsigned long base, struct nlattr **tb, + struct nlattr *est) +{ + struct sock_filter *bpf_ops, *bpf_old; + struct tcf_exts exts; + struct sock_fprog tmp; + struct sk_filter *fp, *fp_old; + u16 bpf_size, bpf_len; + u32 classid; + int ret; + + if (!tb[TCA_BPF_OPS_LEN] || !tb[TCA_BPF_OPS] || !tb[TCA_BPF_CLASSID]) + return -EINVAL; + + ret = tcf_exts_validate(net, tp, tb, est, &exts, &bpf_ext_map); + if (ret < 0) + return ret; + + classid = nla_get_u32(tb[TCA_BPF_CLASSID]); + bpf_len = nla_get_u16(tb[TCA_BPF_OPS_LEN]); + if (bpf_len > BPF_MAXINSNS || bpf_len == 0) { + ret = -EINVAL; + goto errout; + } + + bpf_size = bpf_len * sizeof(*bpf_ops); + bpf_ops = kzalloc(bpf_size, GFP_KERNEL); + if (bpf_ops == NULL) { + ret = -ENOMEM; + goto errout; + } + + memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size); + + tmp.len = bpf_len; + tmp.filter = (struct sock_filter __user *) bpf_ops; + + ret = sk_unattached_filter_create(&fp, &tmp); + if (ret) + goto errout_free; + + tcf_tree_lock(tp); + fp_old = prog->filter; + bpf_old = prog->bpf_ops; + + prog->bpf_len = bpf_len; + prog->bpf_ops = bpf_ops; + prog->filter = fp; + prog->res.classid = classid; + tcf_tree_unlock(tp); + + tcf_bind_filter(tp, &prog->res, base); + tcf_exts_change(tp, &prog->exts, &exts); + + if (fp_old) + sk_unattached_filter_destroy(fp_old); + if (bpf_old) + kfree(bpf_old); + + return 0; + +errout_free: + kfree(bpf_ops); +errout: + tcf_exts_destroy(tp, &exts); + return ret; +} + +static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, + struct cls_bpf_head *head) +{ + unsigned int i = 0x80000000; + + do { + if (++head->hgen == 0x7FFFFFFF) + head->hgen = 1; + } while (--i > 0 && cls_bpf_get(tp, head->hgen)); + if (i == 0) + pr_err("Insufficient number of handles\n"); + + return i; +} + +static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, + u32 handle, struct nlattr **tca, + unsigned long *arg) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog = (struct cls_bpf_prog *) *arg; + struct nlattr *tb[TCA_BPF_MAX + 1]; + int ret; + + if (tca[TCA_OPTIONS] == NULL) + return -EINVAL; + + ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy); + if (ret < 0) + return ret; + + if (prog != NULL) { + if (handle && prog->handle != handle) + return -EINVAL; + return cls_bpf_modify_existing(net, tp, prog, base, tb, + tca[TCA_RATE]); + } + + prog = kzalloc(sizeof(*prog), GFP_KERNEL); + if (prog == NULL) + return -ENOBUFS; + + if (handle == 0) + prog->handle = cls_bpf_grab_new_handle(tp, head); + else + prog->handle = handle; + if (prog->handle == 0) { + ret = -EINVAL; + goto errout; + } + + ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE]); + if (ret < 0) + goto errout; + + tcf_tree_lock(tp); + list_add(&prog->link, &head->plist); + tcf_tree_unlock(tp); + + *arg = (unsigned long) prog; + + return 0; +errout: + if (*arg == 0UL && prog) + kfree(prog); + + return ret; +} + +static int cls_bpf_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *tm) +{ + struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh; + struct nlattr *nest, *nla; + + if (prog == NULL) + return skb->len; + + tm->tcm_handle = prog->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid)) + goto nla_put_failure; + if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_len)) + goto nla_put_failure; + + nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_len * + sizeof(struct sock_filter)); + if (nla == NULL) + goto nla_put_failure; + + memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); + + if (tcf_exts_dump(skb, &prog->exts, &bpf_ext_map) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &prog->exts, &bpf_ext_map) < 0) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct cls_bpf_head *head = tp->root; + struct cls_bpf_prog *prog; + + list_for_each_entry(prog, &head->plist, link) { + if (arg->count < arg->skip) + goto skip; + if (arg->fn(tp, (unsigned long) prog, arg) < 0) { + arg->stop = 1; + break; + } +skip: + arg->count++; + } +} + +static struct tcf_proto_ops cls_bpf_ops __read_mostly = { + .kind = "bpf", + .owner = THIS_MODULE, + .classify = cls_bpf_classify, + .init = cls_bpf_init, + .destroy = cls_bpf_destroy, + .get = cls_bpf_get, + .put = cls_bpf_put, + .change = cls_bpf_change, + .delete = cls_bpf_delete, + .walk = cls_bpf_walk, + .dump = cls_bpf_dump, +}; + +static int __init cls_bpf_init_mod(void) +{ + return register_tcf_proto_ops(&cls_bpf_ops); +} + +static void __exit cls_bpf_exit_mod(void) +{ + unregister_tcf_proto_ops(&cls_bpf_ops); +} + +module_init(cls_bpf_init_mod); +module_exit(cls_bpf_exit_mod); diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 3a294eb..16006c9 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -23,19 +23,18 @@ #include <net/sock.h> #include <net/cls_cgroup.h> -static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp) +static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css) { - return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id), - struct cgroup_cls_state, css); + return css ? container_of(css, struct cgroup_cls_state, css) : NULL; } static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) { - return container_of(task_subsys_state(p, net_cls_subsys_id), - struct cgroup_cls_state, css); + return css_cls_state(task_css(p, net_cls_subsys_id)); } -static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cgrp_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_cls_state *cs; @@ -45,17 +44,19 @@ static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) return &cs->css; } -static int cgrp_css_online(struct cgroup *cgrp) +static int cgrp_css_online(struct cgroup_subsys_state *css) { - if (cgrp->parent) - cgrp_cls_state(cgrp)->classid = - cgrp_cls_state(cgrp->parent)->classid; + struct cgroup_cls_state *cs = css_cls_state(css); + struct cgroup_cls_state *parent = css_cls_state(css_parent(css)); + + if (parent) + cs->classid = parent->classid; return 0; } -static void cgrp_css_free(struct cgroup *cgrp) +static void cgrp_css_free(struct cgroup_subsys_state *css) { - kfree(cgrp_cls_state(cgrp)); + kfree(css_cls_state(css)); } static int update_classid(const void *v, struct file *file, unsigned n) @@ -67,27 +68,29 @@ static int update_classid(const void *v, struct file *file, unsigned n) return 0; } -static void cgrp_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void cgrp_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { struct task_struct *p; - void *v; + struct cgroup_cls_state *cs = css_cls_state(css); + void *v = (void *)(unsigned long)cs->classid; - cgroup_taskset_for_each(p, cgrp, tset) { + cgroup_taskset_for_each(p, css, tset) { task_lock(p); - v = (void *)(unsigned long)task_cls_classid(p); iterate_fd(p->files, 0, update_classid, v); task_unlock(p); } } -static u64 read_classid(struct cgroup *cgrp, struct cftype *cft) +static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) { - return cgrp_cls_state(cgrp)->classid; + return css_cls_state(css)->classid; } -static int write_classid(struct cgroup *cgrp, struct cftype *cft, u64 value) +static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, + u64 value) { - cgrp_cls_state(cgrp)->classid = (u32) value; + css_cls_state(css)->classid = (u32) value; return 0; } diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c index 938b7cb..527aeb7 100644 --- a/net/sched/em_ipset.c +++ b/net/sched/em_ipset.c @@ -24,11 +24,12 @@ static int em_ipset_change(struct tcf_proto *tp, void *data, int data_len, { struct xt_set_info *set = data; ip_set_id_t index; + struct net *net = dev_net(qdisc_dev(tp->q)); if (data_len != sizeof(*set)) return -EINVAL; - index = ip_set_nfnl_get_byindex(set->index); + index = ip_set_nfnl_get_byindex(net, set->index); if (index == IPSET_INVALID_ID) return -ENOENT; @@ -37,7 +38,7 @@ static int em_ipset_change(struct tcf_proto *tp, void *data, int data_len, if (em->data) return 0; - ip_set_nfnl_put(index); + ip_set_nfnl_put(net, index); return -ENOMEM; } @@ -45,7 +46,7 @@ static void em_ipset_destroy(struct tcf_proto *p, struct tcf_ematch *em) { const struct xt_set_info *set = (const void *) em->data; if (set) { - ip_set_nfnl_put(set->index); + ip_set_nfnl_put(dev_net(qdisc_dev(p->q)), set->index); kfree((void *) em->data); } } diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 7c3de6f..e5cef956 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -793,8 +793,10 @@ static int em_meta_change(struct tcf_proto *tp, void *data, int len, goto errout; meta = kzalloc(sizeof(*meta), GFP_KERNEL); - if (meta == NULL) + if (meta == NULL) { + err = -ENOMEM; goto errout; + } memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left)); memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right)); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 281c1bd..cd81505 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -200,6 +200,58 @@ int unregister_qdisc(struct Qdisc_ops *qops) } EXPORT_SYMBOL(unregister_qdisc); +/* Get default qdisc if not otherwise specified */ +void qdisc_get_default(char *name, size_t len) +{ + read_lock(&qdisc_mod_lock); + strlcpy(name, default_qdisc_ops->id, len); + read_unlock(&qdisc_mod_lock); +} + +static struct Qdisc_ops *qdisc_lookup_default(const char *name) +{ + struct Qdisc_ops *q = NULL; + + for (q = qdisc_base; q; q = q->next) { + if (!strcmp(name, q->id)) { + if (!try_module_get(q->owner)) + q = NULL; + break; + } + } + + return q; +} + +/* Set new default qdisc to use */ +int qdisc_set_default(const char *name) +{ + const struct Qdisc_ops *ops; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + write_lock(&qdisc_mod_lock); + ops = qdisc_lookup_default(name); + if (!ops) { + /* Not found, drop lock and try to load module */ + write_unlock(&qdisc_mod_lock); + request_module("sch_%s", name); + write_lock(&qdisc_mod_lock); + + ops = qdisc_lookup_default(name); + } + + if (ops) { + /* Set new default */ + module_put(default_qdisc_ops->owner); + default_qdisc_ops = ops; + } + write_unlock(&qdisc_mod_lock); + + return ops ? 0 : -ENOENT; +} + /* We know handle. Find qdisc among all qdisc's attached to device (root qdisc, all its children, children of children etc.) */ @@ -285,6 +337,45 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) return q; } +/* The linklayer setting were not transferred from iproute2, in older + * versions, and the rate tables lookup systems have been dropped in + * the kernel. To keep backward compatible with older iproute2 tc + * utils, we detect the linklayer setting by detecting if the rate + * table were modified. + * + * For linklayer ATM table entries, the rate table will be aligned to + * 48 bytes, thus some table entries will contain the same value. The + * mpu (min packet unit) is also encoded into the old rate table, thus + * starting from the mpu, we find low and high table entries for + * mapping this cell. If these entries contain the same value, when + * the rate tables have been modified for linklayer ATM. + * + * This is done by rounding mpu to the nearest 48 bytes cell/entry, + * and then roundup to the next cell, calc the table entry one below, + * and compare. + */ +static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab) +{ + int low = roundup(r->mpu, 48); + int high = roundup(low+1, 48); + int cell_low = low >> r->cell_log; + int cell_high = (high >> r->cell_log) - 1; + + /* rtab is too inaccurate at rates > 100Mbit/s */ + if ((r->rate > (100000000/8)) || (rtab[0] == 0)) { + pr_debug("TC linklayer: Giving up ATM detection\n"); + return TC_LINKLAYER_ETHERNET; + } + + if ((cell_high > cell_low) && (cell_high < 256) + && (rtab[cell_low] == rtab[cell_high])) { + pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n", + cell_low, cell_high, rtab[cell_high]); + return TC_LINKLAYER_ATM; + } + return TC_LINKLAYER_ETHERNET; +} + static struct qdisc_rate_table *qdisc_rtab_list; struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab) @@ -308,6 +399,8 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *ta rtab->rate = *r; rtab->refcnt = 1; memcpy(rtab->data, nla_data(tab), 1024); + if (r->linklayer == TC_LINKLAYER_UNAWARE) + r->linklayer = __detect_linklayer(r, rtab->data); rtab->next = qdisc_rtab_list; qdisc_rtab_list = rtab; } @@ -644,9 +737,11 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) const struct Qdisc_class_ops *cops; unsigned long cl; u32 parentid; + int drops; if (n == 0) return; + drops = max_t(int, n, 0); while ((parentid = sch->parent)) { if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS)) return; @@ -663,6 +758,7 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) cops->put(sch, cl); } sch->q.qlen -= n; + sch->qstats.drops += drops; } } EXPORT_SYMBOL(qdisc_tree_decrease_qlen); @@ -1813,6 +1909,7 @@ static int __init pktsched_init(void) return err; } + register_qdisc(&pfifo_fast_ops); register_qdisc(&pfifo_qdisc_ops); register_qdisc(&bfifo_qdisc_ops); register_qdisc(&pfifo_head_drop_qdisc_ops); diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index ca8e0a5..1f9c314 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -605,6 +605,7 @@ static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, struct sockaddr_atmpvc pvc; int state; + memset(&pvc, 0, sizeof(pvc)); pvc.sap_family = AF_ATMPVC; pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1; pvc.sap_addr.vpi = flow->vcc->vpi; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 1bc210f..7a42c81 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -130,7 +130,7 @@ struct cbq_class { psched_time_t penalized; struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; struct tc_cbq_xstats xstats; struct tcf_proto *filter_list; @@ -1465,6 +1465,7 @@ static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_wrropt opt; + memset(&opt, 0, sizeof(opt)); opt.flags = 0; opt.allot = cl->allot; opt.priority = cl->priority + 1; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index ef53ab8..ddd73cb 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -438,7 +438,8 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) if (mask != q->tab_mask) { struct sk_buff **ntab; - ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL); + ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), + GFP_KERNEL | __GFP_NOWARN); if (!ntab) ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *)); if (!ntab) diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index fb09e88..95cb9e7 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -37,7 +37,7 @@ struct drr_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; struct list_head alist; struct Qdisc *qdisc; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c new file mode 100644 index 0000000..95d8439 --- /dev/null +++ b/net/sched/sch_fq.c @@ -0,0 +1,822 @@ +/* + * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing) + * + * Copyright (C) 2013 Eric Dumazet <edumazet@google.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Meant to be mostly used for localy generated traffic : + * Fast classification depends on skb->sk being set before reaching us. + * If not, (router workload), we use rxhash as fallback, with 32 bits wide hash. + * All packets belonging to a socket are considered as a 'flow'. + * + * Flows are dynamically allocated and stored in a hash table of RB trees + * They are also part of one Round Robin 'queues' (new or old flows) + * + * Burst avoidance (aka pacing) capability : + * + * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a + * bunch of packets, and this packet scheduler adds delay between + * packets to respect rate limitation. + * + * enqueue() : + * - lookup one RB tree (out of 1024 or more) to find the flow. + * If non existent flow, create it, add it to the tree. + * Add skb to the per flow list of skb (fifo). + * - Use a special fifo for high prio packets + * + * dequeue() : serves flows in Round Robin + * Note : When a flow becomes empty, we do not immediately remove it from + * rb trees, for performance reasons (its expected to send additional packets, + * or SLAB cache will reuse socket for another flow) + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/string.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/rbtree.h> +#include <linux/hash.h> +#include <linux/prefetch.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/sock.h> +#include <net/tcp_states.h> + +/* + * Per flow structure, dynamically allocated + */ +struct fq_flow { + struct sk_buff *head; /* list of skbs for this flow : first skb */ + union { + struct sk_buff *tail; /* last skb in the list */ + unsigned long age; /* jiffies when flow was emptied, for gc */ + }; + struct rb_node fq_node; /* anchor in fq_root[] trees */ + struct sock *sk; + int qlen; /* number of packets in flow queue */ + int credit; + u32 socket_hash; /* sk_hash */ + struct fq_flow *next; /* next pointer in RR lists, or &detached */ + + struct rb_node rate_node; /* anchor in q->delayed tree */ + u64 time_next_packet; +}; + +struct fq_flow_head { + struct fq_flow *first; + struct fq_flow *last; +}; + +struct fq_sched_data { + struct fq_flow_head new_flows; + + struct fq_flow_head old_flows; + + struct rb_root delayed; /* for rate limited flows */ + u64 time_next_delayed_flow; + + struct fq_flow internal; /* for non classified or high prio packets */ + u32 quantum; + u32 initial_quantum; + u32 flow_refill_delay; + u32 flow_max_rate; /* optional max rate per flow */ + u32 flow_plimit; /* max packets per flow */ + struct rb_root *fq_root; + u8 rate_enable; + u8 fq_trees_log; + + u32 flows; + u32 inactive_flows; + u32 throttled_flows; + + u64 stat_gc_flows; + u64 stat_internal_packets; + u64 stat_tcp_retrans; + u64 stat_throttled; + u64 stat_flows_plimit; + u64 stat_pkts_too_long; + u64 stat_allocation_errors; + struct qdisc_watchdog watchdog; +}; + +/* special value to mark a detached flow (not on old/new list) */ +static struct fq_flow detached, throttled; + +static void fq_flow_set_detached(struct fq_flow *f) +{ + f->next = &detached; + f->age = jiffies; +} + +static bool fq_flow_is_detached(const struct fq_flow *f) +{ + return f->next == &detached; +} + +static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f) +{ + struct rb_node **p = &q->delayed.rb_node, *parent = NULL; + + while (*p) { + struct fq_flow *aux; + + parent = *p; + aux = container_of(parent, struct fq_flow, rate_node); + if (f->time_next_packet >= aux->time_next_packet) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&f->rate_node, parent, p); + rb_insert_color(&f->rate_node, &q->delayed); + q->throttled_flows++; + q->stat_throttled++; + + f->next = &throttled; + if (q->time_next_delayed_flow > f->time_next_packet) + q->time_next_delayed_flow = f->time_next_packet; +} + + +static struct kmem_cache *fq_flow_cachep __read_mostly; + +static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow) +{ + if (head->first) + head->last->next = flow; + else + head->first = flow; + head->last = flow; + flow->next = NULL; +} + +/* limit number of collected flows per round */ +#define FQ_GC_MAX 8 +#define FQ_GC_AGE (3*HZ) + +static bool fq_gc_candidate(const struct fq_flow *f) +{ + return fq_flow_is_detached(f) && + time_after(jiffies, f->age + FQ_GC_AGE); +} + +static void fq_gc(struct fq_sched_data *q, + struct rb_root *root, + struct sock *sk) +{ + struct fq_flow *f, *tofree[FQ_GC_MAX]; + struct rb_node **p, *parent; + int fcnt = 0; + + p = &root->rb_node; + parent = NULL; + while (*p) { + parent = *p; + + f = container_of(parent, struct fq_flow, fq_node); + if (f->sk == sk) + break; + + if (fq_gc_candidate(f)) { + tofree[fcnt++] = f; + if (fcnt == FQ_GC_MAX) + break; + } + + if (f->sk > sk) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + + q->flows -= fcnt; + q->inactive_flows -= fcnt; + q->stat_gc_flows += fcnt; + while (fcnt) { + struct fq_flow *f = tofree[--fcnt]; + + rb_erase(&f->fq_node, root); + kmem_cache_free(fq_flow_cachep, f); + } +} + +static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) +{ + struct rb_node **p, *parent; + struct sock *sk = skb->sk; + struct rb_root *root; + struct fq_flow *f; + + /* warning: no starvation prevention... */ + if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) + return &q->internal; + + if (unlikely(!sk)) { + /* By forcing low order bit to 1, we make sure to not + * collide with a local flow (socket pointers are word aligned) + */ + sk = (struct sock *)(skb_get_rxhash(skb) | 1L); + } + + root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)]; + + if (q->flows >= (2U << q->fq_trees_log) && + q->inactive_flows > q->flows/2) + fq_gc(q, root, sk); + + p = &root->rb_node; + parent = NULL; + while (*p) { + parent = *p; + + f = container_of(parent, struct fq_flow, fq_node); + if (f->sk == sk) { + /* socket might have been reallocated, so check + * if its sk_hash is the same. + * It not, we need to refill credit with + * initial quantum + */ + if (unlikely(skb->sk && + f->socket_hash != sk->sk_hash)) { + f->credit = q->initial_quantum; + f->socket_hash = sk->sk_hash; + f->time_next_packet = 0ULL; + } + return f; + } + if (f->sk > sk) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + + f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!f)) { + q->stat_allocation_errors++; + return &q->internal; + } + fq_flow_set_detached(f); + f->sk = sk; + if (skb->sk) + f->socket_hash = sk->sk_hash; + f->credit = q->initial_quantum; + + rb_link_node(&f->fq_node, parent, p); + rb_insert_color(&f->fq_node, root); + + q->flows++; + q->inactive_flows++; + return f; +} + + +/* remove one skb from head of flow queue */ +static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) +{ + struct sk_buff *skb = flow->head; + + if (skb) { + flow->head = skb->next; + skb->next = NULL; + flow->qlen--; + sch->qstats.backlog -= qdisc_pkt_len(skb); + sch->q.qlen--; + } + return skb; +} + +/* We might add in the future detection of retransmits + * For the time being, just return false + */ +static bool skb_is_retransmit(struct sk_buff *skb) +{ + return false; +} + +/* add skb to flow queue + * flow queue is a linked list, kind of FIFO, except for TCP retransmits + * We special case tcp retransmits to be transmitted before other packets. + * We rely on fact that TCP retransmits are unlikely, so we do not waste + * a separate queue or a pointer. + * head-> [retrans pkt 1] + * [retrans pkt 2] + * [ normal pkt 1] + * [ normal pkt 2] + * [ normal pkt 3] + * tail-> [ normal pkt 4] + */ +static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) +{ + struct sk_buff *prev, *head = flow->head; + + skb->next = NULL; + if (!head) { + flow->head = skb; + flow->tail = skb; + return; + } + if (likely(!skb_is_retransmit(skb))) { + flow->tail->next = skb; + flow->tail = skb; + return; + } + + /* This skb is a tcp retransmit, + * find the last retrans packet in the queue + */ + prev = NULL; + while (skb_is_retransmit(head)) { + prev = head; + head = head->next; + if (!head) + break; + } + if (!prev) { /* no rtx packet in queue, become the new head */ + skb->next = flow->head; + flow->head = skb; + } else { + if (prev == flow->tail) + flow->tail = skb; + else + skb->next = prev->next; + prev->next = skb; + } +} + +static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct fq_flow *f; + + if (unlikely(sch->q.qlen >= sch->limit)) + return qdisc_drop(skb, sch); + + f = fq_classify(skb, q); + if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { + q->stat_flows_plimit++; + return qdisc_drop(skb, sch); + } + + f->qlen++; + if (skb_is_retransmit(skb)) + q->stat_tcp_retrans++; + sch->qstats.backlog += qdisc_pkt_len(skb); + if (fq_flow_is_detached(f)) { + fq_flow_add_tail(&q->new_flows, f); + if (time_after(jiffies, f->age + q->flow_refill_delay)) + f->credit = max_t(u32, f->credit, q->quantum); + q->inactive_flows--; + qdisc_unthrottled(sch); + } + + /* Note: this overwrites f->age */ + flow_queue_add(f, skb); + + if (unlikely(f == &q->internal)) { + q->stat_internal_packets++; + qdisc_unthrottled(sch); + } + sch->q.qlen++; + + return NET_XMIT_SUCCESS; +} + +static void fq_check_throttled(struct fq_sched_data *q, u64 now) +{ + struct rb_node *p; + + if (q->time_next_delayed_flow > now) + return; + + q->time_next_delayed_flow = ~0ULL; + while ((p = rb_first(&q->delayed)) != NULL) { + struct fq_flow *f = container_of(p, struct fq_flow, rate_node); + + if (f->time_next_packet > now) { + q->time_next_delayed_flow = f->time_next_packet; + break; + } + rb_erase(p, &q->delayed); + q->throttled_flows--; + fq_flow_add_tail(&q->old_flows, f); + } +} + +static struct sk_buff *fq_dequeue(struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + u64 now = ktime_to_ns(ktime_get()); + struct fq_flow_head *head; + struct sk_buff *skb; + struct fq_flow *f; + u32 rate; + + skb = fq_dequeue_head(sch, &q->internal); + if (skb) + goto out; + fq_check_throttled(q, now); +begin: + head = &q->new_flows; + if (!head->first) { + head = &q->old_flows; + if (!head->first) { + if (q->time_next_delayed_flow != ~0ULL) + qdisc_watchdog_schedule_ns(&q->watchdog, + q->time_next_delayed_flow); + return NULL; + } + } + f = head->first; + + if (f->credit <= 0) { + f->credit += q->quantum; + head->first = f->next; + fq_flow_add_tail(&q->old_flows, f); + goto begin; + } + + if (unlikely(f->head && now < f->time_next_packet)) { + head->first = f->next; + fq_flow_set_throttled(q, f); + goto begin; + } + + skb = fq_dequeue_head(sch, f); + if (!skb) { + head->first = f->next; + /* force a pass through old_flows to prevent starvation */ + if ((head == &q->new_flows) && q->old_flows.first) { + fq_flow_add_tail(&q->old_flows, f); + } else { + fq_flow_set_detached(f); + q->inactive_flows++; + } + goto begin; + } + prefetch(&skb->end); + f->time_next_packet = now; + f->credit -= qdisc_pkt_len(skb); + + if (f->credit > 0 || !q->rate_enable) + goto out; + + rate = q->flow_max_rate; + if (skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) + rate = min(skb->sk->sk_pacing_rate, rate); + + if (rate != ~0U) { + u32 plen = max(qdisc_pkt_len(skb), q->quantum); + u64 len = (u64)plen * NSEC_PER_SEC; + + if (likely(rate)) + do_div(len, rate); + /* Since socket rate can change later, + * clamp the delay to 125 ms. + * TODO: maybe segment the too big skb, as in commit + * e43ac79a4bc ("sch_tbf: segment too big GSO packets") + */ + if (unlikely(len > 125 * NSEC_PER_MSEC)) { + len = 125 * NSEC_PER_MSEC; + q->stat_pkts_too_long++; + } + + f->time_next_packet = now + len; + } +out: + qdisc_bstats_update(sch, skb); + qdisc_unthrottled(sch); + return skb; +} + +static void fq_reset(struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct rb_root *root; + struct sk_buff *skb; + struct rb_node *p; + struct fq_flow *f; + unsigned int idx; + + while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL) + kfree_skb(skb); + + if (!q->fq_root) + return; + + for (idx = 0; idx < (1U << q->fq_trees_log); idx++) { + root = &q->fq_root[idx]; + while ((p = rb_first(root)) != NULL) { + f = container_of(p, struct fq_flow, fq_node); + rb_erase(p, root); + + while ((skb = fq_dequeue_head(sch, f)) != NULL) + kfree_skb(skb); + + kmem_cache_free(fq_flow_cachep, f); + } + } + q->new_flows.first = NULL; + q->old_flows.first = NULL; + q->delayed = RB_ROOT; + q->flows = 0; + q->inactive_flows = 0; + q->throttled_flows = 0; +} + +static void fq_rehash(struct fq_sched_data *q, + struct rb_root *old_array, u32 old_log, + struct rb_root *new_array, u32 new_log) +{ + struct rb_node *op, **np, *parent; + struct rb_root *oroot, *nroot; + struct fq_flow *of, *nf; + int fcnt = 0; + u32 idx; + + for (idx = 0; idx < (1U << old_log); idx++) { + oroot = &old_array[idx]; + while ((op = rb_first(oroot)) != NULL) { + rb_erase(op, oroot); + of = container_of(op, struct fq_flow, fq_node); + if (fq_gc_candidate(of)) { + fcnt++; + kmem_cache_free(fq_flow_cachep, of); + continue; + } + nroot = &new_array[hash_32((u32)(long)of->sk, new_log)]; + + np = &nroot->rb_node; + parent = NULL; + while (*np) { + parent = *np; + + nf = container_of(parent, struct fq_flow, fq_node); + BUG_ON(nf->sk == of->sk); + + if (nf->sk > of->sk) + np = &parent->rb_right; + else + np = &parent->rb_left; + } + + rb_link_node(&of->fq_node, parent, np); + rb_insert_color(&of->fq_node, nroot); + } + } + q->flows -= fcnt; + q->inactive_flows -= fcnt; + q->stat_gc_flows += fcnt; +} + +static int fq_resize(struct fq_sched_data *q, u32 log) +{ + struct rb_root *array; + u32 idx; + + if (q->fq_root && log == q->fq_trees_log) + return 0; + + array = kmalloc(sizeof(struct rb_root) << log, GFP_KERNEL); + if (!array) + return -ENOMEM; + + for (idx = 0; idx < (1U << log); idx++) + array[idx] = RB_ROOT; + + if (q->fq_root) { + fq_rehash(q, q->fq_root, q->fq_trees_log, array, log); + kfree(q->fq_root); + } + q->fq_root = array; + q->fq_trees_log = log; + + return 0; +} + +static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { + [TCA_FQ_PLIMIT] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 }, + [TCA_FQ_QUANTUM] = { .type = NLA_U32 }, + [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 }, + [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, + [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, +}; + +static int fq_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_FQ_MAX + 1]; + int err, drop_count = 0; + u32 fq_log; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy); + if (err < 0) + return err; + + sch_tree_lock(sch); + + fq_log = q->fq_trees_log; + + if (tb[TCA_FQ_BUCKETS_LOG]) { + u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]); + + if (nval >= 1 && nval <= ilog2(256*1024)) + fq_log = nval; + else + err = -EINVAL; + } + if (tb[TCA_FQ_PLIMIT]) + sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]); + + if (tb[TCA_FQ_FLOW_PLIMIT]) + q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]); + + if (tb[TCA_FQ_QUANTUM]) + q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); + + if (tb[TCA_FQ_INITIAL_QUANTUM]) + q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]); + + if (tb[TCA_FQ_FLOW_DEFAULT_RATE]) + pr_warn_ratelimited("sch_fq: defrate %u ignored.\n", + nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE])); + + if (tb[TCA_FQ_FLOW_MAX_RATE]) + q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); + + if (tb[TCA_FQ_RATE_ENABLE]) { + u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); + + if (enable <= 1) + q->rate_enable = enable; + else + err = -EINVAL; + } + + if (tb[TCA_FQ_FLOW_REFILL_DELAY]) { + u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ; + + q->flow_refill_delay = usecs_to_jiffies(usecs_delay); + } + + if (!err) + err = fq_resize(q, fq_log); + + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = fq_dequeue(sch); + + if (!skb) + break; + kfree_skb(skb); + drop_count++; + } + qdisc_tree_decrease_qlen(sch, drop_count); + + sch_tree_unlock(sch); + return err; +} + +static void fq_destroy(struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + + fq_reset(sch); + kfree(q->fq_root); + qdisc_watchdog_cancel(&q->watchdog); +} + +static int fq_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_sched_data *q = qdisc_priv(sch); + int err; + + sch->limit = 10000; + q->flow_plimit = 100; + q->quantum = 2 * psched_mtu(qdisc_dev(sch)); + q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); + q->flow_refill_delay = msecs_to_jiffies(40); + q->flow_max_rate = ~0U; + q->rate_enable = 1; + q->new_flows.first = NULL; + q->old_flows.first = NULL; + q->delayed = RB_ROOT; + q->fq_root = NULL; + q->fq_trees_log = ilog2(1024); + qdisc_watchdog_init(&q->watchdog, sch); + + if (opt) + err = fq_change(sch, opt); + else + err = fq_resize(q, q->fq_trees_log); + + return err; +} + +static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ + + if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || + nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || + nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || + nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) || + nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) || + nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) || + nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, + jiffies_to_usecs(q->flow_refill_delay)) || + nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) + goto nla_put_failure; + + nla_nest_end(skb, opts); + return skb->len; + +nla_put_failure: + return -1; +} + +static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct fq_sched_data *q = qdisc_priv(sch); + u64 now = ktime_to_ns(ktime_get()); + struct tc_fq_qd_stats st = { + .gc_flows = q->stat_gc_flows, + .highprio_packets = q->stat_internal_packets, + .tcp_retrans = q->stat_tcp_retrans, + .throttled = q->stat_throttled, + .flows_plimit = q->stat_flows_plimit, + .pkts_too_long = q->stat_pkts_too_long, + .allocation_errors = q->stat_allocation_errors, + .flows = q->flows, + .inactive_flows = q->inactive_flows, + .throttled_flows = q->throttled_flows, + .time_next_delayed_flow = q->time_next_delayed_flow - now, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct Qdisc_ops fq_qdisc_ops __read_mostly = { + .id = "fq", + .priv_size = sizeof(struct fq_sched_data), + + .enqueue = fq_enqueue, + .dequeue = fq_dequeue, + .peek = qdisc_peek_dequeued, + .init = fq_init, + .reset = fq_reset, + .destroy = fq_destroy, + .change = fq_change, + .dump = fq_dump, + .dump_stats = fq_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init fq_module_init(void) +{ + int ret; + + fq_flow_cachep = kmem_cache_create("fq_flow_cache", + sizeof(struct fq_flow), + 0, 0, NULL); + if (!fq_flow_cachep) + return -ENOMEM; + + ret = register_qdisc(&fq_qdisc_ops); + if (ret) + kmem_cache_destroy(fq_flow_cachep); + return ret; +} + +static void __exit fq_module_exit(void) +{ + unregister_qdisc(&fq_qdisc_ops); + kmem_cache_destroy(fq_flow_cachep); +} + +module_init(fq_module_init) +module_exit(fq_module_exit) +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index cc54c80..819c7a3 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -25,10 +25,15 @@ #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/slab.h> +#include <linux/if_vlan.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/dst.h> +/* Qdisc to use by default */ +const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops; +EXPORT_SYMBOL(default_qdisc_ops); + /* Main transmission queue. */ /* Modifications to data participating in scheduling must be protected with @@ -121,7 +126,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_stopped(txq)) - ret = dev_hard_start_xmit(skb, dev, txq); + ret = dev_hard_start_xmit(skb, dev, txq, NULL); HARD_TX_UNLOCK(dev, txq); @@ -207,15 +212,19 @@ void __qdisc_run(struct Qdisc *q) unsigned long dev_trans_start(struct net_device *dev) { - unsigned long val, res = dev->trans_start; + unsigned long val, res; unsigned int i; + if (is_vlan_dev(dev)) + dev = vlan_dev_real_dev(dev); + res = dev->trans_start; for (i = 0; i < dev->num_tx_queues; i++) { val = netdev_get_tx_queue(dev, i)->trans_start; if (val && time_after(val, res)) res = val; } dev->trans_start = res; + return res; } EXPORT_SYMBOL(dev_trans_start); @@ -540,12 +549,11 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { .dump = pfifo_fast_dump, .owner = THIS_MODULE, }; -EXPORT_SYMBOL(pfifo_fast_ops); static struct lock_class_key qdisc_tx_busylock; struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, - struct Qdisc_ops *ops) + const struct Qdisc_ops *ops) { void *p; struct Qdisc *sch; @@ -589,10 +597,14 @@ errout: } struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, - struct Qdisc_ops *ops, unsigned int parentid) + const struct Qdisc_ops *ops, + unsigned int parentid) { struct Qdisc *sch; + if (!try_module_get(ops->owner)) + goto errout; + sch = qdisc_alloc(dev_queue, ops); if (IS_ERR(sch)) goto errout; @@ -696,7 +708,7 @@ static void attach_one_default_qdisc(struct net_device *dev, if (dev->tx_queue_len) { qdisc = qdisc_create_dflt(dev_queue, - &pfifo_fast_ops, TC_H_ROOT); + default_qdisc_ops, TC_H_ROOT); if (!qdisc) { netdev_info(dev, "activation failed\n"); return; @@ -749,9 +761,8 @@ void dev_activate(struct net_device *dev) int need_watchdog; /* No queueing discipline is attached to device; - create default one i.e. pfifo_fast for devices, - which need queueing and noqueue_qdisc for - virtual interfaces + * create default one for devices, which need queueing + * and noqueue_qdisc for virtual interfaces */ if (dev->qdisc == &noop_qdisc) @@ -833,7 +844,7 @@ void dev_deactivate_many(struct list_head *head) struct net_device *dev; bool sync_needed = false; - list_for_each_entry(dev, head, unreg_list) { + list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc); if (dev_ingress_queue(dev)) @@ -852,7 +863,7 @@ void dev_deactivate_many(struct list_head *head) synchronize_net(); /* Wait for outstanding qdisc_run calls. */ - list_for_each_entry(dev, head, unreg_list) + list_for_each_entry(dev, head, close_list) while (some_qdisc_is_busy(dev)) yield(); } @@ -861,7 +872,7 @@ void dev_deactivate(struct net_device *dev) { LIST_HEAD(single); - list_add(&dev->unreg_list, &single); + list_add(&dev->close_list, &single); dev_deactivate_many(&single); list_del(&single); } @@ -914,39 +925,37 @@ void dev_shutdown(struct net_device *dev) } void psched_ratecfg_precompute(struct psched_ratecfg *r, - const struct tc_ratespec *conf) + const struct tc_ratespec *conf, + u64 rate64) { - u64 factor; - u64 mult; - int shift; - memset(r, 0, sizeof(*r)); r->overhead = conf->overhead; - r->rate_bps = (u64)conf->rate << 3; + r->rate_bytes_ps = max_t(u64, conf->rate, rate64); + r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK); r->mult = 1; /* - * Calibrate mult, shift so that token counting is accurate - * for smallest packet size (64 bytes). Token (time in ns) is - * computed as (bytes * 8) * NSEC_PER_SEC / rate_bps. It will - * work as long as the smallest packet transfer time can be - * accurately represented in nanosec. + * The deal here is to replace a divide by a reciprocal one + * in fast path (a reciprocal divide is a multiply and a shift) + * + * Normal formula would be : + * time_in_ns = (NSEC_PER_SEC * len) / rate_bps + * + * We compute mult/shift to use instead : + * time_in_ns = (len * mult) >> shift; + * + * We try to get the highest possible mult value for accuracy, + * but have to make sure no overflows will ever happen. */ - if (r->rate_bps > 0) { - /* - * Higher shift gives better accuracy. Find the largest - * shift such that mult fits in 32 bits. - */ - for (shift = 0; shift < 16; shift++) { - r->shift = shift; - factor = 8LLU * NSEC_PER_SEC * (1 << r->shift); - mult = div64_u64(factor, r->rate_bps); - if (mult > UINT_MAX) + if (r->rate_bytes_ps > 0) { + u64 factor = NSEC_PER_SEC; + + for (;;) { + r->mult = div64_u64(factor, r->rate_bytes_ps); + if (r->mult & (1U << 31) || factor & (1ULL << 63)) break; + factor <<= 1; + r->shift++; } - - r->shift = shift - 1; - factor = 8LLU * NSEC_PER_SEC * (1 << r->shift); - r->mult = div64_u64(factor, r->rate_bps); } } EXPORT_SYMBOL(psched_ratecfg_precompute); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 9facea0..c407561 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -114,7 +114,7 @@ struct hfsc_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; unsigned int level; /* class level in hierarchy */ struct tcf_proto *filter_list; /* filter list */ unsigned int filter_cnt; /* filter count */ diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index adaedd7..0e1e38b 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -65,6 +65,10 @@ static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis f module_param (htb_hysteresis, int, 0640); MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate"); +static int htb_rate_est = 0; /* htb classes have a default rate estimator */ +module_param(htb_rate_est, int, 0640); +MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes"); + /* used internaly to keep status of single class */ enum htb_cmode { HTB_CANT_SEND, /* class can't send and can't borrow */ @@ -72,95 +76,105 @@ enum htb_cmode { HTB_CAN_SEND /* class can send */ }; -/* interior & leaf nodes; props specific to leaves are marked L: */ +struct htb_prio { + union { + struct rb_root row; + struct rb_root feed; + }; + struct rb_node *ptr; + /* When class changes from state 1->2 and disconnects from + * parent's feed then we lost ptr value and start from the + * first child again. Here we store classid of the + * last valid ptr (used when ptr is NULL). + */ + u32 last_ptr_id; +}; + +/* interior & leaf nodes; props specific to leaves are marked L: + * To reduce false sharing, place mostly read fields at beginning, + * and mostly written ones at the end. + */ struct htb_class { struct Qdisc_class_common common; - /* general class parameters */ - struct gnet_stats_basic_packed bstats; - struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; - struct tc_htb_xstats xstats; /* our special stats */ - int refcnt; /* usage count of this class */ + struct psched_ratecfg rate; + struct psched_ratecfg ceil; + s64 buffer, cbuffer;/* token bucket depth/rate */ + s64 mbuffer; /* max wait time */ + u32 prio; /* these two are used only by leaves... */ + int quantum; /* but stored for parent-to-leaf return */ + + struct tcf_proto *filter_list; /* class attached filters */ + int filter_cnt; + int refcnt; /* usage count of this class */ + + int level; /* our level (see above) */ + unsigned int children; + struct htb_class *parent; /* parent class */ + + struct gnet_stats_rate_est64 rate_est; - /* topology */ - int level; /* our level (see above) */ - unsigned int children; - struct htb_class *parent; /* parent class */ + /* + * Written often fields + */ + struct gnet_stats_basic_packed bstats; + struct gnet_stats_queue qstats; + struct tc_htb_xstats xstats; /* our special stats */ - int prio; /* these two are used only by leaves... */ - int quantum; /* but stored for parent-to-leaf return */ + /* token bucket parameters */ + s64 tokens, ctokens;/* current number of tokens */ + s64 t_c; /* checkpoint time */ union { struct htb_class_leaf { - struct Qdisc *q; - int deficit[TC_HTB_MAXDEPTH]; struct list_head drop_list; + int deficit[TC_HTB_MAXDEPTH]; + struct Qdisc *q; } leaf; struct htb_class_inner { - struct rb_root feed[TC_HTB_NUMPRIO]; /* feed trees */ - struct rb_node *ptr[TC_HTB_NUMPRIO]; /* current class ptr */ - /* When class changes from state 1->2 and disconnects from - * parent's feed then we lost ptr value and start from the - * first child again. Here we store classid of the - * last valid ptr (used when ptr is NULL). - */ - u32 last_ptr_id[TC_HTB_NUMPRIO]; + struct htb_prio clprio[TC_HTB_NUMPRIO]; } inner; } un; - struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ - struct rb_node pq_node; /* node for event queue */ - s64 pq_key; + s64 pq_key; - int prio_activity; /* for which prios are we active */ - enum htb_cmode cmode; /* current mode of the class */ - - /* class attached filters */ - struct tcf_proto *filter_list; - int filter_cnt; + int prio_activity; /* for which prios are we active */ + enum htb_cmode cmode; /* current mode of the class */ + struct rb_node pq_node; /* node for event queue */ + struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ +}; - /* token bucket parameters */ - struct psched_ratecfg rate; - struct psched_ratecfg ceil; - s64 buffer, cbuffer; /* token bucket depth/rate */ - s64 mbuffer; /* max wait time */ - s64 tokens, ctokens; /* current number of tokens */ - s64 t_c; /* checkpoint time */ +struct htb_level { + struct rb_root wait_pq; + struct htb_prio hprio[TC_HTB_NUMPRIO]; }; struct htb_sched { struct Qdisc_class_hash clhash; - struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */ + int defcls; /* class where unclassified flows go to */ + int rate2quantum; /* quant = rate / rate2quantum */ - /* self list - roots of self generating tree */ - struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; - int row_mask[TC_HTB_MAXDEPTH]; - struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; - u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; - - /* self wait list - roots of wait PQs per row */ - struct rb_root wait_pq[TC_HTB_MAXDEPTH]; + /* filters for qdisc itself */ + struct tcf_proto *filter_list; - /* time of nearest event per level (row) */ - s64 near_ev_cache[TC_HTB_MAXDEPTH]; +#define HTB_WARN_TOOMANYEVENTS 0x1 + unsigned int warned; /* only one warning */ + int direct_qlen; + struct work_struct work; - int defcls; /* class where unclassified flows go to */ + /* non shaped skbs; let them go directly thru */ + struct sk_buff_head direct_queue; + long direct_pkts; - /* filters for qdisc itself */ - struct tcf_proto *filter_list; + struct qdisc_watchdog watchdog; - int rate2quantum; /* quant = rate / rate2quantum */ - s64 now; /* cached dequeue time */ - struct qdisc_watchdog watchdog; + s64 now; /* cached dequeue time */ + struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */ - /* non shaped skbs; let them go directly thru */ - struct sk_buff_head direct_queue; - int direct_qlen; /* max qlen of above */ + /* time of nearest event per level (row) */ + s64 near_ev_cache[TC_HTB_MAXDEPTH]; - long direct_pkts; + int row_mask[TC_HTB_MAXDEPTH]; -#define HTB_WARN_TOOMANYEVENTS 0x1 - unsigned int warned; /* only one warning */ - struct work_struct work; + struct htb_level hlevel[TC_HTB_MAXDEPTH]; }; /* find class in global hash table using given handle */ @@ -276,7 +290,7 @@ static void htb_add_to_id_tree(struct rb_root *root, static void htb_add_to_wait_tree(struct htb_sched *q, struct htb_class *cl, s64 delay) { - struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL; + struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL; cl->pq_key = q->now + delay; if (cl->pq_key == q->now) @@ -296,7 +310,7 @@ static void htb_add_to_wait_tree(struct htb_sched *q, p = &parent->rb_left; } rb_link_node(&cl->pq_node, parent, p); - rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]); + rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq); } /** @@ -323,7 +337,7 @@ static inline void htb_add_class_to_row(struct htb_sched *q, while (mask) { int prio = ffz(~mask); mask &= ~(1 << prio); - htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio); + htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio); } } @@ -349,16 +363,18 @@ static inline void htb_remove_class_from_row(struct htb_sched *q, struct htb_class *cl, int mask) { int m = 0; + struct htb_level *hlevel = &q->hlevel[cl->level]; while (mask) { int prio = ffz(~mask); + struct htb_prio *hprio = &hlevel->hprio[prio]; mask &= ~(1 << prio); - if (q->ptr[cl->level][prio] == cl->node + prio) - htb_next_rb_node(q->ptr[cl->level] + prio); + if (hprio->ptr == cl->node + prio) + htb_next_rb_node(&hprio->ptr); - htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio); - if (!q->row[cl->level][prio].rb_node) + htb_safe_rb_erase(cl->node + prio, &hprio->row); + if (!hprio->row.rb_node) m |= 1 << prio; } q->row_mask[cl->level] &= ~m; @@ -382,13 +398,13 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl) int prio = ffz(~m); m &= ~(1 << prio); - if (p->un.inner.feed[prio].rb_node) + if (p->un.inner.clprio[prio].feed.rb_node) /* parent already has its feed in use so that * reset bit in mask as parent is already ok */ mask &= ~(1 << prio); - htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio); + htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio); } p->prio_activity |= mask; cl = p; @@ -418,18 +434,19 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl) int prio = ffz(~m); m &= ~(1 << prio); - if (p->un.inner.ptr[prio] == cl->node + prio) { + if (p->un.inner.clprio[prio].ptr == cl->node + prio) { /* we are removing child which is pointed to from * parent feed - forget the pointer but remember * classid */ - p->un.inner.last_ptr_id[prio] = cl->common.classid; - p->un.inner.ptr[prio] = NULL; + p->un.inner.clprio[prio].last_ptr_id = cl->common.classid; + p->un.inner.clprio[prio].ptr = NULL; } - htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio); + htb_safe_rb_erase(cl->node + prio, + &p->un.inner.clprio[prio].feed); - if (!p->un.inner.feed[prio].rb_node) + if (!p->un.inner.clprio[prio].feed.rb_node) mask |= 1 << prio; } @@ -644,7 +661,7 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl, htb_change_class_mode(q, cl, &diff); if (old_mode != cl->cmode) { if (old_mode != HTB_CAN_SEND) - htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level); + htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq); if (cl->cmode != HTB_CAN_SEND) htb_add_to_wait_tree(q, cl, diff); } @@ -664,7 +681,7 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl, * next pending event (0 for no event in pq, q->now for too many events). * Note: Applied are events whose have cl->pq_key <= q->now. */ -static s64 htb_do_events(struct htb_sched *q, int level, +static s64 htb_do_events(struct htb_sched *q, const int level, unsigned long start) { /* don't run for longer than 2 jiffies; 2 is used instead of @@ -672,10 +689,12 @@ static s64 htb_do_events(struct htb_sched *q, int level, * too soon */ unsigned long stop_at = start + 2; + struct rb_root *wait_pq = &q->hlevel[level].wait_pq; + while (time_before(jiffies, stop_at)) { struct htb_class *cl; s64 diff; - struct rb_node *p = rb_first(&q->wait_pq[level]); + struct rb_node *p = rb_first(wait_pq); if (!p) return 0; @@ -684,7 +703,7 @@ static s64 htb_do_events(struct htb_sched *q, int level, if (cl->pq_key > q->now) return cl->pq_key; - htb_safe_rb_erase(p, q->wait_pq + level); + htb_safe_rb_erase(p, wait_pq); diff = min_t(s64, q->now - cl->t_c, cl->mbuffer); htb_change_class_mode(q, cl, &diff); if (cl->cmode != HTB_CAN_SEND) @@ -728,8 +747,7 @@ static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n, * * Find leaf where current feed pointers points to. */ -static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio, - struct rb_node **pptr, u32 * pid) +static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio) { int i; struct { @@ -738,10 +756,10 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio, u32 *pid; } stk[TC_HTB_MAXDEPTH], *sp = stk; - BUG_ON(!tree->rb_node); - sp->root = tree->rb_node; - sp->pptr = pptr; - sp->pid = pid; + BUG_ON(!hprio->row.rb_node); + sp->root = hprio->row.rb_node; + sp->pptr = &hprio->ptr; + sp->pid = &hprio->last_ptr_id; for (i = 0; i < 65535; i++) { if (!*sp->pptr && *sp->pid) { @@ -768,12 +786,15 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio, } } else { struct htb_class *cl; + struct htb_prio *clp; + cl = rb_entry(*sp->pptr, struct htb_class, node[prio]); if (!cl->level) return cl; - (++sp)->root = cl->un.inner.feed[prio].rb_node; - sp->pptr = cl->un.inner.ptr + prio; - sp->pid = cl->un.inner.last_ptr_id + prio; + clp = &cl->un.inner.clprio[prio]; + (++sp)->root = clp->feed.rb_node; + sp->pptr = &clp->ptr; + sp->pid = &clp->last_ptr_id; } } WARN_ON(1); @@ -783,15 +804,16 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio, /* dequeues packet at given priority and level; call only if * you are sure that there is active class at prio/level */ -static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio, - int level) +static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio, + const int level) { struct sk_buff *skb = NULL; struct htb_class *cl, *start; + struct htb_level *hlevel = &q->hlevel[level]; + struct htb_prio *hprio = &hlevel->hprio[prio]; + /* look initial class up in the row */ - start = cl = htb_lookup_leaf(q->row[level] + prio, prio, - q->ptr[level] + prio, - q->last_ptr_id[level] + prio); + start = cl = htb_lookup_leaf(hprio, prio); do { next: @@ -811,9 +833,7 @@ next: if ((q->row_mask[level] & (1 << prio)) == 0) return NULL; - next = htb_lookup_leaf(q->row[level] + prio, - prio, q->ptr[level] + prio, - q->last_ptr_id[level] + prio); + next = htb_lookup_leaf(hprio, prio); if (cl == start) /* fix start if we just deleted it */ start = next; @@ -826,11 +846,9 @@ next: break; qdisc_warn_nonwc("htb", cl->un.leaf.q); - htb_next_rb_node((level ? cl->parent->un.inner.ptr : q-> - ptr[0]) + prio); - cl = htb_lookup_leaf(q->row[level] + prio, prio, - q->ptr[level] + prio, - q->last_ptr_id[level] + prio); + htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr: + &q->hlevel[0].hprio[prio].ptr); + cl = htb_lookup_leaf(hprio, prio); } while (cl != start); @@ -839,8 +857,8 @@ next: cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb); if (cl->un.leaf.deficit[level] < 0) { cl->un.leaf.deficit[level] += cl->quantum; - htb_next_rb_node((level ? cl->parent->un.inner.ptr : q-> - ptr[0]) + prio); + htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr : + &q->hlevel[0].hprio[prio].ptr); } /* this used to be after charge_class but this constelation * gives us slightly better performance @@ -880,15 +898,14 @@ ok: for (level = 0; level < TC_HTB_MAXDEPTH; level++) { /* common case optimization - skip event handler quickly */ int m; - s64 event; + s64 event = q->near_ev_cache[level]; - if (q->now >= q->near_ev_cache[level]) { + if (q->now >= event) { event = htb_do_events(q, level, start_at); if (!event) event = q->now + NSEC_PER_SEC; q->near_ev_cache[level] = event; - } else - event = q->near_ev_cache[level]; + } if (next_event > event) next_event = event; @@ -968,10 +985,8 @@ static void htb_reset(struct Qdisc *sch) qdisc_watchdog_cancel(&q->watchdog); __skb_queue_purge(&q->direct_queue); sch->q.qlen = 0; - memset(q->row, 0, sizeof(q->row)); + memset(q->hlevel, 0, sizeof(q->hlevel)); memset(q->row_mask, 0, sizeof(q->row_mask)); - memset(q->wait_pq, 0, sizeof(q->wait_pq)); - memset(q->ptr, 0, sizeof(q->ptr)); for (i = 0; i < TC_HTB_NUMPRIO; i++) INIT_LIST_HEAD(q->drops + i); } @@ -982,6 +997,8 @@ static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = { [TCA_HTB_CTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, [TCA_HTB_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, [TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 }, + [TCA_HTB_RATE64] = { .type = NLA_U64 }, + [TCA_HTB_CEIL64] = { .type = NLA_U64 }, }; static void htb_work_func(struct work_struct *work) @@ -1099,6 +1116,12 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, opt.level = cl->level; if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt)) goto nla_put_failure; + if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) && + nla_put_u64(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps)) + goto nla_put_failure; + if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) && + nla_put_u64(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps)) + goto nla_put_failure; nla_nest_end(skb, nest); spin_unlock_bh(root_lock); @@ -1192,7 +1215,8 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity); if (parent->cmode != HTB_CAN_SEND) - htb_safe_rb_erase(&parent->pq_node, q->wait_pq + parent->level); + htb_safe_rb_erase(&parent->pq_node, + &q->hlevel[parent->level].wait_pq); parent->level = 0; memset(&parent->un.inner, 0, sizeof(parent->un.inner)); @@ -1281,7 +1305,8 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) htb_deactivate(q, cl); if (cl->cmode != HTB_CAN_SEND) - htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level); + htb_safe_rb_erase(&cl->pq_node, + &q->hlevel[cl->level].wait_pq); if (last_child) htb_parent_to_leaf(q, cl, new_q); @@ -1312,8 +1337,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)*arg, *parent; struct nlattr *opt = tca[TCA_OPTIONS]; + struct qdisc_rate_table *rtab = NULL, *ctab = NULL; struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_opt *hopt; + u64 rate64, ceil64; /* extract all subattrs from opt attr */ if (!opt) @@ -1333,6 +1360,18 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!hopt->rate.rate || !hopt->ceil.rate) goto failure; + /* Keeping backward compatible with rate_table based iproute2 tc */ + if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE) { + rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]); + if (rtab) + qdisc_put_rtab(rtab); + } + if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE) { + ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]); + if (ctab) + qdisc_put_rtab(ctab); + } + if (!cl) { /* new class */ struct Qdisc *new_q; int prio; @@ -1366,12 +1405,14 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!cl) goto failure; - err = gen_new_estimator(&cl->bstats, &cl->rate_est, - qdisc_root_sleeping_lock(sch), - tca[TCA_RATE] ? : &est.nla); - if (err) { - kfree(cl); - goto failure; + if (htb_rate_est || tca[TCA_RATE]) { + err = gen_new_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE] ? : &est.nla); + if (err) { + kfree(cl); + goto failure; + } } cl->refcnt = 1; @@ -1401,7 +1442,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, /* remove from evt list because of level change */ if (parent->cmode != HTB_CAN_SEND) { - htb_safe_rb_erase(&parent->pq_node, q->wait_pq); + htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq); parent->cmode = HTB_CAN_SEND; } parent->level = (parent->parent ? parent->parent->level @@ -1459,11 +1500,15 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->prio = TC_HTB_NUMPRIO - 1; } - psched_ratecfg_precompute(&cl->rate, &hopt->rate); - psched_ratecfg_precompute(&cl->ceil, &hopt->ceil); + rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0; + + ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0; + + psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64); + psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64); cl->buffer = PSCHED_TICKS2NS(hopt->buffer); - cl->cbuffer = PSCHED_TICKS2NS(hopt->buffer); + cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer); sch_tree_unlock(sch); diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 5da78a1..2e56185 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -57,7 +57,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt) for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { dev_queue = netdev_get_tx_queue(dev, ntx); - qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops, + qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops, TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(ntx + 1))); if (qdisc == NULL) diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index accec33..d44c868 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -124,7 +124,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) for (i = 0; i < dev->num_tx_queues; i++) { dev_queue = netdev_get_tx_queue(dev, i); - qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops, + qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops, TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(i + 1))); if (qdisc == NULL) { diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 3d2acc7..bccd52b 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -23,6 +23,7 @@ #include <linux/vmalloc.h> #include <linux/rtnetlink.h> #include <linux/reciprocal_div.h> +#include <linux/rbtree.h> #include <net/netlink.h> #include <net/pkt_sched.h> @@ -68,7 +69,8 @@ */ struct netem_sched_data { - /* internal t(ime)fifo qdisc uses sch->q and sch->limit */ + /* internal t(ime)fifo qdisc uses t_root and sch->limit */ + struct rb_root t_root; /* optional qdisc for classful handling (NULL at netem init) */ struct Qdisc *qdisc; @@ -128,10 +130,35 @@ struct netem_sched_data { */ struct netem_skb_cb { psched_time_t time_to_send; + ktime_t tstamp_save; }; +/* Because space in skb->cb[] is tight, netem overloads skb->next/prev/tstamp + * to hold a rb_node structure. + * + * If struct sk_buff layout is changed, the following checks will complain. + */ +static struct rb_node *netem_rb_node(struct sk_buff *skb) +{ + BUILD_BUG_ON(offsetof(struct sk_buff, next) != 0); + BUILD_BUG_ON(offsetof(struct sk_buff, prev) != + offsetof(struct sk_buff, next) + sizeof(skb->next)); + BUILD_BUG_ON(offsetof(struct sk_buff, tstamp) != + offsetof(struct sk_buff, prev) + sizeof(skb->prev)); + BUILD_BUG_ON(sizeof(struct rb_node) > sizeof(skb->next) + + sizeof(skb->prev) + + sizeof(skb->tstamp)); + return (struct rb_node *)&skb->next; +} + +static struct sk_buff *netem_rb_to_skb(struct rb_node *rb) +{ + return (struct sk_buff *)rb; +} + static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) { + /* we assume we can use skb next/prev/tstamp as storage for rb_node */ qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb)); return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data; } @@ -188,10 +215,10 @@ static bool loss_4state(struct netem_sched_data *q) if (rnd < clg->a4) { clg->state = 4; return true; - } else if (clg->a4 < rnd && rnd < clg->a1) { + } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) { clg->state = 3; return true; - } else if (clg->a1 < rnd) + } else if (clg->a1 + clg->a4 < rnd) clg->state = 1; break; @@ -208,7 +235,6 @@ static bool loss_4state(struct netem_sched_data *q) clg->state = 2; else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) { clg->state = 1; - return true; } else if (clg->a2 + clg->a3 < rnd) { clg->state = 3; return true; @@ -242,10 +268,11 @@ static bool loss_gilb_ell(struct netem_sched_data *q) clg->state = 2; if (net_random() < clg->a4) return true; + break; case 2: if (net_random() < clg->a2) clg->state = 1; - if (clg->a3 > net_random()) + if (net_random() > clg->a3) return true; } @@ -331,22 +358,40 @@ static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sche return PSCHED_NS2TICKS(ticks); } +static void tfifo_reset(struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + struct rb_node *p; + + while ((p = rb_first(&q->t_root))) { + struct sk_buff *skb = netem_rb_to_skb(p); + + rb_erase(p, &q->t_root); + skb->next = NULL; + skb->prev = NULL; + kfree_skb(skb); + } +} + static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) { - struct sk_buff_head *list = &sch->q; + struct netem_sched_data *q = qdisc_priv(sch); psched_time_t tnext = netem_skb_cb(nskb)->time_to_send; - struct sk_buff *skb = skb_peek_tail(list); + struct rb_node **p = &q->t_root.rb_node, *parent = NULL; - /* Optimize for add at tail */ - if (likely(!skb || tnext >= netem_skb_cb(skb)->time_to_send)) - return __skb_queue_tail(list, nskb); + while (*p) { + struct sk_buff *skb; - skb_queue_reverse_walk(list, skb) { + parent = *p; + skb = netem_rb_to_skb(parent); if (tnext >= netem_skb_cb(skb)->time_to_send) - break; + p = &parent->rb_right; + else + p = &parent->rb_left; } - - __skb_queue_after(list, skb, nskb); + rb_link_node(netem_rb_node(nskb), parent, p); + rb_insert_color(netem_rb_node(nskb), &q->t_root); + sch->q.qlen++; } /* @@ -382,12 +427,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* If a delay is expected, orphan the skb. (orphaning usually takes * place at TX completion time, so _before_ the link transit delay) - * Ideally, this orphaning should be done after the rate limiting - * module, because this breaks TCP Small Queue, and other mechanisms - * based on socket sk_wmem_alloc. */ if (q->latency || q->jitter) - skb_orphan(skb); + skb_orphan_partial(skb); /* * If we need to duplicate packet, then re-insert at top of the @@ -436,23 +478,28 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) now = psched_get_time(); if (q->rate) { - struct sk_buff_head *list = &sch->q; + struct sk_buff *last; - if (!skb_queue_empty(list)) { + if (!skb_queue_empty(&sch->q)) + last = skb_peek_tail(&sch->q); + else + last = netem_rb_to_skb(rb_last(&q->t_root)); + if (last) { /* * Last packet in queue is reference point (now), * calculate this time bonus and subtract * from delay. */ - delay -= netem_skb_cb(skb_peek_tail(list))->time_to_send - now; + delay -= netem_skb_cb(last)->time_to_send - now; delay = max_t(psched_tdiff_t, 0, delay); - now = netem_skb_cb(skb_peek_tail(list))->time_to_send; + now = netem_skb_cb(last)->time_to_send; } delay += packet_len_2_sched_time(skb->len, q); } cb->time_to_send = now + delay; + cb->tstamp_save = skb->tstamp; ++q->counter; tfifo_enqueue(skb, sch); } else { @@ -476,6 +523,22 @@ static unsigned int netem_drop(struct Qdisc *sch) unsigned int len; len = qdisc_queue_drop(sch); + + if (!len) { + struct rb_node *p = rb_first(&q->t_root); + + if (p) { + struct sk_buff *skb = netem_rb_to_skb(p); + + rb_erase(p, &q->t_root); + sch->q.qlen--; + skb->next = NULL; + skb->prev = NULL; + len = qdisc_pkt_len(skb); + sch->qstats.backlog -= len; + kfree_skb(skb); + } + } if (!len && q->qdisc && q->qdisc->ops->drop) len = q->qdisc->ops->drop(q->qdisc); if (len) @@ -488,19 +551,35 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; + struct rb_node *p; if (qdisc_is_throttled(sch)) return NULL; tfifo_dequeue: - skb = qdisc_peek_head(sch); + skb = __skb_dequeue(&sch->q); if (skb) { - const struct netem_skb_cb *cb = netem_skb_cb(skb); +deliver: + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_unthrottled(sch); + qdisc_bstats_update(sch, skb); + return skb; + } + p = rb_first(&q->t_root); + if (p) { + psched_time_t time_to_send; + + skb = netem_rb_to_skb(p); /* if more time remaining? */ - if (cb->time_to_send <= psched_get_time()) { - __skb_unlink(skb, &sch->q); - sch->qstats.backlog -= qdisc_pkt_len(skb); + time_to_send = netem_skb_cb(skb)->time_to_send; + if (time_to_send <= psched_get_time()) { + rb_erase(p, &q->t_root); + + sch->q.qlen--; + skb->next = NULL; + skb->prev = NULL; + skb->tstamp = netem_skb_cb(skb)->tstamp_save; #ifdef CONFIG_NET_CLS_ACT /* @@ -522,10 +601,7 @@ tfifo_dequeue: } goto tfifo_dequeue; } -deliver: - qdisc_unthrottled(sch); - qdisc_bstats_update(sch, skb); - return skb; + goto deliver; } if (q->qdisc) { @@ -533,7 +609,7 @@ deliver: if (skb) goto deliver; } - qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send); + qdisc_watchdog_schedule(&q->watchdog, time_to_send); } if (q->qdisc) { @@ -549,6 +625,7 @@ static void netem_reset(struct Qdisc *sch) struct netem_sched_data *q = qdisc_priv(sch); qdisc_reset_queue(sch); + tfifo_reset(sch); if (q->qdisc) qdisc_reset(q->qdisc); qdisc_watchdog_cancel(&q->watchdog); diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index d51852b..8056fb4 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -113,7 +113,6 @@ #define FRAC_BITS 30 /* fixed point arithmetic */ #define ONE_FP (1UL << FRAC_BITS) -#define IWSUM (ONE_FP/QFQ_MAX_WSUM) #define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */ #define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */ @@ -138,7 +137,7 @@ struct qfq_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; struct Qdisc *qdisc; struct list_head alist; /* Link for active-classes list. */ struct qfq_aggregate *agg; /* Parent aggregate. */ @@ -189,6 +188,7 @@ struct qfq_sched { struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */ u32 num_active_agg; /* Num. of active aggregates */ u32 wsum; /* weight sum */ + u32 iwsum; /* inverse weight sum */ unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */ struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */ @@ -314,6 +314,7 @@ static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg, q->wsum += (int) agg->class_weight * (new_num_classes - agg->num_classes); + q->iwsum = ONE_FP / q->wsum; agg->num_classes = new_num_classes; } @@ -340,6 +341,10 @@ static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg) { if (!hlist_unhashed(&agg->nonfull_next)) hlist_del_init(&agg->nonfull_next); + q->wsum -= agg->class_weight; + if (q->wsum != 0) + q->iwsum = ONE_FP / q->wsum; + if (q->in_serv_agg == agg) q->in_serv_agg = qfq_choose_next_agg(q); kfree(agg); @@ -821,44 +826,73 @@ static void qfq_make_eligible(struct qfq_sched *q) unsigned long old_vslot = q->oldV >> q->min_slot_shift; if (vslot != old_vslot) { - unsigned long mask = (1ULL << fls(vslot ^ old_vslot)) - 1; + unsigned long mask; + int last_flip_pos = fls(vslot ^ old_vslot); + + if (last_flip_pos > 31) /* higher than the number of groups */ + mask = ~0UL; /* make all groups eligible */ + else + mask = (1UL << last_flip_pos) - 1; + qfq_move_groups(q, mask, IR, ER); qfq_move_groups(q, mask, IB, EB); } } - /* - * The index of the slot in which the aggregate is to be inserted must - * not be higher than QFQ_MAX_SLOTS-2. There is a '-2' and not a '-1' - * because the start time of the group may be moved backward by one - * slot after the aggregate has been inserted, and this would cause - * non-empty slots to be right-shifted by one position. + * The index of the slot in which the input aggregate agg is to be + * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2' + * and not a '-1' because the start time of the group may be moved + * backward by one slot after the aggregate has been inserted, and + * this would cause non-empty slots to be right-shifted by one + * position. * - * If the weight and lmax (max_pkt_size) of the classes do not change, - * then QFQ+ does meet the above contraint according to the current - * values of its parameters. In fact, if the weight and lmax of the - * classes do not change, then, from the theory, QFQ+ guarantees that - * the slot index is never higher than - * 2 + QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) * - * (QFQ_MAX_WEIGHT/QFQ_MAX_WSUM) = 2 + 8 * 128 * (1 / 64) = 18 + * QFQ+ fully satisfies this bound to the slot index if the parameters + * of the classes are not changed dynamically, and if QFQ+ never + * happens to postpone the service of agg unjustly, i.e., it never + * happens that the aggregate becomes backlogged and eligible, or just + * eligible, while an aggregate with a higher approximated finish time + * is being served. In particular, in this case QFQ+ guarantees that + * the timestamps of agg are low enough that the slot index is never + * higher than 2. Unfortunately, QFQ+ cannot provide the same + * guarantee if it happens to unjustly postpone the service of agg, or + * if the parameters of some class are changed. * - * When the weight of a class is increased or the lmax of the class is - * decreased, a new aggregate with smaller slot size than the original - * parent aggregate of the class may happen to be activated. The - * activation of this aggregate should be properly delayed to when the - * service of the class has finished in the ideal system tracked by - * QFQ+. If the activation of the aggregate is not delayed to this - * reference time instant, then this aggregate may be unjustly served - * before other aggregates waiting for service. This may cause the - * above bound to the slot index to be violated for some of these - * unlucky aggregates. + * As for the first event, i.e., an out-of-order service, the + * upper bound to the slot index guaranteed by QFQ+ grows to + * 2 + + * QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) * + * (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1. + * + * The following function deals with this problem by backward-shifting + * the timestamps of agg, if needed, so as to guarantee that the slot + * index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may + * cause the service of other aggregates to be postponed, yet the + * worst-case guarantees of these aggregates are not violated. In + * fact, in case of no out-of-order service, the timestamps of agg + * would have been even lower than they are after the backward shift, + * because QFQ+ would have guaranteed a maximum value equal to 2 for + * the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose + * service is postponed because of the backward-shift would have + * however waited for the service of agg before being served. + * + * The other event that may cause the slot index to be higher than 2 + * for agg is a recent change of the parameters of some class. If the + * weight of a class is increased or the lmax (max_pkt_size) of the + * class is decreased, then a new aggregate with smaller slot size + * than the original parent aggregate of the class may happen to be + * activated. The activation of this aggregate should be properly + * delayed to when the service of the class has finished in the ideal + * system tracked by QFQ+. If the activation of the aggregate is not + * delayed to this reference time instant, then this aggregate may be + * unjustly served before other aggregates waiting for service. This + * may cause the above bound to the slot index to be violated for some + * of these unlucky aggregates. * * Instead of delaying the activation of the new aggregate, which is - * quite complex, the following inaccurate but simple solution is used: - * if the slot index is higher than QFQ_MAX_SLOTS-2, then the - * timestamps of the aggregate are shifted backward so as to let the - * slot index become equal to QFQ_MAX_SLOTS-2. + * quite complex, the above-discussed capping of the slot index is + * used to handle also the consequences of a change of the parameters + * of a class. */ static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg, u64 roundedS) @@ -1003,9 +1037,61 @@ static inline void charge_actual_service(struct qfq_aggregate *agg) agg->F = agg->S + (u64)service_received * agg->inv_w; } -static inline void qfq_update_agg_ts(struct qfq_sched *q, - struct qfq_aggregate *agg, - enum update_reason reason); +/* Assign a reasonable start time for a new aggregate in group i. + * Admissible values for \hat(F) are multiples of \sigma_i + * no greater than V+\sigma_i . Larger values mean that + * we had a wraparound so we consider the timestamp to be stale. + * + * If F is not stale and F >= V then we set S = F. + * Otherwise we should assign S = V, but this may violate + * the ordering in EB (see [2]). So, if we have groups in ER, + * set S to the F_j of the first group j which would be blocking us. + * We are guaranteed not to move S backward because + * otherwise our group i would still be blocked. + */ +static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg) +{ + unsigned long mask; + u64 limit, roundedF; + int slot_shift = agg->grp->slot_shift; + + roundedF = qfq_round_down(agg->F, slot_shift); + limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift); + + if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) { + /* timestamp was stale */ + mask = mask_from(q->bitmaps[ER], agg->grp->index); + if (mask) { + struct qfq_group *next = qfq_ffs(q, mask); + if (qfq_gt(roundedF, next->F)) { + if (qfq_gt(limit, next->F)) + agg->S = next->F; + else /* preserve timestamp correctness */ + agg->S = limit; + return; + } + } + agg->S = q->V; + } else /* timestamp is not stale */ + agg->S = agg->F; +} + +/* Update the timestamps of agg before scheduling/rescheduling it for + * service. In particular, assign to agg->F its maximum possible + * value, i.e., the virtual finish time with which the aggregate + * should be labeled if it used all its budget once in service. + */ +static inline void +qfq_update_agg_ts(struct qfq_sched *q, + struct qfq_aggregate *agg, enum update_reason reason) +{ + if (reason != requeue) + qfq_update_start(q, agg); + else /* just charge agg for the service received */ + agg->S = agg->F; + + agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w; +} static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg); @@ -1077,7 +1163,7 @@ static struct sk_buff *qfq_dequeue(struct Qdisc *sch) else in_serv_agg->budget -= len; - q->V += (u64)len * IWSUM; + q->V += (u64)len * q->iwsum; pr_debug("qfq dequeue: len %u F %lld now %lld\n", len, (unsigned long long) in_serv_agg->F, (unsigned long long) q->V); @@ -1128,66 +1214,6 @@ static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q) return agg; } -/* - * Assign a reasonable start time for a new aggregate in group i. - * Admissible values for \hat(F) are multiples of \sigma_i - * no greater than V+\sigma_i . Larger values mean that - * we had a wraparound so we consider the timestamp to be stale. - * - * If F is not stale and F >= V then we set S = F. - * Otherwise we should assign S = V, but this may violate - * the ordering in EB (see [2]). So, if we have groups in ER, - * set S to the F_j of the first group j which would be blocking us. - * We are guaranteed not to move S backward because - * otherwise our group i would still be blocked. - */ -static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg) -{ - unsigned long mask; - u64 limit, roundedF; - int slot_shift = agg->grp->slot_shift; - - roundedF = qfq_round_down(agg->F, slot_shift); - limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift); - - if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) { - /* timestamp was stale */ - mask = mask_from(q->bitmaps[ER], agg->grp->index); - if (mask) { - struct qfq_group *next = qfq_ffs(q, mask); - if (qfq_gt(roundedF, next->F)) { - if (qfq_gt(limit, next->F)) - agg->S = next->F; - else /* preserve timestamp correctness */ - agg->S = limit; - return; - } - } - agg->S = q->V; - } else /* timestamp is not stale */ - agg->S = agg->F; -} - -/* - * Update the timestamps of agg before scheduling/rescheduling it for - * service. In particular, assign to agg->F its maximum possible - * value, i.e., the virtual finish time with which the aggregate - * should be labeled if it used all its budget once in service. - */ -static inline void -qfq_update_agg_ts(struct qfq_sched *q, - struct qfq_aggregate *agg, enum update_reason reason) -{ - if (reason != requeue) - qfq_update_start(q, agg); - else /* just charge agg for the service received */ - agg->S = agg->F; - - agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w; -} - -static void qfq_schedule_agg(struct qfq_sched *, struct qfq_aggregate *); - static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 3811e8e..d166360 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -21,6 +21,7 @@ #include <net/netlink.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> +#include <net/tcp.h> /* Simple Token Bucket Filter. @@ -129,14 +130,69 @@ struct tbf_sched_data { struct qdisc_watchdog watchdog; /* Watchdog timer */ }; + +/* + * Return length of individual segments of a gso packet, + * including all headers (MAC, IP, TCP/UDP) + */ +static unsigned int skb_gso_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + const struct skb_shared_info *shinfo = skb_shinfo(skb); + + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) + hdr_len += tcp_hdrlen(skb); + else + hdr_len += sizeof(struct udphdr); + return hdr_len + shinfo->gso_size; +} + +/* GSO packet is too big, segment it so that tbf can transmit + * each segment in time + */ +static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + struct sk_buff *segs, *nskb; + netdev_features_t features = netif_skb_features(skb); + int ret, nb; + + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + + if (IS_ERR_OR_NULL(segs)) + return qdisc_reshape_fail(skb, sch); + + nb = 0; + while (segs) { + nskb = segs->next; + segs->next = NULL; + qdisc_skb_cb(segs)->pkt_len = segs->len; + ret = qdisc_enqueue(segs, q->qdisc); + if (ret != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; + } else { + nb++; + } + segs = nskb; + } + sch->q.qlen += nb; + if (nb > 1) + qdisc_tree_decrease_qlen(sch, 1 - nb); + consume_skb(skb); + return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; +} + static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); int ret; - if (qdisc_pkt_len(skb) > q->max_size) + if (qdisc_pkt_len(skb) > q->max_size) { + if (skb_is_gso(skb) && skb_gso_seglen(skb) <= q->max_size) + return tbf_segment(skb, sch); return qdisc_reshape_fail(skb, sch); - + } ret = qdisc_enqueue(skb, q->qdisc); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) @@ -236,20 +292,23 @@ static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = { [TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) }, [TCA_TBF_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, [TCA_TBF_PTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, + [TCA_TBF_RATE64] = { .type = NLA_U64 }, + [TCA_TBF_PRATE64] = { .type = NLA_U64 }, }; static int tbf_change(struct Qdisc *sch, struct nlattr *opt) { int err; struct tbf_sched_data *q = qdisc_priv(sch); - struct nlattr *tb[TCA_TBF_PTAB + 1]; + struct nlattr *tb[TCA_TBF_MAX + 1]; struct tc_tbf_qopt *qopt; struct qdisc_rate_table *rtab = NULL; struct qdisc_rate_table *ptab = NULL; struct Qdisc *child = NULL; int max_size, n; + u64 rate64 = 0, prate64 = 0; - err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy); + err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy); if (err < 0) return err; @@ -286,6 +345,11 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) if (max_size < 0) goto done; + if (max_size < psched_mtu(qdisc_dev(sch))) + pr_warn_ratelimited("sch_tbf: burst %u is lower than device %s mtu (%u) !\n", + max_size, qdisc_dev(sch)->name, + psched_mtu(qdisc_dev(sch))); + if (q->qdisc != &noop_qdisc) { err = fifo_set_limit(q->qdisc, qopt->limit); if (err) @@ -311,9 +375,13 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) q->tokens = q->buffer; q->ptokens = q->mtu; - psched_ratecfg_precompute(&q->rate, &rtab->rate); + if (tb[TCA_TBF_RATE64]) + rate64 = nla_get_u64(tb[TCA_TBF_RATE64]); + psched_ratecfg_precompute(&q->rate, &rtab->rate, rate64); if (ptab) { - psched_ratecfg_precompute(&q->peak, &ptab->rate); + if (tb[TCA_TBF_PRATE64]) + prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]); + psched_ratecfg_precompute(&q->peak, &ptab->rate, prate64); q->peak_present = true; } else { q->peak_present = false; @@ -380,6 +448,13 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) opt.buffer = PSCHED_NS2TICKS(q->buffer); if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; + if (q->rate.rate_bytes_ps >= (1ULL << 32) && + nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps)) + goto nla_put_failure; + if (q->peak_present && + q->peak.rate_bytes_ps >= (1ULL << 32) && + nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps)) + goto nla_put_failure; nla_nest_end(skb, nest); return skb->len; |