summaryrefslogtreecommitdiff
path: root/net/sched
diff options
context:
space:
mode:
authorScott Wood <scottwood@freescale.com>2013-12-14 01:15:24 (GMT)
committerScott Wood <scottwood@freescale.com>2013-12-14 01:15:24 (GMT)
commitb7c81aa3ab2ac2c140e278b6d0e9a0b95112cf0b (patch)
tree87828aaf5f82c7042bfc0307bc4ac499e10f93fb /net/sched
parent22c782a4b14773fab7eab3c1db54ad7ad077e9b8 (diff)
parent78fd82238d0e5716578c326404184a27ba67fd6e (diff)
downloadlinux-fsl-qoriq-b7c81aa3ab2ac2c140e278b6d0e9a0b95112cf0b.tar.xz
Merge remote-tracking branch 'linus/master' into merge
Conflicts: Documentation/hwmon/ina2xx arch/powerpc/Kconfig arch/powerpc/boot/dts/b4860emu.dts arch/powerpc/boot/dts/b4qds.dtsi arch/powerpc/boot/dts/fsl/b4si-post.dtsi arch/powerpc/boot/dts/fsl/qoriq-sec6.0-0.dtsi arch/powerpc/boot/dts/p1023rdb.dts arch/powerpc/boot/dts/t4240emu.dts arch/powerpc/boot/dts/t4240qds.dts arch/powerpc/configs/85xx/p1023_defconfig arch/powerpc/configs/corenet32_smp_defconfig arch/powerpc/configs/corenet64_smp_defconfig arch/powerpc/configs/mpc85xx_smp_defconfig arch/powerpc/include/asm/cputable.h arch/powerpc/include/asm/device.h arch/powerpc/include/asm/epapr_hcalls.h arch/powerpc/include/asm/kvm_host.h arch/powerpc/include/asm/mpic.h arch/powerpc/include/asm/pci.h arch/powerpc/include/asm/ppc-opcode.h arch/powerpc/include/asm/ppc_asm.h arch/powerpc/include/asm/reg_booke.h arch/powerpc/kernel/epapr_paravirt.c arch/powerpc/kernel/process.c arch/powerpc/kernel/prom.c arch/powerpc/kernel/setup-common.c arch/powerpc/kernel/setup_32.c arch/powerpc/kernel/setup_64.c arch/powerpc/kernel/smp.c arch/powerpc/kernel/swsusp_asm64.S arch/powerpc/kernel/swsusp_booke.S arch/powerpc/kvm/book3s_pr.c arch/powerpc/kvm/booke.c arch/powerpc/kvm/booke.h arch/powerpc/kvm/e500.c arch/powerpc/kvm/e500.h arch/powerpc/kvm/e500_emulate.c arch/powerpc/kvm/e500mc.c arch/powerpc/kvm/powerpc.c arch/powerpc/perf/e6500-pmu.c arch/powerpc/platforms/85xx/Kconfig arch/powerpc/platforms/85xx/Makefile arch/powerpc/platforms/85xx/b4_qds.c arch/powerpc/platforms/85xx/c293pcie.c arch/powerpc/platforms/85xx/corenet_ds.c arch/powerpc/platforms/85xx/corenet_ds.h arch/powerpc/platforms/85xx/p1023_rds.c arch/powerpc/platforms/85xx/p2041_rdb.c arch/powerpc/platforms/85xx/p3041_ds.c arch/powerpc/platforms/85xx/p4080_ds.c arch/powerpc/platforms/85xx/p5020_ds.c arch/powerpc/platforms/85xx/p5040_ds.c arch/powerpc/platforms/85xx/smp.c arch/powerpc/platforms/85xx/t4240_qds.c arch/powerpc/platforms/Kconfig arch/powerpc/sysdev/Makefile arch/powerpc/sysdev/fsl_mpic_timer_wakeup.c arch/powerpc/sysdev/fsl_msi.c arch/powerpc/sysdev/fsl_pci.c arch/powerpc/sysdev/fsl_pci.h arch/powerpc/sysdev/fsl_soc.h arch/powerpc/sysdev/mpic.c arch/powerpc/sysdev/mpic_timer.c drivers/Kconfig drivers/clk/Kconfig drivers/clk/clk-ppc-corenet.c drivers/cpufreq/Kconfig.powerpc drivers/cpufreq/Makefile drivers/cpufreq/ppc-corenet-cpufreq.c drivers/crypto/caam/Kconfig drivers/crypto/caam/Makefile drivers/crypto/caam/ctrl.c drivers/crypto/caam/desc_constr.h drivers/crypto/caam/intern.h drivers/crypto/caam/jr.c drivers/crypto/caam/regs.h drivers/dma/fsldma.c drivers/hwmon/ina2xx.c drivers/iommu/Kconfig drivers/iommu/fsl_pamu.c drivers/iommu/fsl_pamu.h drivers/iommu/fsl_pamu_domain.c drivers/iommu/fsl_pamu_domain.h drivers/misc/Makefile drivers/mmc/card/block.c drivers/mmc/core/core.c drivers/mmc/host/sdhci-esdhc.h drivers/mmc/host/sdhci-pltfm.c drivers/mtd/nand/fsl_ifc_nand.c drivers/net/ethernet/freescale/gianfar.c drivers/net/ethernet/freescale/gianfar.h drivers/net/ethernet/freescale/gianfar_ethtool.c drivers/net/phy/at803x.c drivers/net/phy/phy_device.c drivers/net/phy/vitesse.c drivers/pci/msi.c drivers/staging/Kconfig drivers/staging/Makefile drivers/uio/Kconfig drivers/uio/Makefile drivers/uio/uio.c drivers/usb/host/ehci-fsl.c drivers/vfio/Kconfig drivers/vfio/Makefile include/crypto/algapi.h include/linux/iommu.h include/linux/mmc/sdhci.h include/linux/msi.h include/linux/netdev_features.h include/linux/phy.h include/linux/skbuff.h include/net/ip.h include/uapi/linux/vfio.h net/core/ethtool.c net/ipv4/route.c net/ipv6/route.c
Diffstat (limited to 'net/sched')
-rw-r--r--net/sched/Kconfig24
-rw-r--r--net/sched/Makefile2
-rw-r--r--net/sched/act_mirred.c2
-rw-r--r--net/sched/act_police.c4
-rw-r--r--net/sched/cls_basic.c2
-rw-r--r--net/sched/cls_bpf.c385
-rw-r--r--net/sched/cls_cgroup.c43
-rw-r--r--net/sched/em_ipset.c7
-rw-r--r--net/sched/em_meta.c4
-rw-r--r--net/sched/sch_api.c97
-rw-r--r--net/sched/sch_atm.c1
-rw-r--r--net/sched/sch_cbq.c3
-rw-r--r--net/sched/sch_choke.c3
-rw-r--r--net/sched/sch_drr.c2
-rw-r--r--net/sched/sch_fq.c822
-rw-r--r--net/sched/sch_generic.c83
-rw-r--r--net/sched/sch_hfsc.c2
-rw-r--r--net/sched/sch_htb.c293
-rw-r--r--net/sched/sch_mq.c2
-rw-r--r--net/sched/sch_mqprio.c2
-rw-r--r--net/sched/sch_netem.c141
-rw-r--r--net/sched/sch_qfq.c214
-rw-r--r--net/sched/sch_tbf.c87
23 files changed, 1898 insertions, 327 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 235e01a..ad1f1d8 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -272,6 +272,20 @@ config NET_SCH_FQ_CODEL
If unsure, say N.
+config NET_SCH_FQ
+ tristate "Fair Queue"
+ help
+ Say Y here if you want to use the FQ packet scheduling algorithm.
+
+ FQ does flow separation, and is able to respect pacing requirements
+ set by TCP stack into sk->sk_pacing_rate (for localy generated
+ traffic)
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_fq.
+
+ If unsure, say N.
+
config NET_SCH_INGRESS
tristate "Ingress Qdisc"
depends on NET_CLS_ACT
@@ -429,6 +443,16 @@ config NET_CLS_CGROUP
To compile this code as a module, choose M here: the
module will be called cls_cgroup.
+config NET_CLS_BPF
+ tristate "BPF-based classifier"
+ select NET_CLS
+ ---help---
+ If you say Y here, you will be able to classify packets based on
+ programmable BPF (JIT'ed) filters as an alternative to ematches.
+
+ To compile this code as a module, choose M here: the module will
+ be called cls_bpf.
+
config NET_EMATCH
bool "Extended Matches"
select NET_CLS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 978cbf0..35fa47a 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o
obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o
obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o
+obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
@@ -49,6 +50,7 @@ obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o
obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o
obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o
obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o
+obj-$(CONFIG_NET_CLS_BPF) += cls_bpf.o
obj-$(CONFIG_NET_EMATCH) += ematch.o
obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 5d676ed..977c10e 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -243,7 +243,7 @@ nla_put_failure:
static int mirred_device_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct tcf_mirred *m;
if (event == NETDEV_UNREGISTER)
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 189e3c5..272d8e9 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -231,14 +231,14 @@ override:
}
if (R_tab) {
police->rate_present = true;
- psched_ratecfg_precompute(&police->rate, &R_tab->rate);
+ psched_ratecfg_precompute(&police->rate, &R_tab->rate, 0);
qdisc_put_rtab(R_tab);
} else {
police->rate_present = false;
}
if (P_tab) {
police->peak_present = true;
- psched_ratecfg_precompute(&police->peak, &P_tab->rate);
+ psched_ratecfg_precompute(&police->peak, &P_tab->rate, 0);
qdisc_put_rtab(P_tab);
} else {
police->peak_present = false;
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index d76a35d..636d913 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -137,7 +137,7 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp,
struct nlattr **tb,
struct nlattr *est)
{
- int err = -EINVAL;
+ int err;
struct tcf_exts e;
struct tcf_ematch_tree t;
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
new file mode 100644
index 0000000..1002a82
--- /dev/null
+++ b/net/sched/cls_bpf.c
@@ -0,0 +1,385 @@
+/*
+ * Berkeley Packet Filter based traffic classifier
+ *
+ * Might be used to classify traffic through flexible, user-defined and
+ * possibly JIT-ed BPF filters for traffic control as an alternative to
+ * ematches.
+ *
+ * (C) 2013 Daniel Borkmann <dborkman@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/filter.h>
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+#include <net/sock.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
+MODULE_DESCRIPTION("TC BPF based classifier");
+
+struct cls_bpf_head {
+ struct list_head plist;
+ u32 hgen;
+};
+
+struct cls_bpf_prog {
+ struct sk_filter *filter;
+ struct sock_filter *bpf_ops;
+ struct tcf_exts exts;
+ struct tcf_result res;
+ struct list_head link;
+ u32 handle;
+ u16 bpf_len;
+};
+
+static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
+ [TCA_BPF_CLASSID] = { .type = NLA_U32 },
+ [TCA_BPF_OPS_LEN] = { .type = NLA_U16 },
+ [TCA_BPF_OPS] = { .type = NLA_BINARY,
+ .len = sizeof(struct sock_filter) * BPF_MAXINSNS },
+};
+
+static const struct tcf_ext_map bpf_ext_map = {
+ .action = TCA_BPF_ACT,
+ .police = TCA_BPF_POLICE,
+};
+
+static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ struct cls_bpf_head *head = tp->root;
+ struct cls_bpf_prog *prog;
+ int ret;
+
+ list_for_each_entry(prog, &head->plist, link) {
+ int filter_res = SK_RUN_FILTER(prog->filter, skb);
+
+ if (filter_res == 0)
+ continue;
+
+ *res = prog->res;
+ if (filter_res != -1)
+ res->classid = filter_res;
+
+ ret = tcf_exts_exec(skb, &prog->exts, res);
+ if (ret < 0)
+ continue;
+
+ return ret;
+ }
+
+ return -1;
+}
+
+static int cls_bpf_init(struct tcf_proto *tp)
+{
+ struct cls_bpf_head *head;
+
+ head = kzalloc(sizeof(*head), GFP_KERNEL);
+ if (head == NULL)
+ return -ENOBUFS;
+
+ INIT_LIST_HEAD(&head->plist);
+ tp->root = head;
+
+ return 0;
+}
+
+static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog)
+{
+ tcf_unbind_filter(tp, &prog->res);
+ tcf_exts_destroy(tp, &prog->exts);
+
+ sk_unattached_filter_destroy(prog->filter);
+
+ kfree(prog->bpf_ops);
+ kfree(prog);
+}
+
+static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
+{
+ struct cls_bpf_head *head = tp->root;
+ struct cls_bpf_prog *prog, *todel = (struct cls_bpf_prog *) arg;
+
+ list_for_each_entry(prog, &head->plist, link) {
+ if (prog == todel) {
+ tcf_tree_lock(tp);
+ list_del(&prog->link);
+ tcf_tree_unlock(tp);
+
+ cls_bpf_delete_prog(tp, prog);
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
+static void cls_bpf_destroy(struct tcf_proto *tp)
+{
+ struct cls_bpf_head *head = tp->root;
+ struct cls_bpf_prog *prog, *tmp;
+
+ list_for_each_entry_safe(prog, tmp, &head->plist, link) {
+ list_del(&prog->link);
+ cls_bpf_delete_prog(tp, prog);
+ }
+
+ kfree(head);
+}
+
+static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle)
+{
+ struct cls_bpf_head *head = tp->root;
+ struct cls_bpf_prog *prog;
+ unsigned long ret = 0UL;
+
+ if (head == NULL)
+ return 0UL;
+
+ list_for_each_entry(prog, &head->plist, link) {
+ if (prog->handle == handle) {
+ ret = (unsigned long) prog;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static void cls_bpf_put(struct tcf_proto *tp, unsigned long f)
+{
+}
+
+static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
+ struct cls_bpf_prog *prog,
+ unsigned long base, struct nlattr **tb,
+ struct nlattr *est)
+{
+ struct sock_filter *bpf_ops, *bpf_old;
+ struct tcf_exts exts;
+ struct sock_fprog tmp;
+ struct sk_filter *fp, *fp_old;
+ u16 bpf_size, bpf_len;
+ u32 classid;
+ int ret;
+
+ if (!tb[TCA_BPF_OPS_LEN] || !tb[TCA_BPF_OPS] || !tb[TCA_BPF_CLASSID])
+ return -EINVAL;
+
+ ret = tcf_exts_validate(net, tp, tb, est, &exts, &bpf_ext_map);
+ if (ret < 0)
+ return ret;
+
+ classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
+ bpf_len = nla_get_u16(tb[TCA_BPF_OPS_LEN]);
+ if (bpf_len > BPF_MAXINSNS || bpf_len == 0) {
+ ret = -EINVAL;
+ goto errout;
+ }
+
+ bpf_size = bpf_len * sizeof(*bpf_ops);
+ bpf_ops = kzalloc(bpf_size, GFP_KERNEL);
+ if (bpf_ops == NULL) {
+ ret = -ENOMEM;
+ goto errout;
+ }
+
+ memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size);
+
+ tmp.len = bpf_len;
+ tmp.filter = (struct sock_filter __user *) bpf_ops;
+
+ ret = sk_unattached_filter_create(&fp, &tmp);
+ if (ret)
+ goto errout_free;
+
+ tcf_tree_lock(tp);
+ fp_old = prog->filter;
+ bpf_old = prog->bpf_ops;
+
+ prog->bpf_len = bpf_len;
+ prog->bpf_ops = bpf_ops;
+ prog->filter = fp;
+ prog->res.classid = classid;
+ tcf_tree_unlock(tp);
+
+ tcf_bind_filter(tp, &prog->res, base);
+ tcf_exts_change(tp, &prog->exts, &exts);
+
+ if (fp_old)
+ sk_unattached_filter_destroy(fp_old);
+ if (bpf_old)
+ kfree(bpf_old);
+
+ return 0;
+
+errout_free:
+ kfree(bpf_ops);
+errout:
+ tcf_exts_destroy(tp, &exts);
+ return ret;
+}
+
+static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp,
+ struct cls_bpf_head *head)
+{
+ unsigned int i = 0x80000000;
+
+ do {
+ if (++head->hgen == 0x7FFFFFFF)
+ head->hgen = 1;
+ } while (--i > 0 && cls_bpf_get(tp, head->hgen));
+ if (i == 0)
+ pr_err("Insufficient number of handles\n");
+
+ return i;
+}
+
+static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
+ u32 handle, struct nlattr **tca,
+ unsigned long *arg)
+{
+ struct cls_bpf_head *head = tp->root;
+ struct cls_bpf_prog *prog = (struct cls_bpf_prog *) *arg;
+ struct nlattr *tb[TCA_BPF_MAX + 1];
+ int ret;
+
+ if (tca[TCA_OPTIONS] == NULL)
+ return -EINVAL;
+
+ ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy);
+ if (ret < 0)
+ return ret;
+
+ if (prog != NULL) {
+ if (handle && prog->handle != handle)
+ return -EINVAL;
+ return cls_bpf_modify_existing(net, tp, prog, base, tb,
+ tca[TCA_RATE]);
+ }
+
+ prog = kzalloc(sizeof(*prog), GFP_KERNEL);
+ if (prog == NULL)
+ return -ENOBUFS;
+
+ if (handle == 0)
+ prog->handle = cls_bpf_grab_new_handle(tp, head);
+ else
+ prog->handle = handle;
+ if (prog->handle == 0) {
+ ret = -EINVAL;
+ goto errout;
+ }
+
+ ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE]);
+ if (ret < 0)
+ goto errout;
+
+ tcf_tree_lock(tp);
+ list_add(&prog->link, &head->plist);
+ tcf_tree_unlock(tp);
+
+ *arg = (unsigned long) prog;
+
+ return 0;
+errout:
+ if (*arg == 0UL && prog)
+ kfree(prog);
+
+ return ret;
+}
+
+static int cls_bpf_dump(struct tcf_proto *tp, unsigned long fh,
+ struct sk_buff *skb, struct tcmsg *tm)
+{
+ struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh;
+ struct nlattr *nest, *nla;
+
+ if (prog == NULL)
+ return skb->len;
+
+ tm->tcm_handle = prog->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid))
+ goto nla_put_failure;
+ if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_len))
+ goto nla_put_failure;
+
+ nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_len *
+ sizeof(struct sock_filter));
+ if (nla == NULL)
+ goto nla_put_failure;
+
+ memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla));
+
+ if (tcf_exts_dump(skb, &prog->exts, &bpf_ext_map) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &prog->exts, &bpf_ext_map) < 0)
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct cls_bpf_head *head = tp->root;
+ struct cls_bpf_prog *prog;
+
+ list_for_each_entry(prog, &head->plist, link) {
+ if (arg->count < arg->skip)
+ goto skip;
+ if (arg->fn(tp, (unsigned long) prog, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+skip:
+ arg->count++;
+ }
+}
+
+static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
+ .kind = "bpf",
+ .owner = THIS_MODULE,
+ .classify = cls_bpf_classify,
+ .init = cls_bpf_init,
+ .destroy = cls_bpf_destroy,
+ .get = cls_bpf_get,
+ .put = cls_bpf_put,
+ .change = cls_bpf_change,
+ .delete = cls_bpf_delete,
+ .walk = cls_bpf_walk,
+ .dump = cls_bpf_dump,
+};
+
+static int __init cls_bpf_init_mod(void)
+{
+ return register_tcf_proto_ops(&cls_bpf_ops);
+}
+
+static void __exit cls_bpf_exit_mod(void)
+{
+ unregister_tcf_proto_ops(&cls_bpf_ops);
+}
+
+module_init(cls_bpf_init_mod);
+module_exit(cls_bpf_exit_mod);
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 3a294eb..16006c9 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -23,19 +23,18 @@
#include <net/sock.h>
#include <net/cls_cgroup.h>
-static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp)
+static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css)
{
- return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id),
- struct cgroup_cls_state, css);
+ return css ? container_of(css, struct cgroup_cls_state, css) : NULL;
}
static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p)
{
- return container_of(task_subsys_state(p, net_cls_subsys_id),
- struct cgroup_cls_state, css);
+ return css_cls_state(task_css(p, net_cls_subsys_id));
}
-static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp)
+static struct cgroup_subsys_state *
+cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct cgroup_cls_state *cs;
@@ -45,17 +44,19 @@ static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp)
return &cs->css;
}
-static int cgrp_css_online(struct cgroup *cgrp)
+static int cgrp_css_online(struct cgroup_subsys_state *css)
{
- if (cgrp->parent)
- cgrp_cls_state(cgrp)->classid =
- cgrp_cls_state(cgrp->parent)->classid;
+ struct cgroup_cls_state *cs = css_cls_state(css);
+ struct cgroup_cls_state *parent = css_cls_state(css_parent(css));
+
+ if (parent)
+ cs->classid = parent->classid;
return 0;
}
-static void cgrp_css_free(struct cgroup *cgrp)
+static void cgrp_css_free(struct cgroup_subsys_state *css)
{
- kfree(cgrp_cls_state(cgrp));
+ kfree(css_cls_state(css));
}
static int update_classid(const void *v, struct file *file, unsigned n)
@@ -67,27 +68,29 @@ static int update_classid(const void *v, struct file *file, unsigned n)
return 0;
}
-static void cgrp_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+static void cgrp_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
{
struct task_struct *p;
- void *v;
+ struct cgroup_cls_state *cs = css_cls_state(css);
+ void *v = (void *)(unsigned long)cs->classid;
- cgroup_taskset_for_each(p, cgrp, tset) {
+ cgroup_taskset_for_each(p, css, tset) {
task_lock(p);
- v = (void *)(unsigned long)task_cls_classid(p);
iterate_fd(p->files, 0, update_classid, v);
task_unlock(p);
}
}
-static u64 read_classid(struct cgroup *cgrp, struct cftype *cft)
+static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft)
{
- return cgrp_cls_state(cgrp)->classid;
+ return css_cls_state(css)->classid;
}
-static int write_classid(struct cgroup *cgrp, struct cftype *cft, u64 value)
+static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 value)
{
- cgrp_cls_state(cgrp)->classid = (u32) value;
+ css_cls_state(css)->classid = (u32) value;
return 0;
}
diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c
index 938b7cb..527aeb7 100644
--- a/net/sched/em_ipset.c
+++ b/net/sched/em_ipset.c
@@ -24,11 +24,12 @@ static int em_ipset_change(struct tcf_proto *tp, void *data, int data_len,
{
struct xt_set_info *set = data;
ip_set_id_t index;
+ struct net *net = dev_net(qdisc_dev(tp->q));
if (data_len != sizeof(*set))
return -EINVAL;
- index = ip_set_nfnl_get_byindex(set->index);
+ index = ip_set_nfnl_get_byindex(net, set->index);
if (index == IPSET_INVALID_ID)
return -ENOENT;
@@ -37,7 +38,7 @@ static int em_ipset_change(struct tcf_proto *tp, void *data, int data_len,
if (em->data)
return 0;
- ip_set_nfnl_put(index);
+ ip_set_nfnl_put(net, index);
return -ENOMEM;
}
@@ -45,7 +46,7 @@ static void em_ipset_destroy(struct tcf_proto *p, struct tcf_ematch *em)
{
const struct xt_set_info *set = (const void *) em->data;
if (set) {
- ip_set_nfnl_put(set->index);
+ ip_set_nfnl_put(dev_net(qdisc_dev(p->q)), set->index);
kfree((void *) em->data);
}
}
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 7c3de6f..e5cef956 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -793,8 +793,10 @@ static int em_meta_change(struct tcf_proto *tp, void *data, int len,
goto errout;
meta = kzalloc(sizeof(*meta), GFP_KERNEL);
- if (meta == NULL)
+ if (meta == NULL) {
+ err = -ENOMEM;
goto errout;
+ }
memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left));
memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right));
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 281c1bd..cd81505 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -200,6 +200,58 @@ int unregister_qdisc(struct Qdisc_ops *qops)
}
EXPORT_SYMBOL(unregister_qdisc);
+/* Get default qdisc if not otherwise specified */
+void qdisc_get_default(char *name, size_t len)
+{
+ read_lock(&qdisc_mod_lock);
+ strlcpy(name, default_qdisc_ops->id, len);
+ read_unlock(&qdisc_mod_lock);
+}
+
+static struct Qdisc_ops *qdisc_lookup_default(const char *name)
+{
+ struct Qdisc_ops *q = NULL;
+
+ for (q = qdisc_base; q; q = q->next) {
+ if (!strcmp(name, q->id)) {
+ if (!try_module_get(q->owner))
+ q = NULL;
+ break;
+ }
+ }
+
+ return q;
+}
+
+/* Set new default qdisc to use */
+int qdisc_set_default(const char *name)
+{
+ const struct Qdisc_ops *ops;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ write_lock(&qdisc_mod_lock);
+ ops = qdisc_lookup_default(name);
+ if (!ops) {
+ /* Not found, drop lock and try to load module */
+ write_unlock(&qdisc_mod_lock);
+ request_module("sch_%s", name);
+ write_lock(&qdisc_mod_lock);
+
+ ops = qdisc_lookup_default(name);
+ }
+
+ if (ops) {
+ /* Set new default */
+ module_put(default_qdisc_ops->owner);
+ default_qdisc_ops = ops;
+ }
+ write_unlock(&qdisc_mod_lock);
+
+ return ops ? 0 : -ENOENT;
+}
+
/* We know handle. Find qdisc among all qdisc's attached to device
(root qdisc, all its children, children of children etc.)
*/
@@ -285,6 +337,45 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
return q;
}
+/* The linklayer setting were not transferred from iproute2, in older
+ * versions, and the rate tables lookup systems have been dropped in
+ * the kernel. To keep backward compatible with older iproute2 tc
+ * utils, we detect the linklayer setting by detecting if the rate
+ * table were modified.
+ *
+ * For linklayer ATM table entries, the rate table will be aligned to
+ * 48 bytes, thus some table entries will contain the same value. The
+ * mpu (min packet unit) is also encoded into the old rate table, thus
+ * starting from the mpu, we find low and high table entries for
+ * mapping this cell. If these entries contain the same value, when
+ * the rate tables have been modified for linklayer ATM.
+ *
+ * This is done by rounding mpu to the nearest 48 bytes cell/entry,
+ * and then roundup to the next cell, calc the table entry one below,
+ * and compare.
+ */
+static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
+{
+ int low = roundup(r->mpu, 48);
+ int high = roundup(low+1, 48);
+ int cell_low = low >> r->cell_log;
+ int cell_high = (high >> r->cell_log) - 1;
+
+ /* rtab is too inaccurate at rates > 100Mbit/s */
+ if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
+ pr_debug("TC linklayer: Giving up ATM detection\n");
+ return TC_LINKLAYER_ETHERNET;
+ }
+
+ if ((cell_high > cell_low) && (cell_high < 256)
+ && (rtab[cell_low] == rtab[cell_high])) {
+ pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
+ cell_low, cell_high, rtab[cell_high]);
+ return TC_LINKLAYER_ATM;
+ }
+ return TC_LINKLAYER_ETHERNET;
+}
+
static struct qdisc_rate_table *qdisc_rtab_list;
struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
@@ -308,6 +399,8 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *ta
rtab->rate = *r;
rtab->refcnt = 1;
memcpy(rtab->data, nla_data(tab), 1024);
+ if (r->linklayer == TC_LINKLAYER_UNAWARE)
+ r->linklayer = __detect_linklayer(r, rtab->data);
rtab->next = qdisc_rtab_list;
qdisc_rtab_list = rtab;
}
@@ -644,9 +737,11 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
const struct Qdisc_class_ops *cops;
unsigned long cl;
u32 parentid;
+ int drops;
if (n == 0)
return;
+ drops = max_t(int, n, 0);
while ((parentid = sch->parent)) {
if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
return;
@@ -663,6 +758,7 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
cops->put(sch, cl);
}
sch->q.qlen -= n;
+ sch->qstats.drops += drops;
}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
@@ -1813,6 +1909,7 @@ static int __init pktsched_init(void)
return err;
}
+ register_qdisc(&pfifo_fast_ops);
register_qdisc(&pfifo_qdisc_ops);
register_qdisc(&bfifo_qdisc_ops);
register_qdisc(&pfifo_head_drop_qdisc_ops);
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index ca8e0a5..1f9c314 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -605,6 +605,7 @@ static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl,
struct sockaddr_atmpvc pvc;
int state;
+ memset(&pvc, 0, sizeof(pvc));
pvc.sap_family = AF_ATMPVC;
pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1;
pvc.sap_addr.vpi = flow->vcc->vpi;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 1bc210f..7a42c81 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -130,7 +130,7 @@ struct cbq_class {
psched_time_t penalized;
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est rate_est;
+ struct gnet_stats_rate_est64 rate_est;
struct tc_cbq_xstats xstats;
struct tcf_proto *filter_list;
@@ -1465,6 +1465,7 @@ static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
unsigned char *b = skb_tail_pointer(skb);
struct tc_cbq_wrropt opt;
+ memset(&opt, 0, sizeof(opt));
opt.flags = 0;
opt.allot = cl->allot;
opt.priority = cl->priority + 1;
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index ef53ab8..ddd73cb 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -438,7 +438,8 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
if (mask != q->tab_mask) {
struct sk_buff **ntab;
- ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
+ ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
+ GFP_KERNEL | __GFP_NOWARN);
if (!ntab)
ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
if (!ntab)
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index fb09e88..95cb9e7 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -37,7 +37,7 @@ struct drr_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est rate_est;
+ struct gnet_stats_rate_est64 rate_est;
struct list_head alist;
struct Qdisc *qdisc;
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
new file mode 100644
index 0000000..95d8439
--- /dev/null
+++ b/net/sched/sch_fq.c
@@ -0,0 +1,822 @@
+/*
+ * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
+ *
+ * Copyright (C) 2013 Eric Dumazet <edumazet@google.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Meant to be mostly used for localy generated traffic :
+ * Fast classification depends on skb->sk being set before reaching us.
+ * If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
+ * All packets belonging to a socket are considered as a 'flow'.
+ *
+ * Flows are dynamically allocated and stored in a hash table of RB trees
+ * They are also part of one Round Robin 'queues' (new or old flows)
+ *
+ * Burst avoidance (aka pacing) capability :
+ *
+ * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
+ * bunch of packets, and this packet scheduler adds delay between
+ * packets to respect rate limitation.
+ *
+ * enqueue() :
+ * - lookup one RB tree (out of 1024 or more) to find the flow.
+ * If non existent flow, create it, add it to the tree.
+ * Add skb to the per flow list of skb (fifo).
+ * - Use a special fifo for high prio packets
+ *
+ * dequeue() : serves flows in Round Robin
+ * Note : When a flow becomes empty, we do not immediately remove it from
+ * rb trees, for performance reasons (its expected to send additional packets,
+ * or SLAB cache will reuse socket for another flow)
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/hash.h>
+#include <linux/prefetch.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+/*
+ * Per flow structure, dynamically allocated
+ */
+struct fq_flow {
+ struct sk_buff *head; /* list of skbs for this flow : first skb */
+ union {
+ struct sk_buff *tail; /* last skb in the list */
+ unsigned long age; /* jiffies when flow was emptied, for gc */
+ };
+ struct rb_node fq_node; /* anchor in fq_root[] trees */
+ struct sock *sk;
+ int qlen; /* number of packets in flow queue */
+ int credit;
+ u32 socket_hash; /* sk_hash */
+ struct fq_flow *next; /* next pointer in RR lists, or &detached */
+
+ struct rb_node rate_node; /* anchor in q->delayed tree */
+ u64 time_next_packet;
+};
+
+struct fq_flow_head {
+ struct fq_flow *first;
+ struct fq_flow *last;
+};
+
+struct fq_sched_data {
+ struct fq_flow_head new_flows;
+
+ struct fq_flow_head old_flows;
+
+ struct rb_root delayed; /* for rate limited flows */
+ u64 time_next_delayed_flow;
+
+ struct fq_flow internal; /* for non classified or high prio packets */
+ u32 quantum;
+ u32 initial_quantum;
+ u32 flow_refill_delay;
+ u32 flow_max_rate; /* optional max rate per flow */
+ u32 flow_plimit; /* max packets per flow */
+ struct rb_root *fq_root;
+ u8 rate_enable;
+ u8 fq_trees_log;
+
+ u32 flows;
+ u32 inactive_flows;
+ u32 throttled_flows;
+
+ u64 stat_gc_flows;
+ u64 stat_internal_packets;
+ u64 stat_tcp_retrans;
+ u64 stat_throttled;
+ u64 stat_flows_plimit;
+ u64 stat_pkts_too_long;
+ u64 stat_allocation_errors;
+ struct qdisc_watchdog watchdog;
+};
+
+/* special value to mark a detached flow (not on old/new list) */
+static struct fq_flow detached, throttled;
+
+static void fq_flow_set_detached(struct fq_flow *f)
+{
+ f->next = &detached;
+ f->age = jiffies;
+}
+
+static bool fq_flow_is_detached(const struct fq_flow *f)
+{
+ return f->next == &detached;
+}
+
+static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
+{
+ struct rb_node **p = &q->delayed.rb_node, *parent = NULL;
+
+ while (*p) {
+ struct fq_flow *aux;
+
+ parent = *p;
+ aux = container_of(parent, struct fq_flow, rate_node);
+ if (f->time_next_packet >= aux->time_next_packet)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&f->rate_node, parent, p);
+ rb_insert_color(&f->rate_node, &q->delayed);
+ q->throttled_flows++;
+ q->stat_throttled++;
+
+ f->next = &throttled;
+ if (q->time_next_delayed_flow > f->time_next_packet)
+ q->time_next_delayed_flow = f->time_next_packet;
+}
+
+
+static struct kmem_cache *fq_flow_cachep __read_mostly;
+
+static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
+{
+ if (head->first)
+ head->last->next = flow;
+ else
+ head->first = flow;
+ head->last = flow;
+ flow->next = NULL;
+}
+
+/* limit number of collected flows per round */
+#define FQ_GC_MAX 8
+#define FQ_GC_AGE (3*HZ)
+
+static bool fq_gc_candidate(const struct fq_flow *f)
+{
+ return fq_flow_is_detached(f) &&
+ time_after(jiffies, f->age + FQ_GC_AGE);
+}
+
+static void fq_gc(struct fq_sched_data *q,
+ struct rb_root *root,
+ struct sock *sk)
+{
+ struct fq_flow *f, *tofree[FQ_GC_MAX];
+ struct rb_node **p, *parent;
+ int fcnt = 0;
+
+ p = &root->rb_node;
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+
+ f = container_of(parent, struct fq_flow, fq_node);
+ if (f->sk == sk)
+ break;
+
+ if (fq_gc_candidate(f)) {
+ tofree[fcnt++] = f;
+ if (fcnt == FQ_GC_MAX)
+ break;
+ }
+
+ if (f->sk > sk)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+
+ q->flows -= fcnt;
+ q->inactive_flows -= fcnt;
+ q->stat_gc_flows += fcnt;
+ while (fcnt) {
+ struct fq_flow *f = tofree[--fcnt];
+
+ rb_erase(&f->fq_node, root);
+ kmem_cache_free(fq_flow_cachep, f);
+ }
+}
+
+static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
+{
+ struct rb_node **p, *parent;
+ struct sock *sk = skb->sk;
+ struct rb_root *root;
+ struct fq_flow *f;
+
+ /* warning: no starvation prevention... */
+ if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
+ return &q->internal;
+
+ if (unlikely(!sk)) {
+ /* By forcing low order bit to 1, we make sure to not
+ * collide with a local flow (socket pointers are word aligned)
+ */
+ sk = (struct sock *)(skb_get_rxhash(skb) | 1L);
+ }
+
+ root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];
+
+ if (q->flows >= (2U << q->fq_trees_log) &&
+ q->inactive_flows > q->flows/2)
+ fq_gc(q, root, sk);
+
+ p = &root->rb_node;
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+
+ f = container_of(parent, struct fq_flow, fq_node);
+ if (f->sk == sk) {
+ /* socket might have been reallocated, so check
+ * if its sk_hash is the same.
+ * It not, we need to refill credit with
+ * initial quantum
+ */
+ if (unlikely(skb->sk &&
+ f->socket_hash != sk->sk_hash)) {
+ f->credit = q->initial_quantum;
+ f->socket_hash = sk->sk_hash;
+ f->time_next_packet = 0ULL;
+ }
+ return f;
+ }
+ if (f->sk > sk)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+
+ f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
+ if (unlikely(!f)) {
+ q->stat_allocation_errors++;
+ return &q->internal;
+ }
+ fq_flow_set_detached(f);
+ f->sk = sk;
+ if (skb->sk)
+ f->socket_hash = sk->sk_hash;
+ f->credit = q->initial_quantum;
+
+ rb_link_node(&f->fq_node, parent, p);
+ rb_insert_color(&f->fq_node, root);
+
+ q->flows++;
+ q->inactive_flows++;
+ return f;
+}
+
+
+/* remove one skb from head of flow queue */
+static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
+{
+ struct sk_buff *skb = flow->head;
+
+ if (skb) {
+ flow->head = skb->next;
+ skb->next = NULL;
+ flow->qlen--;
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ sch->q.qlen--;
+ }
+ return skb;
+}
+
+/* We might add in the future detection of retransmits
+ * For the time being, just return false
+ */
+static bool skb_is_retransmit(struct sk_buff *skb)
+{
+ return false;
+}
+
+/* add skb to flow queue
+ * flow queue is a linked list, kind of FIFO, except for TCP retransmits
+ * We special case tcp retransmits to be transmitted before other packets.
+ * We rely on fact that TCP retransmits are unlikely, so we do not waste
+ * a separate queue or a pointer.
+ * head-> [retrans pkt 1]
+ * [retrans pkt 2]
+ * [ normal pkt 1]
+ * [ normal pkt 2]
+ * [ normal pkt 3]
+ * tail-> [ normal pkt 4]
+ */
+static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
+{
+ struct sk_buff *prev, *head = flow->head;
+
+ skb->next = NULL;
+ if (!head) {
+ flow->head = skb;
+ flow->tail = skb;
+ return;
+ }
+ if (likely(!skb_is_retransmit(skb))) {
+ flow->tail->next = skb;
+ flow->tail = skb;
+ return;
+ }
+
+ /* This skb is a tcp retransmit,
+ * find the last retrans packet in the queue
+ */
+ prev = NULL;
+ while (skb_is_retransmit(head)) {
+ prev = head;
+ head = head->next;
+ if (!head)
+ break;
+ }
+ if (!prev) { /* no rtx packet in queue, become the new head */
+ skb->next = flow->head;
+ flow->head = skb;
+ } else {
+ if (prev == flow->tail)
+ flow->tail = skb;
+ else
+ skb->next = prev->next;
+ prev->next = skb;
+ }
+}
+
+static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct fq_flow *f;
+
+ if (unlikely(sch->q.qlen >= sch->limit))
+ return qdisc_drop(skb, sch);
+
+ f = fq_classify(skb, q);
+ if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
+ q->stat_flows_plimit++;
+ return qdisc_drop(skb, sch);
+ }
+
+ f->qlen++;
+ if (skb_is_retransmit(skb))
+ q->stat_tcp_retrans++;
+ sch->qstats.backlog += qdisc_pkt_len(skb);
+ if (fq_flow_is_detached(f)) {
+ fq_flow_add_tail(&q->new_flows, f);
+ if (time_after(jiffies, f->age + q->flow_refill_delay))
+ f->credit = max_t(u32, f->credit, q->quantum);
+ q->inactive_flows--;
+ qdisc_unthrottled(sch);
+ }
+
+ /* Note: this overwrites f->age */
+ flow_queue_add(f, skb);
+
+ if (unlikely(f == &q->internal)) {
+ q->stat_internal_packets++;
+ qdisc_unthrottled(sch);
+ }
+ sch->q.qlen++;
+
+ return NET_XMIT_SUCCESS;
+}
+
+static void fq_check_throttled(struct fq_sched_data *q, u64 now)
+{
+ struct rb_node *p;
+
+ if (q->time_next_delayed_flow > now)
+ return;
+
+ q->time_next_delayed_flow = ~0ULL;
+ while ((p = rb_first(&q->delayed)) != NULL) {
+ struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
+
+ if (f->time_next_packet > now) {
+ q->time_next_delayed_flow = f->time_next_packet;
+ break;
+ }
+ rb_erase(p, &q->delayed);
+ q->throttled_flows--;
+ fq_flow_add_tail(&q->old_flows, f);
+ }
+}
+
+static struct sk_buff *fq_dequeue(struct Qdisc *sch)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ u64 now = ktime_to_ns(ktime_get());
+ struct fq_flow_head *head;
+ struct sk_buff *skb;
+ struct fq_flow *f;
+ u32 rate;
+
+ skb = fq_dequeue_head(sch, &q->internal);
+ if (skb)
+ goto out;
+ fq_check_throttled(q, now);
+begin:
+ head = &q->new_flows;
+ if (!head->first) {
+ head = &q->old_flows;
+ if (!head->first) {
+ if (q->time_next_delayed_flow != ~0ULL)
+ qdisc_watchdog_schedule_ns(&q->watchdog,
+ q->time_next_delayed_flow);
+ return NULL;
+ }
+ }
+ f = head->first;
+
+ if (f->credit <= 0) {
+ f->credit += q->quantum;
+ head->first = f->next;
+ fq_flow_add_tail(&q->old_flows, f);
+ goto begin;
+ }
+
+ if (unlikely(f->head && now < f->time_next_packet)) {
+ head->first = f->next;
+ fq_flow_set_throttled(q, f);
+ goto begin;
+ }
+
+ skb = fq_dequeue_head(sch, f);
+ if (!skb) {
+ head->first = f->next;
+ /* force a pass through old_flows to prevent starvation */
+ if ((head == &q->new_flows) && q->old_flows.first) {
+ fq_flow_add_tail(&q->old_flows, f);
+ } else {
+ fq_flow_set_detached(f);
+ q->inactive_flows++;
+ }
+ goto begin;
+ }
+ prefetch(&skb->end);
+ f->time_next_packet = now;
+ f->credit -= qdisc_pkt_len(skb);
+
+ if (f->credit > 0 || !q->rate_enable)
+ goto out;
+
+ rate = q->flow_max_rate;
+ if (skb->sk && skb->sk->sk_state != TCP_TIME_WAIT)
+ rate = min(skb->sk->sk_pacing_rate, rate);
+
+ if (rate != ~0U) {
+ u32 plen = max(qdisc_pkt_len(skb), q->quantum);
+ u64 len = (u64)plen * NSEC_PER_SEC;
+
+ if (likely(rate))
+ do_div(len, rate);
+ /* Since socket rate can change later,
+ * clamp the delay to 125 ms.
+ * TODO: maybe segment the too big skb, as in commit
+ * e43ac79a4bc ("sch_tbf: segment too big GSO packets")
+ */
+ if (unlikely(len > 125 * NSEC_PER_MSEC)) {
+ len = 125 * NSEC_PER_MSEC;
+ q->stat_pkts_too_long++;
+ }
+
+ f->time_next_packet = now + len;
+ }
+out:
+ qdisc_bstats_update(sch, skb);
+ qdisc_unthrottled(sch);
+ return skb;
+}
+
+static void fq_reset(struct Qdisc *sch)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct rb_root *root;
+ struct sk_buff *skb;
+ struct rb_node *p;
+ struct fq_flow *f;
+ unsigned int idx;
+
+ while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL)
+ kfree_skb(skb);
+
+ if (!q->fq_root)
+ return;
+
+ for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
+ root = &q->fq_root[idx];
+ while ((p = rb_first(root)) != NULL) {
+ f = container_of(p, struct fq_flow, fq_node);
+ rb_erase(p, root);
+
+ while ((skb = fq_dequeue_head(sch, f)) != NULL)
+ kfree_skb(skb);
+
+ kmem_cache_free(fq_flow_cachep, f);
+ }
+ }
+ q->new_flows.first = NULL;
+ q->old_flows.first = NULL;
+ q->delayed = RB_ROOT;
+ q->flows = 0;
+ q->inactive_flows = 0;
+ q->throttled_flows = 0;
+}
+
+static void fq_rehash(struct fq_sched_data *q,
+ struct rb_root *old_array, u32 old_log,
+ struct rb_root *new_array, u32 new_log)
+{
+ struct rb_node *op, **np, *parent;
+ struct rb_root *oroot, *nroot;
+ struct fq_flow *of, *nf;
+ int fcnt = 0;
+ u32 idx;
+
+ for (idx = 0; idx < (1U << old_log); idx++) {
+ oroot = &old_array[idx];
+ while ((op = rb_first(oroot)) != NULL) {
+ rb_erase(op, oroot);
+ of = container_of(op, struct fq_flow, fq_node);
+ if (fq_gc_candidate(of)) {
+ fcnt++;
+ kmem_cache_free(fq_flow_cachep, of);
+ continue;
+ }
+ nroot = &new_array[hash_32((u32)(long)of->sk, new_log)];
+
+ np = &nroot->rb_node;
+ parent = NULL;
+ while (*np) {
+ parent = *np;
+
+ nf = container_of(parent, struct fq_flow, fq_node);
+ BUG_ON(nf->sk == of->sk);
+
+ if (nf->sk > of->sk)
+ np = &parent->rb_right;
+ else
+ np = &parent->rb_left;
+ }
+
+ rb_link_node(&of->fq_node, parent, np);
+ rb_insert_color(&of->fq_node, nroot);
+ }
+ }
+ q->flows -= fcnt;
+ q->inactive_flows -= fcnt;
+ q->stat_gc_flows += fcnt;
+}
+
+static int fq_resize(struct fq_sched_data *q, u32 log)
+{
+ struct rb_root *array;
+ u32 idx;
+
+ if (q->fq_root && log == q->fq_trees_log)
+ return 0;
+
+ array = kmalloc(sizeof(struct rb_root) << log, GFP_KERNEL);
+ if (!array)
+ return -ENOMEM;
+
+ for (idx = 0; idx < (1U << log); idx++)
+ array[idx] = RB_ROOT;
+
+ if (q->fq_root) {
+ fq_rehash(q, q->fq_root, q->fq_trees_log, array, log);
+ kfree(q->fq_root);
+ }
+ q->fq_root = array;
+ q->fq_trees_log = log;
+
+ return 0;
+}
+
+static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
+ [TCA_FQ_PLIMIT] = { .type = NLA_U32 },
+ [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 },
+ [TCA_FQ_QUANTUM] = { .type = NLA_U32 },
+ [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 },
+ [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 },
+ [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 },
+ [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
+ [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
+ [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 },
+};
+
+static int fq_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_FQ_MAX + 1];
+ int err, drop_count = 0;
+ u32 fq_log;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy);
+ if (err < 0)
+ return err;
+
+ sch_tree_lock(sch);
+
+ fq_log = q->fq_trees_log;
+
+ if (tb[TCA_FQ_BUCKETS_LOG]) {
+ u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
+
+ if (nval >= 1 && nval <= ilog2(256*1024))
+ fq_log = nval;
+ else
+ err = -EINVAL;
+ }
+ if (tb[TCA_FQ_PLIMIT])
+ sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
+
+ if (tb[TCA_FQ_FLOW_PLIMIT])
+ q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
+
+ if (tb[TCA_FQ_QUANTUM])
+ q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
+
+ if (tb[TCA_FQ_INITIAL_QUANTUM])
+ q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
+
+ if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
+ pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
+ nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
+
+ if (tb[TCA_FQ_FLOW_MAX_RATE])
+ q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
+
+ if (tb[TCA_FQ_RATE_ENABLE]) {
+ u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
+
+ if (enable <= 1)
+ q->rate_enable = enable;
+ else
+ err = -EINVAL;
+ }
+
+ if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
+ u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ;
+
+ q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
+ }
+
+ if (!err)
+ err = fq_resize(q, fq_log);
+
+ while (sch->q.qlen > sch->limit) {
+ struct sk_buff *skb = fq_dequeue(sch);
+
+ if (!skb)
+ break;
+ kfree_skb(skb);
+ drop_count++;
+ }
+ qdisc_tree_decrease_qlen(sch, drop_count);
+
+ sch_tree_unlock(sch);
+ return err;
+}
+
+static void fq_destroy(struct Qdisc *sch)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+
+ fq_reset(sch);
+ kfree(q->fq_root);
+ qdisc_watchdog_cancel(&q->watchdog);
+}
+
+static int fq_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ int err;
+
+ sch->limit = 10000;
+ q->flow_plimit = 100;
+ q->quantum = 2 * psched_mtu(qdisc_dev(sch));
+ q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
+ q->flow_refill_delay = msecs_to_jiffies(40);
+ q->flow_max_rate = ~0U;
+ q->rate_enable = 1;
+ q->new_flows.first = NULL;
+ q->old_flows.first = NULL;
+ q->delayed = RB_ROOT;
+ q->fq_root = NULL;
+ q->fq_trees_log = ilog2(1024);
+ qdisc_watchdog_init(&q->watchdog, sch);
+
+ if (opt)
+ err = fq_change(sch, opt);
+ else
+ err = fq_resize(q, q->fq_trees_log);
+
+ return err;
+}
+
+static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
+
+ if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
+ nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
+ nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
+ nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
+ nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
+ nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
+ nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
+ jiffies_to_usecs(q->flow_refill_delay)) ||
+ nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, opts);
+ return skb->len;
+
+nla_put_failure:
+ return -1;
+}
+
+static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ u64 now = ktime_to_ns(ktime_get());
+ struct tc_fq_qd_stats st = {
+ .gc_flows = q->stat_gc_flows,
+ .highprio_packets = q->stat_internal_packets,
+ .tcp_retrans = q->stat_tcp_retrans,
+ .throttled = q->stat_throttled,
+ .flows_plimit = q->stat_flows_plimit,
+ .pkts_too_long = q->stat_pkts_too_long,
+ .allocation_errors = q->stat_allocation_errors,
+ .flows = q->flows,
+ .inactive_flows = q->inactive_flows,
+ .throttled_flows = q->throttled_flows,
+ .time_next_delayed_flow = q->time_next_delayed_flow - now,
+ };
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
+ .id = "fq",
+ .priv_size = sizeof(struct fq_sched_data),
+
+ .enqueue = fq_enqueue,
+ .dequeue = fq_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = fq_init,
+ .reset = fq_reset,
+ .destroy = fq_destroy,
+ .change = fq_change,
+ .dump = fq_dump,
+ .dump_stats = fq_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init fq_module_init(void)
+{
+ int ret;
+
+ fq_flow_cachep = kmem_cache_create("fq_flow_cache",
+ sizeof(struct fq_flow),
+ 0, 0, NULL);
+ if (!fq_flow_cachep)
+ return -ENOMEM;
+
+ ret = register_qdisc(&fq_qdisc_ops);
+ if (ret)
+ kmem_cache_destroy(fq_flow_cachep);
+ return ret;
+}
+
+static void __exit fq_module_exit(void)
+{
+ unregister_qdisc(&fq_qdisc_ops);
+ kmem_cache_destroy(fq_flow_cachep);
+}
+
+module_init(fq_module_init)
+module_exit(fq_module_exit)
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index cc54c80..819c7a3 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -25,10 +25,15 @@
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/slab.h>
+#include <linux/if_vlan.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
+/* Qdisc to use by default */
+const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
+EXPORT_SYMBOL(default_qdisc_ops);
+
/* Main transmission queue. */
/* Modifications to data participating in scheduling must be protected with
@@ -121,7 +126,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
HARD_TX_LOCK(dev, txq, smp_processor_id());
if (!netif_xmit_frozen_or_stopped(txq))
- ret = dev_hard_start_xmit(skb, dev, txq);
+ ret = dev_hard_start_xmit(skb, dev, txq, NULL);
HARD_TX_UNLOCK(dev, txq);
@@ -207,15 +212,19 @@ void __qdisc_run(struct Qdisc *q)
unsigned long dev_trans_start(struct net_device *dev)
{
- unsigned long val, res = dev->trans_start;
+ unsigned long val, res;
unsigned int i;
+ if (is_vlan_dev(dev))
+ dev = vlan_dev_real_dev(dev);
+ res = dev->trans_start;
for (i = 0; i < dev->num_tx_queues; i++) {
val = netdev_get_tx_queue(dev, i)->trans_start;
if (val && time_after(val, res))
res = val;
}
dev->trans_start = res;
+
return res;
}
EXPORT_SYMBOL(dev_trans_start);
@@ -540,12 +549,11 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
.dump = pfifo_fast_dump,
.owner = THIS_MODULE,
};
-EXPORT_SYMBOL(pfifo_fast_ops);
static struct lock_class_key qdisc_tx_busylock;
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
- struct Qdisc_ops *ops)
+ const struct Qdisc_ops *ops)
{
void *p;
struct Qdisc *sch;
@@ -589,10 +597,14 @@ errout:
}
struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
- struct Qdisc_ops *ops, unsigned int parentid)
+ const struct Qdisc_ops *ops,
+ unsigned int parentid)
{
struct Qdisc *sch;
+ if (!try_module_get(ops->owner))
+ goto errout;
+
sch = qdisc_alloc(dev_queue, ops);
if (IS_ERR(sch))
goto errout;
@@ -696,7 +708,7 @@ static void attach_one_default_qdisc(struct net_device *dev,
if (dev->tx_queue_len) {
qdisc = qdisc_create_dflt(dev_queue,
- &pfifo_fast_ops, TC_H_ROOT);
+ default_qdisc_ops, TC_H_ROOT);
if (!qdisc) {
netdev_info(dev, "activation failed\n");
return;
@@ -749,9 +761,8 @@ void dev_activate(struct net_device *dev)
int need_watchdog;
/* No queueing discipline is attached to device;
- create default one i.e. pfifo_fast for devices,
- which need queueing and noqueue_qdisc for
- virtual interfaces
+ * create default one for devices, which need queueing
+ * and noqueue_qdisc for virtual interfaces
*/
if (dev->qdisc == &noop_qdisc)
@@ -833,7 +844,7 @@ void dev_deactivate_many(struct list_head *head)
struct net_device *dev;
bool sync_needed = false;
- list_for_each_entry(dev, head, unreg_list) {
+ list_for_each_entry(dev, head, close_list) {
netdev_for_each_tx_queue(dev, dev_deactivate_queue,
&noop_qdisc);
if (dev_ingress_queue(dev))
@@ -852,7 +863,7 @@ void dev_deactivate_many(struct list_head *head)
synchronize_net();
/* Wait for outstanding qdisc_run calls. */
- list_for_each_entry(dev, head, unreg_list)
+ list_for_each_entry(dev, head, close_list)
while (some_qdisc_is_busy(dev))
yield();
}
@@ -861,7 +872,7 @@ void dev_deactivate(struct net_device *dev)
{
LIST_HEAD(single);
- list_add(&dev->unreg_list, &single);
+ list_add(&dev->close_list, &single);
dev_deactivate_many(&single);
list_del(&single);
}
@@ -914,39 +925,37 @@ void dev_shutdown(struct net_device *dev)
}
void psched_ratecfg_precompute(struct psched_ratecfg *r,
- const struct tc_ratespec *conf)
+ const struct tc_ratespec *conf,
+ u64 rate64)
{
- u64 factor;
- u64 mult;
- int shift;
-
memset(r, 0, sizeof(*r));
r->overhead = conf->overhead;
- r->rate_bps = (u64)conf->rate << 3;
+ r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
+ r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
r->mult = 1;
/*
- * Calibrate mult, shift so that token counting is accurate
- * for smallest packet size (64 bytes). Token (time in ns) is
- * computed as (bytes * 8) * NSEC_PER_SEC / rate_bps. It will
- * work as long as the smallest packet transfer time can be
- * accurately represented in nanosec.
+ * The deal here is to replace a divide by a reciprocal one
+ * in fast path (a reciprocal divide is a multiply and a shift)
+ *
+ * Normal formula would be :
+ * time_in_ns = (NSEC_PER_SEC * len) / rate_bps
+ *
+ * We compute mult/shift to use instead :
+ * time_in_ns = (len * mult) >> shift;
+ *
+ * We try to get the highest possible mult value for accuracy,
+ * but have to make sure no overflows will ever happen.
*/
- if (r->rate_bps > 0) {
- /*
- * Higher shift gives better accuracy. Find the largest
- * shift such that mult fits in 32 bits.
- */
- for (shift = 0; shift < 16; shift++) {
- r->shift = shift;
- factor = 8LLU * NSEC_PER_SEC * (1 << r->shift);
- mult = div64_u64(factor, r->rate_bps);
- if (mult > UINT_MAX)
+ if (r->rate_bytes_ps > 0) {
+ u64 factor = NSEC_PER_SEC;
+
+ for (;;) {
+ r->mult = div64_u64(factor, r->rate_bytes_ps);
+ if (r->mult & (1U << 31) || factor & (1ULL << 63))
break;
+ factor <<= 1;
+ r->shift++;
}
-
- r->shift = shift - 1;
- factor = 8LLU * NSEC_PER_SEC * (1 << r->shift);
- r->mult = div64_u64(factor, r->rate_bps);
}
}
EXPORT_SYMBOL(psched_ratecfg_precompute);
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 9facea0..c407561 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -114,7 +114,7 @@ struct hfsc_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est rate_est;
+ struct gnet_stats_rate_est64 rate_est;
unsigned int level; /* class level in hierarchy */
struct tcf_proto *filter_list; /* filter list */
unsigned int filter_cnt; /* filter count */
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index adaedd7..0e1e38b 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -65,6 +65,10 @@ static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis f
module_param (htb_hysteresis, int, 0640);
MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
+static int htb_rate_est = 0; /* htb classes have a default rate estimator */
+module_param(htb_rate_est, int, 0640);
+MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
+
/* used internaly to keep status of single class */
enum htb_cmode {
HTB_CANT_SEND, /* class can't send and can't borrow */
@@ -72,95 +76,105 @@ enum htb_cmode {
HTB_CAN_SEND /* class can send */
};
-/* interior & leaf nodes; props specific to leaves are marked L: */
+struct htb_prio {
+ union {
+ struct rb_root row;
+ struct rb_root feed;
+ };
+ struct rb_node *ptr;
+ /* When class changes from state 1->2 and disconnects from
+ * parent's feed then we lost ptr value and start from the
+ * first child again. Here we store classid of the
+ * last valid ptr (used when ptr is NULL).
+ */
+ u32 last_ptr_id;
+};
+
+/* interior & leaf nodes; props specific to leaves are marked L:
+ * To reduce false sharing, place mostly read fields at beginning,
+ * and mostly written ones at the end.
+ */
struct htb_class {
struct Qdisc_class_common common;
- /* general class parameters */
- struct gnet_stats_basic_packed bstats;
- struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est rate_est;
- struct tc_htb_xstats xstats; /* our special stats */
- int refcnt; /* usage count of this class */
+ struct psched_ratecfg rate;
+ struct psched_ratecfg ceil;
+ s64 buffer, cbuffer;/* token bucket depth/rate */
+ s64 mbuffer; /* max wait time */
+ u32 prio; /* these two are used only by leaves... */
+ int quantum; /* but stored for parent-to-leaf return */
+
+ struct tcf_proto *filter_list; /* class attached filters */
+ int filter_cnt;
+ int refcnt; /* usage count of this class */
+
+ int level; /* our level (see above) */
+ unsigned int children;
+ struct htb_class *parent; /* parent class */
+
+ struct gnet_stats_rate_est64 rate_est;
- /* topology */
- int level; /* our level (see above) */
- unsigned int children;
- struct htb_class *parent; /* parent class */
+ /*
+ * Written often fields
+ */
+ struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_queue qstats;
+ struct tc_htb_xstats xstats; /* our special stats */
- int prio; /* these two are used only by leaves... */
- int quantum; /* but stored for parent-to-leaf return */
+ /* token bucket parameters */
+ s64 tokens, ctokens;/* current number of tokens */
+ s64 t_c; /* checkpoint time */
union {
struct htb_class_leaf {
- struct Qdisc *q;
- int deficit[TC_HTB_MAXDEPTH];
struct list_head drop_list;
+ int deficit[TC_HTB_MAXDEPTH];
+ struct Qdisc *q;
} leaf;
struct htb_class_inner {
- struct rb_root feed[TC_HTB_NUMPRIO]; /* feed trees */
- struct rb_node *ptr[TC_HTB_NUMPRIO]; /* current class ptr */
- /* When class changes from state 1->2 and disconnects from
- * parent's feed then we lost ptr value and start from the
- * first child again. Here we store classid of the
- * last valid ptr (used when ptr is NULL).
- */
- u32 last_ptr_id[TC_HTB_NUMPRIO];
+ struct htb_prio clprio[TC_HTB_NUMPRIO];
} inner;
} un;
- struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */
- struct rb_node pq_node; /* node for event queue */
- s64 pq_key;
+ s64 pq_key;
- int prio_activity; /* for which prios are we active */
- enum htb_cmode cmode; /* current mode of the class */
-
- /* class attached filters */
- struct tcf_proto *filter_list;
- int filter_cnt;
+ int prio_activity; /* for which prios are we active */
+ enum htb_cmode cmode; /* current mode of the class */
+ struct rb_node pq_node; /* node for event queue */
+ struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */
+};
- /* token bucket parameters */
- struct psched_ratecfg rate;
- struct psched_ratecfg ceil;
- s64 buffer, cbuffer; /* token bucket depth/rate */
- s64 mbuffer; /* max wait time */
- s64 tokens, ctokens; /* current number of tokens */
- s64 t_c; /* checkpoint time */
+struct htb_level {
+ struct rb_root wait_pq;
+ struct htb_prio hprio[TC_HTB_NUMPRIO];
};
struct htb_sched {
struct Qdisc_class_hash clhash;
- struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
+ int defcls; /* class where unclassified flows go to */
+ int rate2quantum; /* quant = rate / rate2quantum */
- /* self list - roots of self generating tree */
- struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
- int row_mask[TC_HTB_MAXDEPTH];
- struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
- u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
-
- /* self wait list - roots of wait PQs per row */
- struct rb_root wait_pq[TC_HTB_MAXDEPTH];
+ /* filters for qdisc itself */
+ struct tcf_proto *filter_list;
- /* time of nearest event per level (row) */
- s64 near_ev_cache[TC_HTB_MAXDEPTH];
+#define HTB_WARN_TOOMANYEVENTS 0x1
+ unsigned int warned; /* only one warning */
+ int direct_qlen;
+ struct work_struct work;
- int defcls; /* class where unclassified flows go to */
+ /* non shaped skbs; let them go directly thru */
+ struct sk_buff_head direct_queue;
+ long direct_pkts;
- /* filters for qdisc itself */
- struct tcf_proto *filter_list;
+ struct qdisc_watchdog watchdog;
- int rate2quantum; /* quant = rate / rate2quantum */
- s64 now; /* cached dequeue time */
- struct qdisc_watchdog watchdog;
+ s64 now; /* cached dequeue time */
+ struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
- /* non shaped skbs; let them go directly thru */
- struct sk_buff_head direct_queue;
- int direct_qlen; /* max qlen of above */
+ /* time of nearest event per level (row) */
+ s64 near_ev_cache[TC_HTB_MAXDEPTH];
- long direct_pkts;
+ int row_mask[TC_HTB_MAXDEPTH];
-#define HTB_WARN_TOOMANYEVENTS 0x1
- unsigned int warned; /* only one warning */
- struct work_struct work;
+ struct htb_level hlevel[TC_HTB_MAXDEPTH];
};
/* find class in global hash table using given handle */
@@ -276,7 +290,7 @@ static void htb_add_to_id_tree(struct rb_root *root,
static void htb_add_to_wait_tree(struct htb_sched *q,
struct htb_class *cl, s64 delay)
{
- struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
+ struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
cl->pq_key = q->now + delay;
if (cl->pq_key == q->now)
@@ -296,7 +310,7 @@ static void htb_add_to_wait_tree(struct htb_sched *q,
p = &parent->rb_left;
}
rb_link_node(&cl->pq_node, parent, p);
- rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]);
+ rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
}
/**
@@ -323,7 +337,7 @@ static inline void htb_add_class_to_row(struct htb_sched *q,
while (mask) {
int prio = ffz(~mask);
mask &= ~(1 << prio);
- htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio);
+ htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
}
}
@@ -349,16 +363,18 @@ static inline void htb_remove_class_from_row(struct htb_sched *q,
struct htb_class *cl, int mask)
{
int m = 0;
+ struct htb_level *hlevel = &q->hlevel[cl->level];
while (mask) {
int prio = ffz(~mask);
+ struct htb_prio *hprio = &hlevel->hprio[prio];
mask &= ~(1 << prio);
- if (q->ptr[cl->level][prio] == cl->node + prio)
- htb_next_rb_node(q->ptr[cl->level] + prio);
+ if (hprio->ptr == cl->node + prio)
+ htb_next_rb_node(&hprio->ptr);
- htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio);
- if (!q->row[cl->level][prio].rb_node)
+ htb_safe_rb_erase(cl->node + prio, &hprio->row);
+ if (!hprio->row.rb_node)
m |= 1 << prio;
}
q->row_mask[cl->level] &= ~m;
@@ -382,13 +398,13 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
int prio = ffz(~m);
m &= ~(1 << prio);
- if (p->un.inner.feed[prio].rb_node)
+ if (p->un.inner.clprio[prio].feed.rb_node)
/* parent already has its feed in use so that
* reset bit in mask as parent is already ok
*/
mask &= ~(1 << prio);
- htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);
+ htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio);
}
p->prio_activity |= mask;
cl = p;
@@ -418,18 +434,19 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
int prio = ffz(~m);
m &= ~(1 << prio);
- if (p->un.inner.ptr[prio] == cl->node + prio) {
+ if (p->un.inner.clprio[prio].ptr == cl->node + prio) {
/* we are removing child which is pointed to from
* parent feed - forget the pointer but remember
* classid
*/
- p->un.inner.last_ptr_id[prio] = cl->common.classid;
- p->un.inner.ptr[prio] = NULL;
+ p->un.inner.clprio[prio].last_ptr_id = cl->common.classid;
+ p->un.inner.clprio[prio].ptr = NULL;
}
- htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio);
+ htb_safe_rb_erase(cl->node + prio,
+ &p->un.inner.clprio[prio].feed);
- if (!p->un.inner.feed[prio].rb_node)
+ if (!p->un.inner.clprio[prio].feed.rb_node)
mask |= 1 << prio;
}
@@ -644,7 +661,7 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
htb_change_class_mode(q, cl, &diff);
if (old_mode != cl->cmode) {
if (old_mode != HTB_CAN_SEND)
- htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
+ htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
if (cl->cmode != HTB_CAN_SEND)
htb_add_to_wait_tree(q, cl, diff);
}
@@ -664,7 +681,7 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
* next pending event (0 for no event in pq, q->now for too many events).
* Note: Applied are events whose have cl->pq_key <= q->now.
*/
-static s64 htb_do_events(struct htb_sched *q, int level,
+static s64 htb_do_events(struct htb_sched *q, const int level,
unsigned long start)
{
/* don't run for longer than 2 jiffies; 2 is used instead of
@@ -672,10 +689,12 @@ static s64 htb_do_events(struct htb_sched *q, int level,
* too soon
*/
unsigned long stop_at = start + 2;
+ struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
+
while (time_before(jiffies, stop_at)) {
struct htb_class *cl;
s64 diff;
- struct rb_node *p = rb_first(&q->wait_pq[level]);
+ struct rb_node *p = rb_first(wait_pq);
if (!p)
return 0;
@@ -684,7 +703,7 @@ static s64 htb_do_events(struct htb_sched *q, int level,
if (cl->pq_key > q->now)
return cl->pq_key;
- htb_safe_rb_erase(p, q->wait_pq + level);
+ htb_safe_rb_erase(p, wait_pq);
diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
htb_change_class_mode(q, cl, &diff);
if (cl->cmode != HTB_CAN_SEND)
@@ -728,8 +747,7 @@ static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
*
* Find leaf where current feed pointers points to.
*/
-static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
- struct rb_node **pptr, u32 * pid)
+static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
{
int i;
struct {
@@ -738,10 +756,10 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
u32 *pid;
} stk[TC_HTB_MAXDEPTH], *sp = stk;
- BUG_ON(!tree->rb_node);
- sp->root = tree->rb_node;
- sp->pptr = pptr;
- sp->pid = pid;
+ BUG_ON(!hprio->row.rb_node);
+ sp->root = hprio->row.rb_node;
+ sp->pptr = &hprio->ptr;
+ sp->pid = &hprio->last_ptr_id;
for (i = 0; i < 65535; i++) {
if (!*sp->pptr && *sp->pid) {
@@ -768,12 +786,15 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
}
} else {
struct htb_class *cl;
+ struct htb_prio *clp;
+
cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
if (!cl->level)
return cl;
- (++sp)->root = cl->un.inner.feed[prio].rb_node;
- sp->pptr = cl->un.inner.ptr + prio;
- sp->pid = cl->un.inner.last_ptr_id + prio;
+ clp = &cl->un.inner.clprio[prio];
+ (++sp)->root = clp->feed.rb_node;
+ sp->pptr = &clp->ptr;
+ sp->pid = &clp->last_ptr_id;
}
}
WARN_ON(1);
@@ -783,15 +804,16 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
/* dequeues packet at given priority and level; call only if
* you are sure that there is active class at prio/level
*/
-static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
- int level)
+static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
+ const int level)
{
struct sk_buff *skb = NULL;
struct htb_class *cl, *start;
+ struct htb_level *hlevel = &q->hlevel[level];
+ struct htb_prio *hprio = &hlevel->hprio[prio];
+
/* look initial class up in the row */
- start = cl = htb_lookup_leaf(q->row[level] + prio, prio,
- q->ptr[level] + prio,
- q->last_ptr_id[level] + prio);
+ start = cl = htb_lookup_leaf(hprio, prio);
do {
next:
@@ -811,9 +833,7 @@ next:
if ((q->row_mask[level] & (1 << prio)) == 0)
return NULL;
- next = htb_lookup_leaf(q->row[level] + prio,
- prio, q->ptr[level] + prio,
- q->last_ptr_id[level] + prio);
+ next = htb_lookup_leaf(hprio, prio);
if (cl == start) /* fix start if we just deleted it */
start = next;
@@ -826,11 +846,9 @@ next:
break;
qdisc_warn_nonwc("htb", cl->un.leaf.q);
- htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
- ptr[0]) + prio);
- cl = htb_lookup_leaf(q->row[level] + prio, prio,
- q->ptr[level] + prio,
- q->last_ptr_id[level] + prio);
+ htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr:
+ &q->hlevel[0].hprio[prio].ptr);
+ cl = htb_lookup_leaf(hprio, prio);
} while (cl != start);
@@ -839,8 +857,8 @@ next:
cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
if (cl->un.leaf.deficit[level] < 0) {
cl->un.leaf.deficit[level] += cl->quantum;
- htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
- ptr[0]) + prio);
+ htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr :
+ &q->hlevel[0].hprio[prio].ptr);
}
/* this used to be after charge_class but this constelation
* gives us slightly better performance
@@ -880,15 +898,14 @@ ok:
for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
/* common case optimization - skip event handler quickly */
int m;
- s64 event;
+ s64 event = q->near_ev_cache[level];
- if (q->now >= q->near_ev_cache[level]) {
+ if (q->now >= event) {
event = htb_do_events(q, level, start_at);
if (!event)
event = q->now + NSEC_PER_SEC;
q->near_ev_cache[level] = event;
- } else
- event = q->near_ev_cache[level];
+ }
if (next_event > event)
next_event = event;
@@ -968,10 +985,8 @@ static void htb_reset(struct Qdisc *sch)
qdisc_watchdog_cancel(&q->watchdog);
__skb_queue_purge(&q->direct_queue);
sch->q.qlen = 0;
- memset(q->row, 0, sizeof(q->row));
+ memset(q->hlevel, 0, sizeof(q->hlevel));
memset(q->row_mask, 0, sizeof(q->row_mask));
- memset(q->wait_pq, 0, sizeof(q->wait_pq));
- memset(q->ptr, 0, sizeof(q->ptr));
for (i = 0; i < TC_HTB_NUMPRIO; i++)
INIT_LIST_HEAD(q->drops + i);
}
@@ -982,6 +997,8 @@ static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
[TCA_HTB_CTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
[TCA_HTB_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
+ [TCA_HTB_RATE64] = { .type = NLA_U64 },
+ [TCA_HTB_CEIL64] = { .type = NLA_U64 },
};
static void htb_work_func(struct work_struct *work)
@@ -1099,6 +1116,12 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
opt.level = cl->level;
if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
+ if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
+ nla_put_u64(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps))
+ goto nla_put_failure;
+ if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
+ nla_put_u64(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps))
+ goto nla_put_failure;
nla_nest_end(skb, nest);
spin_unlock_bh(root_lock);
@@ -1192,7 +1215,8 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
if (parent->cmode != HTB_CAN_SEND)
- htb_safe_rb_erase(&parent->pq_node, q->wait_pq + parent->level);
+ htb_safe_rb_erase(&parent->pq_node,
+ &q->hlevel[parent->level].wait_pq);
parent->level = 0;
memset(&parent->un.inner, 0, sizeof(parent->un.inner));
@@ -1281,7 +1305,8 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
htb_deactivate(q, cl);
if (cl->cmode != HTB_CAN_SEND)
- htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
+ htb_safe_rb_erase(&cl->pq_node,
+ &q->hlevel[cl->level].wait_pq);
if (last_child)
htb_parent_to_leaf(q, cl, new_q);
@@ -1312,8 +1337,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
struct htb_sched *q = qdisc_priv(sch);
struct htb_class *cl = (struct htb_class *)*arg, *parent;
struct nlattr *opt = tca[TCA_OPTIONS];
+ struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
struct nlattr *tb[TCA_HTB_MAX + 1];
struct tc_htb_opt *hopt;
+ u64 rate64, ceil64;
/* extract all subattrs from opt attr */
if (!opt)
@@ -1333,6 +1360,18 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
if (!hopt->rate.rate || !hopt->ceil.rate)
goto failure;
+ /* Keeping backward compatible with rate_table based iproute2 tc */
+ if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE) {
+ rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]);
+ if (rtab)
+ qdisc_put_rtab(rtab);
+ }
+ if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE) {
+ ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]);
+ if (ctab)
+ qdisc_put_rtab(ctab);
+ }
+
if (!cl) { /* new class */
struct Qdisc *new_q;
int prio;
@@ -1366,12 +1405,14 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
if (!cl)
goto failure;
- err = gen_new_estimator(&cl->bstats, &cl->rate_est,
- qdisc_root_sleeping_lock(sch),
- tca[TCA_RATE] ? : &est.nla);
- if (err) {
- kfree(cl);
- goto failure;
+ if (htb_rate_est || tca[TCA_RATE]) {
+ err = gen_new_estimator(&cl->bstats, &cl->rate_est,
+ qdisc_root_sleeping_lock(sch),
+ tca[TCA_RATE] ? : &est.nla);
+ if (err) {
+ kfree(cl);
+ goto failure;
+ }
}
cl->refcnt = 1;
@@ -1401,7 +1442,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
/* remove from evt list because of level change */
if (parent->cmode != HTB_CAN_SEND) {
- htb_safe_rb_erase(&parent->pq_node, q->wait_pq);
+ htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
parent->cmode = HTB_CAN_SEND;
}
parent->level = (parent->parent ? parent->parent->level
@@ -1459,11 +1500,15 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
cl->prio = TC_HTB_NUMPRIO - 1;
}
- psched_ratecfg_precompute(&cl->rate, &hopt->rate);
- psched_ratecfg_precompute(&cl->ceil, &hopt->ceil);
+ rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
+
+ ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
+
+ psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
+ psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
- cl->cbuffer = PSCHED_TICKS2NS(hopt->buffer);
+ cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
sch_tree_unlock(sch);
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 5da78a1..2e56185 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -57,7 +57,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
dev_queue = netdev_get_tx_queue(dev, ntx);
- qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops,
+ qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
TC_H_MAKE(TC_H_MAJ(sch->handle),
TC_H_MIN(ntx + 1)));
if (qdisc == NULL)
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index accec33..d44c868 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -124,7 +124,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
for (i = 0; i < dev->num_tx_queues; i++) {
dev_queue = netdev_get_tx_queue(dev, i);
- qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops,
+ qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
TC_H_MAKE(TC_H_MAJ(sch->handle),
TC_H_MIN(i + 1)));
if (qdisc == NULL) {
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 3d2acc7..bccd52b 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -23,6 +23,7 @@
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <linux/reciprocal_div.h>
+#include <linux/rbtree.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
@@ -68,7 +69,8 @@
*/
struct netem_sched_data {
- /* internal t(ime)fifo qdisc uses sch->q and sch->limit */
+ /* internal t(ime)fifo qdisc uses t_root and sch->limit */
+ struct rb_root t_root;
/* optional qdisc for classful handling (NULL at netem init) */
struct Qdisc *qdisc;
@@ -128,10 +130,35 @@ struct netem_sched_data {
*/
struct netem_skb_cb {
psched_time_t time_to_send;
+ ktime_t tstamp_save;
};
+/* Because space in skb->cb[] is tight, netem overloads skb->next/prev/tstamp
+ * to hold a rb_node structure.
+ *
+ * If struct sk_buff layout is changed, the following checks will complain.
+ */
+static struct rb_node *netem_rb_node(struct sk_buff *skb)
+{
+ BUILD_BUG_ON(offsetof(struct sk_buff, next) != 0);
+ BUILD_BUG_ON(offsetof(struct sk_buff, prev) !=
+ offsetof(struct sk_buff, next) + sizeof(skb->next));
+ BUILD_BUG_ON(offsetof(struct sk_buff, tstamp) !=
+ offsetof(struct sk_buff, prev) + sizeof(skb->prev));
+ BUILD_BUG_ON(sizeof(struct rb_node) > sizeof(skb->next) +
+ sizeof(skb->prev) +
+ sizeof(skb->tstamp));
+ return (struct rb_node *)&skb->next;
+}
+
+static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
+{
+ return (struct sk_buff *)rb;
+}
+
static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
+ /* we assume we can use skb next/prev/tstamp as storage for rb_node */
qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}
@@ -188,10 +215,10 @@ static bool loss_4state(struct netem_sched_data *q)
if (rnd < clg->a4) {
clg->state = 4;
return true;
- } else if (clg->a4 < rnd && rnd < clg->a1) {
+ } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
clg->state = 3;
return true;
- } else if (clg->a1 < rnd)
+ } else if (clg->a1 + clg->a4 < rnd)
clg->state = 1;
break;
@@ -208,7 +235,6 @@ static bool loss_4state(struct netem_sched_data *q)
clg->state = 2;
else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
clg->state = 1;
- return true;
} else if (clg->a2 + clg->a3 < rnd) {
clg->state = 3;
return true;
@@ -242,10 +268,11 @@ static bool loss_gilb_ell(struct netem_sched_data *q)
clg->state = 2;
if (net_random() < clg->a4)
return true;
+ break;
case 2:
if (net_random() < clg->a2)
clg->state = 1;
- if (clg->a3 > net_random())
+ if (net_random() > clg->a3)
return true;
}
@@ -331,22 +358,40 @@ static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sche
return PSCHED_NS2TICKS(ticks);
}
+static void tfifo_reset(struct Qdisc *sch)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ struct rb_node *p;
+
+ while ((p = rb_first(&q->t_root))) {
+ struct sk_buff *skb = netem_rb_to_skb(p);
+
+ rb_erase(p, &q->t_root);
+ skb->next = NULL;
+ skb->prev = NULL;
+ kfree_skb(skb);
+ }
+}
+
static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
- struct sk_buff_head *list = &sch->q;
+ struct netem_sched_data *q = qdisc_priv(sch);
psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
- struct sk_buff *skb = skb_peek_tail(list);
+ struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
- /* Optimize for add at tail */
- if (likely(!skb || tnext >= netem_skb_cb(skb)->time_to_send))
- return __skb_queue_tail(list, nskb);
+ while (*p) {
+ struct sk_buff *skb;
- skb_queue_reverse_walk(list, skb) {
+ parent = *p;
+ skb = netem_rb_to_skb(parent);
if (tnext >= netem_skb_cb(skb)->time_to_send)
- break;
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
}
-
- __skb_queue_after(list, skb, nskb);
+ rb_link_node(netem_rb_node(nskb), parent, p);
+ rb_insert_color(netem_rb_node(nskb), &q->t_root);
+ sch->q.qlen++;
}
/*
@@ -382,12 +427,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
/* If a delay is expected, orphan the skb. (orphaning usually takes
* place at TX completion time, so _before_ the link transit delay)
- * Ideally, this orphaning should be done after the rate limiting
- * module, because this breaks TCP Small Queue, and other mechanisms
- * based on socket sk_wmem_alloc.
*/
if (q->latency || q->jitter)
- skb_orphan(skb);
+ skb_orphan_partial(skb);
/*
* If we need to duplicate packet, then re-insert at top of the
@@ -436,23 +478,28 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
now = psched_get_time();
if (q->rate) {
- struct sk_buff_head *list = &sch->q;
+ struct sk_buff *last;
- if (!skb_queue_empty(list)) {
+ if (!skb_queue_empty(&sch->q))
+ last = skb_peek_tail(&sch->q);
+ else
+ last = netem_rb_to_skb(rb_last(&q->t_root));
+ if (last) {
/*
* Last packet in queue is reference point (now),
* calculate this time bonus and subtract
* from delay.
*/
- delay -= netem_skb_cb(skb_peek_tail(list))->time_to_send - now;
+ delay -= netem_skb_cb(last)->time_to_send - now;
delay = max_t(psched_tdiff_t, 0, delay);
- now = netem_skb_cb(skb_peek_tail(list))->time_to_send;
+ now = netem_skb_cb(last)->time_to_send;
}
delay += packet_len_2_sched_time(skb->len, q);
}
cb->time_to_send = now + delay;
+ cb->tstamp_save = skb->tstamp;
++q->counter;
tfifo_enqueue(skb, sch);
} else {
@@ -476,6 +523,22 @@ static unsigned int netem_drop(struct Qdisc *sch)
unsigned int len;
len = qdisc_queue_drop(sch);
+
+ if (!len) {
+ struct rb_node *p = rb_first(&q->t_root);
+
+ if (p) {
+ struct sk_buff *skb = netem_rb_to_skb(p);
+
+ rb_erase(p, &q->t_root);
+ sch->q.qlen--;
+ skb->next = NULL;
+ skb->prev = NULL;
+ len = qdisc_pkt_len(skb);
+ sch->qstats.backlog -= len;
+ kfree_skb(skb);
+ }
+ }
if (!len && q->qdisc && q->qdisc->ops->drop)
len = q->qdisc->ops->drop(q->qdisc);
if (len)
@@ -488,19 +551,35 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
+ struct rb_node *p;
if (qdisc_is_throttled(sch))
return NULL;
tfifo_dequeue:
- skb = qdisc_peek_head(sch);
+ skb = __skb_dequeue(&sch->q);
if (skb) {
- const struct netem_skb_cb *cb = netem_skb_cb(skb);
+deliver:
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ qdisc_unthrottled(sch);
+ qdisc_bstats_update(sch, skb);
+ return skb;
+ }
+ p = rb_first(&q->t_root);
+ if (p) {
+ psched_time_t time_to_send;
+
+ skb = netem_rb_to_skb(p);
/* if more time remaining? */
- if (cb->time_to_send <= psched_get_time()) {
- __skb_unlink(skb, &sch->q);
- sch->qstats.backlog -= qdisc_pkt_len(skb);
+ time_to_send = netem_skb_cb(skb)->time_to_send;
+ if (time_to_send <= psched_get_time()) {
+ rb_erase(p, &q->t_root);
+
+ sch->q.qlen--;
+ skb->next = NULL;
+ skb->prev = NULL;
+ skb->tstamp = netem_skb_cb(skb)->tstamp_save;
#ifdef CONFIG_NET_CLS_ACT
/*
@@ -522,10 +601,7 @@ tfifo_dequeue:
}
goto tfifo_dequeue;
}
-deliver:
- qdisc_unthrottled(sch);
- qdisc_bstats_update(sch, skb);
- return skb;
+ goto deliver;
}
if (q->qdisc) {
@@ -533,7 +609,7 @@ deliver:
if (skb)
goto deliver;
}
- qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
+ qdisc_watchdog_schedule(&q->watchdog, time_to_send);
}
if (q->qdisc) {
@@ -549,6 +625,7 @@ static void netem_reset(struct Qdisc *sch)
struct netem_sched_data *q = qdisc_priv(sch);
qdisc_reset_queue(sch);
+ tfifo_reset(sch);
if (q->qdisc)
qdisc_reset(q->qdisc);
qdisc_watchdog_cancel(&q->watchdog);
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index d51852b..8056fb4 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -113,7 +113,6 @@
#define FRAC_BITS 30 /* fixed point arithmetic */
#define ONE_FP (1UL << FRAC_BITS)
-#define IWSUM (ONE_FP/QFQ_MAX_WSUM)
#define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */
#define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */
@@ -138,7 +137,7 @@ struct qfq_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est rate_est;
+ struct gnet_stats_rate_est64 rate_est;
struct Qdisc *qdisc;
struct list_head alist; /* Link for active-classes list. */
struct qfq_aggregate *agg; /* Parent aggregate. */
@@ -189,6 +188,7 @@ struct qfq_sched {
struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */
u32 num_active_agg; /* Num. of active aggregates */
u32 wsum; /* weight sum */
+ u32 iwsum; /* inverse weight sum */
unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */
struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
@@ -314,6 +314,7 @@ static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
q->wsum +=
(int) agg->class_weight * (new_num_classes - agg->num_classes);
+ q->iwsum = ONE_FP / q->wsum;
agg->num_classes = new_num_classes;
}
@@ -340,6 +341,10 @@ static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
{
if (!hlist_unhashed(&agg->nonfull_next))
hlist_del_init(&agg->nonfull_next);
+ q->wsum -= agg->class_weight;
+ if (q->wsum != 0)
+ q->iwsum = ONE_FP / q->wsum;
+
if (q->in_serv_agg == agg)
q->in_serv_agg = qfq_choose_next_agg(q);
kfree(agg);
@@ -821,44 +826,73 @@ static void qfq_make_eligible(struct qfq_sched *q)
unsigned long old_vslot = q->oldV >> q->min_slot_shift;
if (vslot != old_vslot) {
- unsigned long mask = (1ULL << fls(vslot ^ old_vslot)) - 1;
+ unsigned long mask;
+ int last_flip_pos = fls(vslot ^ old_vslot);
+
+ if (last_flip_pos > 31) /* higher than the number of groups */
+ mask = ~0UL; /* make all groups eligible */
+ else
+ mask = (1UL << last_flip_pos) - 1;
+
qfq_move_groups(q, mask, IR, ER);
qfq_move_groups(q, mask, IB, EB);
}
}
-
/*
- * The index of the slot in which the aggregate is to be inserted must
- * not be higher than QFQ_MAX_SLOTS-2. There is a '-2' and not a '-1'
- * because the start time of the group may be moved backward by one
- * slot after the aggregate has been inserted, and this would cause
- * non-empty slots to be right-shifted by one position.
+ * The index of the slot in which the input aggregate agg is to be
+ * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2'
+ * and not a '-1' because the start time of the group may be moved
+ * backward by one slot after the aggregate has been inserted, and
+ * this would cause non-empty slots to be right-shifted by one
+ * position.
*
- * If the weight and lmax (max_pkt_size) of the classes do not change,
- * then QFQ+ does meet the above contraint according to the current
- * values of its parameters. In fact, if the weight and lmax of the
- * classes do not change, then, from the theory, QFQ+ guarantees that
- * the slot index is never higher than
- * 2 + QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
- * (QFQ_MAX_WEIGHT/QFQ_MAX_WSUM) = 2 + 8 * 128 * (1 / 64) = 18
+ * QFQ+ fully satisfies this bound to the slot index if the parameters
+ * of the classes are not changed dynamically, and if QFQ+ never
+ * happens to postpone the service of agg unjustly, i.e., it never
+ * happens that the aggregate becomes backlogged and eligible, or just
+ * eligible, while an aggregate with a higher approximated finish time
+ * is being served. In particular, in this case QFQ+ guarantees that
+ * the timestamps of agg are low enough that the slot index is never
+ * higher than 2. Unfortunately, QFQ+ cannot provide the same
+ * guarantee if it happens to unjustly postpone the service of agg, or
+ * if the parameters of some class are changed.
*
- * When the weight of a class is increased or the lmax of the class is
- * decreased, a new aggregate with smaller slot size than the original
- * parent aggregate of the class may happen to be activated. The
- * activation of this aggregate should be properly delayed to when the
- * service of the class has finished in the ideal system tracked by
- * QFQ+. If the activation of the aggregate is not delayed to this
- * reference time instant, then this aggregate may be unjustly served
- * before other aggregates waiting for service. This may cause the
- * above bound to the slot index to be violated for some of these
- * unlucky aggregates.
+ * As for the first event, i.e., an out-of-order service, the
+ * upper bound to the slot index guaranteed by QFQ+ grows to
+ * 2 +
+ * QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
+ * (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1.
+ *
+ * The following function deals with this problem by backward-shifting
+ * the timestamps of agg, if needed, so as to guarantee that the slot
+ * index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may
+ * cause the service of other aggregates to be postponed, yet the
+ * worst-case guarantees of these aggregates are not violated. In
+ * fact, in case of no out-of-order service, the timestamps of agg
+ * would have been even lower than they are after the backward shift,
+ * because QFQ+ would have guaranteed a maximum value equal to 2 for
+ * the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose
+ * service is postponed because of the backward-shift would have
+ * however waited for the service of agg before being served.
+ *
+ * The other event that may cause the slot index to be higher than 2
+ * for agg is a recent change of the parameters of some class. If the
+ * weight of a class is increased or the lmax (max_pkt_size) of the
+ * class is decreased, then a new aggregate with smaller slot size
+ * than the original parent aggregate of the class may happen to be
+ * activated. The activation of this aggregate should be properly
+ * delayed to when the service of the class has finished in the ideal
+ * system tracked by QFQ+. If the activation of the aggregate is not
+ * delayed to this reference time instant, then this aggregate may be
+ * unjustly served before other aggregates waiting for service. This
+ * may cause the above bound to the slot index to be violated for some
+ * of these unlucky aggregates.
*
* Instead of delaying the activation of the new aggregate, which is
- * quite complex, the following inaccurate but simple solution is used:
- * if the slot index is higher than QFQ_MAX_SLOTS-2, then the
- * timestamps of the aggregate are shifted backward so as to let the
- * slot index become equal to QFQ_MAX_SLOTS-2.
+ * quite complex, the above-discussed capping of the slot index is
+ * used to handle also the consequences of a change of the parameters
+ * of a class.
*/
static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg,
u64 roundedS)
@@ -1003,9 +1037,61 @@ static inline void charge_actual_service(struct qfq_aggregate *agg)
agg->F = agg->S + (u64)service_received * agg->inv_w;
}
-static inline void qfq_update_agg_ts(struct qfq_sched *q,
- struct qfq_aggregate *agg,
- enum update_reason reason);
+/* Assign a reasonable start time for a new aggregate in group i.
+ * Admissible values for \hat(F) are multiples of \sigma_i
+ * no greater than V+\sigma_i . Larger values mean that
+ * we had a wraparound so we consider the timestamp to be stale.
+ *
+ * If F is not stale and F >= V then we set S = F.
+ * Otherwise we should assign S = V, but this may violate
+ * the ordering in EB (see [2]). So, if we have groups in ER,
+ * set S to the F_j of the first group j which would be blocking us.
+ * We are guaranteed not to move S backward because
+ * otherwise our group i would still be blocked.
+ */
+static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg)
+{
+ unsigned long mask;
+ u64 limit, roundedF;
+ int slot_shift = agg->grp->slot_shift;
+
+ roundedF = qfq_round_down(agg->F, slot_shift);
+ limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
+
+ if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) {
+ /* timestamp was stale */
+ mask = mask_from(q->bitmaps[ER], agg->grp->index);
+ if (mask) {
+ struct qfq_group *next = qfq_ffs(q, mask);
+ if (qfq_gt(roundedF, next->F)) {
+ if (qfq_gt(limit, next->F))
+ agg->S = next->F;
+ else /* preserve timestamp correctness */
+ agg->S = limit;
+ return;
+ }
+ }
+ agg->S = q->V;
+ } else /* timestamp is not stale */
+ agg->S = agg->F;
+}
+
+/* Update the timestamps of agg before scheduling/rescheduling it for
+ * service. In particular, assign to agg->F its maximum possible
+ * value, i.e., the virtual finish time with which the aggregate
+ * should be labeled if it used all its budget once in service.
+ */
+static inline void
+qfq_update_agg_ts(struct qfq_sched *q,
+ struct qfq_aggregate *agg, enum update_reason reason)
+{
+ if (reason != requeue)
+ qfq_update_start(q, agg);
+ else /* just charge agg for the service received */
+ agg->S = agg->F;
+
+ agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w;
+}
static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg);
@@ -1077,7 +1163,7 @@ static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
else
in_serv_agg->budget -= len;
- q->V += (u64)len * IWSUM;
+ q->V += (u64)len * q->iwsum;
pr_debug("qfq dequeue: len %u F %lld now %lld\n",
len, (unsigned long long) in_serv_agg->F,
(unsigned long long) q->V);
@@ -1128,66 +1214,6 @@ static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q)
return agg;
}
-/*
- * Assign a reasonable start time for a new aggregate in group i.
- * Admissible values for \hat(F) are multiples of \sigma_i
- * no greater than V+\sigma_i . Larger values mean that
- * we had a wraparound so we consider the timestamp to be stale.
- *
- * If F is not stale and F >= V then we set S = F.
- * Otherwise we should assign S = V, but this may violate
- * the ordering in EB (see [2]). So, if we have groups in ER,
- * set S to the F_j of the first group j which would be blocking us.
- * We are guaranteed not to move S backward because
- * otherwise our group i would still be blocked.
- */
-static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg)
-{
- unsigned long mask;
- u64 limit, roundedF;
- int slot_shift = agg->grp->slot_shift;
-
- roundedF = qfq_round_down(agg->F, slot_shift);
- limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
-
- if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) {
- /* timestamp was stale */
- mask = mask_from(q->bitmaps[ER], agg->grp->index);
- if (mask) {
- struct qfq_group *next = qfq_ffs(q, mask);
- if (qfq_gt(roundedF, next->F)) {
- if (qfq_gt(limit, next->F))
- agg->S = next->F;
- else /* preserve timestamp correctness */
- agg->S = limit;
- return;
- }
- }
- agg->S = q->V;
- } else /* timestamp is not stale */
- agg->S = agg->F;
-}
-
-/*
- * Update the timestamps of agg before scheduling/rescheduling it for
- * service. In particular, assign to agg->F its maximum possible
- * value, i.e., the virtual finish time with which the aggregate
- * should be labeled if it used all its budget once in service.
- */
-static inline void
-qfq_update_agg_ts(struct qfq_sched *q,
- struct qfq_aggregate *agg, enum update_reason reason)
-{
- if (reason != requeue)
- qfq_update_start(q, agg);
- else /* just charge agg for the service received */
- agg->S = agg->F;
-
- agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w;
-}
-
-static void qfq_schedule_agg(struct qfq_sched *, struct qfq_aggregate *);
-
static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct qfq_sched *q = qdisc_priv(sch);
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 3811e8e..d166360 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -21,6 +21,7 @@
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
+#include <net/tcp.h>
/* Simple Token Bucket Filter.
@@ -129,14 +130,69 @@ struct tbf_sched_data {
struct qdisc_watchdog watchdog; /* Watchdog timer */
};
+
+/*
+ * Return length of individual segments of a gso packet,
+ * including all headers (MAC, IP, TCP/UDP)
+ */
+static unsigned int skb_gso_seglen(const struct sk_buff *skb)
+{
+ unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+ hdr_len += tcp_hdrlen(skb);
+ else
+ hdr_len += sizeof(struct udphdr);
+ return hdr_len + shinfo->gso_size;
+}
+
+/* GSO packet is too big, segment it so that tbf can transmit
+ * each segment in time
+ */
+static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *segs, *nskb;
+ netdev_features_t features = netif_skb_features(skb);
+ int ret, nb;
+
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+
+ if (IS_ERR_OR_NULL(segs))
+ return qdisc_reshape_fail(skb, sch);
+
+ nb = 0;
+ while (segs) {
+ nskb = segs->next;
+ segs->next = NULL;
+ qdisc_skb_cb(segs)->pkt_len = segs->len;
+ ret = qdisc_enqueue(segs, q->qdisc);
+ if (ret != NET_XMIT_SUCCESS) {
+ if (net_xmit_drop_count(ret))
+ sch->qstats.drops++;
+ } else {
+ nb++;
+ }
+ segs = nskb;
+ }
+ sch->q.qlen += nb;
+ if (nb > 1)
+ qdisc_tree_decrease_qlen(sch, 1 - nb);
+ consume_skb(skb);
+ return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
+}
+
static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
int ret;
- if (qdisc_pkt_len(skb) > q->max_size)
+ if (qdisc_pkt_len(skb) > q->max_size) {
+ if (skb_is_gso(skb) && skb_gso_seglen(skb) <= q->max_size)
+ return tbf_segment(skb, sch);
return qdisc_reshape_fail(skb, sch);
-
+ }
ret = qdisc_enqueue(skb, q->qdisc);
if (ret != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(ret))
@@ -236,20 +292,23 @@ static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
[TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) },
[TCA_TBF_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
[TCA_TBF_PTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+ [TCA_TBF_RATE64] = { .type = NLA_U64 },
+ [TCA_TBF_PRATE64] = { .type = NLA_U64 },
};
static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
{
int err;
struct tbf_sched_data *q = qdisc_priv(sch);
- struct nlattr *tb[TCA_TBF_PTAB + 1];
+ struct nlattr *tb[TCA_TBF_MAX + 1];
struct tc_tbf_qopt *qopt;
struct qdisc_rate_table *rtab = NULL;
struct qdisc_rate_table *ptab = NULL;
struct Qdisc *child = NULL;
int max_size, n;
+ u64 rate64 = 0, prate64 = 0;
- err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy);
+ err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy);
if (err < 0)
return err;
@@ -286,6 +345,11 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
if (max_size < 0)
goto done;
+ if (max_size < psched_mtu(qdisc_dev(sch)))
+ pr_warn_ratelimited("sch_tbf: burst %u is lower than device %s mtu (%u) !\n",
+ max_size, qdisc_dev(sch)->name,
+ psched_mtu(qdisc_dev(sch)));
+
if (q->qdisc != &noop_qdisc) {
err = fifo_set_limit(q->qdisc, qopt->limit);
if (err)
@@ -311,9 +375,13 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
q->tokens = q->buffer;
q->ptokens = q->mtu;
- psched_ratecfg_precompute(&q->rate, &rtab->rate);
+ if (tb[TCA_TBF_RATE64])
+ rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
+ psched_ratecfg_precompute(&q->rate, &rtab->rate, rate64);
if (ptab) {
- psched_ratecfg_precompute(&q->peak, &ptab->rate);
+ if (tb[TCA_TBF_PRATE64])
+ prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
+ psched_ratecfg_precompute(&q->peak, &ptab->rate, prate64);
q->peak_present = true;
} else {
q->peak_present = false;
@@ -380,6 +448,13 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
opt.buffer = PSCHED_NS2TICKS(q->buffer);
if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
+ if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
+ nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps))
+ goto nla_put_failure;
+ if (q->peak_present &&
+ q->peak.rate_bytes_ps >= (1ULL << 32) &&
+ nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps))
+ goto nla_put_failure;
nla_nest_end(skb, nest);
return skb->len;